In [372]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

## Feature engineering 
Feature engineering is the process of transforming raw data into features that are suitable for machine learning models. In other words, it is the process of selecting, extracting, and transforming the most relevant features from the available data to build more accurate and efficient machine learning models.

In [351]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [352]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [353]:
#missing values
df['age'] = df['age'].fillna(df['age'].median())
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])

In [354]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      0
alive            0
alone            0
dtype: int64

In [355]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  891 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [356]:
def gender(s):
    if s == "male":
        return 1 
    else:
        return 2
#new feature
df["sex"] = df["sex"].map(gender) 

df['family_size'] = df['sibsp'] + df['parch']
age = [0, 12, 18, 60, 80]
labels = ['Child', 'Teenager', 'Adult', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age, labels=labels)
df = pd.get_dummies(df, columns=['sex' ,'embarked', 'age_group'])
df=df.drop(["alone", "alive", "embark_town", "who", "class", "adult_male", "deck"], axis=1)

In [357]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   survived            891 non-null    int64  
 1   pclass              891 non-null    int64  
 2   age                 891 non-null    float64
 3   sibsp               891 non-null    int64  
 4   parch               891 non-null    int64  
 5   fare                891 non-null    float64
 6   family_size         891 non-null    int64  
 7   sex_1               891 non-null    bool   
 8   sex_2               891 non-null    bool   
 9   embarked_C          891 non-null    bool   
 10  embarked_Q          891 non-null    bool   
 11  embarked_S          891 non-null    bool   
 12  age_group_Child     891 non-null    bool   
 13  age_group_Teenager  891 non-null    bool   
 14  age_group_Adult     891 non-null    bool   
 15  age_group_Senior    891 non-null    bool   
dtypes: bool(

## Feature Scaling: Standardization
Feature scaling is a preprocessing technique used in machine learning to ensure that numerical features are on the same scale. This is particularly important for algorithms sensitive to the magnitude of feature values, such as gradient-based methods (e.g., linear regression, logistic regression, support vector machines, and neural networks).

What is Standardization?
Standardization (also known as Z-score normalization) is a feature scaling method that transforms the values of a feature to have


In [358]:
df = pd.read_csv('Social_Network_Ads.csv')
df=df.iloc[:,1:]
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [359]:
X = df.iloc[:, 1:]
y = df["Purchased"]
X_train , X_test, Y_train, Y_test = train_test_split(X, y, train_size = 0.8 , random_state= 42)                                                    
X_train.shape, X_test.shape

((80, 3), (320, 3))

## StandardScaler

In [360]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [361]:
scaler.mean_

array([3.72000e+01, 6.79625e+04, 3.12500e-01])

In [362]:
X_train


Unnamed: 0,Age,EstimatedSalary,Purchased
212,59,42000,0
295,36,63000,0
364,42,104000,1
251,37,52000,0
390,48,33000,1
...,...,...,...
71,24,27000,0
106,26,35000,0
270,43,133000,0
348,39,77000,0


In [363]:
X_train_scaled

array([[ 2.28388099, -0.79691098, -0.67419986],
       [-0.12571822, -0.15232242, -0.67419986],
       [ 0.50287288,  1.10616002,  1.4832397 ],
       [-0.02095304, -0.48996405, -0.67419986],
       [ 1.13146398, -1.07316323,  1.4832397 ],
       [ 2.28388099, -1.195942  ,  1.4832397 ],
       [-0.02095304,  0.33879268,  1.4832397 ],
       [ 0.29334251, -0.2137118 , -0.67419986],
       [ 1.23622916,  0.18531921, -0.67419986],
       [-1.27813523,  0.67643431, -0.67419986],
       [-0.75430932, -1.62566771, -0.67419986],
       [ 0.29334251,  0.30809799, -0.67419986],
       [-1.38290042,  0.49226615, -0.67419986],
       [ 0.08381215, -0.55135343, -0.67419986],
       [ 0.81716843,  1.93491675,  1.4832397 ],
       [-1.69719597,  0.12392983, -0.67419986],
       [-0.2304834 , -1.38011016, -0.67419986],
       [-0.2304834 , -1.47219424, -0.67419986],
       [-0.64954414,  0.64573962, -0.67419986],
       [-0.75430932,  0.36948737, -0.67419986],
       [ 1.0266988 , -0.64343752, -0.674

In [364]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
np.round(X_train.describe(), 1)

Unnamed: 0,Age,EstimatedSalary,Purchased
count,80.0,80.0,80.0
mean,37.2,67962.5,0.3
std,9.6,32784.5,0.5
min,19.0,15000.0,0.0
25%,30.0,42750.0,0.0
50%,37.0,67000.0,0.0
75%,45.0,85250.0,1.0
max,60.0,141000.0,1.0


In [365]:
np.round(X_train_scaled.describe(), 1)

Unnamed: 0,Age,EstimatedSalary,Purchased
count,80.0,80.0,80.0
mean,-0.0,-0.0,0.0
std,1.0,1.0,1.0
min,-1.9,-1.6,-0.7
25%,-0.8,-0.8,-0.7
50%,-0.0,-0.0,-0.7
75%,0.8,0.5,1.5
max,2.4,2.2,1.5


In [366]:
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


## Ordinal Encoder
Encoding Categorical Data
In machine learning, categorical data refers to variables that contain label values rather than numerical values. These variables can represent categories or labels, such as "Red", "Blue", "Green" for a color feature, or "Low", "Medium", "High" for a satisfaction rating. Machine learning models typically require numerical input, so categorical data must be encoded into a format that models can process.

There are various ways to encode categorical variables, and two common techniques are Ordinal Encoding and Label Encoding.



In [367]:
df.EstimatedSalary.unique()

array([ 19000,  20000,  43000,  57000,  76000,  58000,  84000, 150000,
        33000,  65000,  80000,  52000,  86000,  18000,  82000,  25000,
        26000,  28000,  29000,  22000,  49000,  41000,  23000,  30000,
        74000, 137000,  16000,  44000,  90000,  27000,  72000,  31000,
        17000,  51000, 108000,  15000,  79000,  54000, 135000,  89000,
        32000,  83000,  55000,  48000, 117000,  87000,  66000, 120000,
        63000,  68000, 113000, 112000,  42000,  88000,  62000, 118000,
        85000,  81000,  50000, 116000, 123000,  73000,  37000,  59000,
       149000,  21000,  35000,  71000,  61000,  75000,  53000, 107000,
        96000,  45000,  47000, 100000,  38000,  69000, 148000, 115000,
        34000,  60000,  70000,  36000,  39000, 134000, 101000, 130000,
       114000, 142000,  78000, 143000,  91000, 144000, 102000, 126000,
       133000, 147000, 104000, 146000, 122000,  97000,  95000, 131000,
        77000, 125000, 106000, 141000,  93000, 138000, 119000, 105000,
      

In [368]:
def age_to_category(age):
    if age >= 60:
        return 2  
    elif age >= 30:
        return 1  
    else:
        return 0  
df["Age"] = df["Age"].map(age_to_category) 
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,0,19000,0
1,Male,1,20000,0
2,Female,0,43000,0
3,Female,0,57000,0
4,Male,0,76000,0
...,...,...,...,...
395,Female,1,41000,1
396,Male,1,23000,1
397,Female,1,20000,1
398,Male,1,33000,0


## Outliers
An outlier is a data point that significantly deviates from the rest of the data. It can be either much higher or much lower than the other data points, and its presence can have a significant impact on the results of machine learning algorithms. They can be caused by measurement or execution errors. The analysis of outlier data is referred to as outlier analysis or outlier mining.

In [369]:
print("Mean value of cgpa",df['EstimatedSalary'].mean())
print("Std value of cgpa",df['EstimatedSalary'].std())
print("Min value of cgpa",df['EstimatedSalary'].min())
print("Max value of cgpa",df['EstimatedSalary'].max())

Mean value of cgpa 69742.5
Std value of cgpa 34096.960282424785
Min value of cgpa 15000
Max value of cgpa 150000


In [348]:
print("Highest allowed",df['EstimatedSalary'].mean() + 3*df['EstimatedSalary'].std())
print("Lowest allowed",df['EstimatedSalary'].mean() - 3*df['EstimatedSalary'].std())

Highest allowed 172033.38084727435
Lowest allowed -32548.380847274355


In [347]:
# Finding the outliers
df[(df['EstimatedSalary'] > 172033.38084727435) | (df['EstimatedSalary'] < 32548.380847274355)]

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,0,19000,0
1,Male,1,20000,0
13,Male,1,18000,0
16,Male,1,25000,1
17,Male,1,26000,1
...,...,...,...,...
379,Female,1,23000,1
383,Male,1,28000,1
391,Male,1,23000,1
396,Male,1,23000,1


In [349]:
new_df = df[(df['EstimatedSalary'] < 172033.38084727435) & (df['EstimatedSalary'] > 32548.380847274355)]
new_df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
2,Female,0,43000,0
3,Female,0,57000,0
4,Male,0,76000,0
5,Male,0,58000,0
6,Female,0,84000,0
...,...,...,...,...
393,Male,2,42000,1
394,Female,1,59000,0
395,Female,1,41000,1
398,Male,1,33000,0


## Pipeline


In [370]:
obj = Pipeline([("std", StandardScaler()), ("LogReg", LogisticRegression())], verbose=True)

In [371]:
obj.fit(X_train, y_train)

[Pipeline] ............... (step 1 of 2) Processing std, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing LogReg, total=   0.0s


In [373]:
r2_score(y_test, obj.predict(X_test))

1.0