# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

## IMOIRTING LIBRARIES

In [1]:
!pip install ppscore



In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
import ppscore as pps

## Task-1. Data Exploration and Preprocessing:

In [4]:
data=pd.read_csv('adult_with_headers.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
data.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### •	Apply scaling techniques to numerical features:

#### •	Standard Scaling

In [8]:
numerical_columns=data.select_dtypes(include=['int64','float64']).columns
numerical_columns

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')

In [9]:
standard_scaling=StandardScaler()
standard_data=data.copy()
standard_data[numerical_columns]=standard_scaling.fit_transform(data[numerical_columns])

#### •	Min-Max Scaling

In [10]:
min_max_scaling=MinMaxScaler()
min_max_data=data.copy()
min_max_data[numerical_columns]=min_max_scaling.fit_transform(data[numerical_columns])

### •	Discuss the scenarios where each scaling technique is preferred and why.

##### Basically standard scaling is used when data is uniformly distributed and min max scaling is used when data is not uniformly distributed
##### In Standard Scaling mean will become  as zero and standard deviation will become as one
##### In Min Max Scaling will make all values from 0 to 1 or we can fix any range (x,y)

In [11]:
stand_data=pd.DataFrame(standard_data,columns=numerical_columns)
stand_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,-2.705915e-17,-1.001625e-16,1.471887e-16,1.309314e-17,1.0169e-16,-1.5493550000000002e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.582206,-1.681631,-3.529656,-0.1459205,-0.2166595,-3.19403
25%,-0.7757679,-0.681691,-0.4200596,-0.1459205,-0.2166595,-0.03542945
50%,-0.1159546,-0.1082193,-0.03136003,-0.1459205,-0.2166595,-0.03542945
75%,0.6904838,0.4478765,0.7460392,-0.1459205,-0.2166595,0.3695194
max,3.769612,12.26856,2.300838,13.39458,10.59351,4.742967


In [12]:
min_data=pd.DataFrame(min_max_data,columns=numerical_columns)
min_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,0.120545,0.605379,0.010777,0.020042,0.402423
std,0.186855,0.071685,0.171515,0.073854,0.092507,0.125994
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.150685,0.071679,0.533333,0.0,0.0,0.397959
50%,0.273973,0.112788,0.6,0.0,0.0,0.397959
75%,0.424658,0.152651,0.733333,0.0,0.0,0.44898
max,1.0,1.0,1.0,1.0,1.0,1.0


## Task-2. Encoding Techniques:

##### •	Apply One-Hot Encoding to categorical variables with less than 5 categories..


##### •	Use Label Encoding for categorical variables with more than 5 categories.


##### •	Discuss the pros and cons of One-Hot Encoding and Label Encoding.

In [13]:
categoriacal_columns=data.select_dtypes(include='object').columns
categoriacal_columns

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [14]:
for col in categoriacal_columns:
    a=data[col].unique()
    print(f'THESE ARE UNIQUE IN THIS {col} ','\n' ,f'{a} ','\n')

THESE ARE UNIQUE IN THIS workclass  
 [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']  

THESE ARE UNIQUE IN THIS education  
 [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']  

THESE ARE UNIQUE IN THIS marital_status  
 [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']  

THESE ARE UNIQUE IN THIS occupation  
 [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']  

THESE ARE UNIQUE IN THIS relationship  
 [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']  

THESE ARE

### One-Hot Encoding

In [15]:
var=data[['race','sex','income']]
dummies=pd.get_dummies(var,dtype=int)
dummies

Unnamed: 0,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,0,0,0,0,1,0,1,1,0
1,0,0,0,0,1,0,1,1,0
2,0,0,0,0,1,0,1,1,0
3,0,0,1,0,0,0,1,1,0
4,0,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
32556,0,0,0,0,1,1,0,1,0
32557,0,0,0,0,1,0,1,0,1
32558,0,0,0,0,1,1,0,1,0
32559,0,0,0,0,1,0,1,1,0


### Label Encoding

In [16]:
var1=data.drop(['race','sex','income'],axis=1)

le=LabelEncoder()
for column in var1.columns:
    var1[column] = le.fit_transform(var1[column])
var1

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country
0,22,7,2671,9,12,4,1,1,25,0,39,39
1,33,6,2926,9,12,2,4,0,0,0,12,39
2,21,4,14086,11,8,0,6,1,0,0,39,39
3,36,4,15336,1,6,2,6,0,0,0,39,39
4,11,4,19355,9,12,2,10,5,0,0,39,5
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,10,4,16528,7,11,2,13,5,0,0,37,39
32557,23,4,8080,11,8,2,7,0,0,0,39,39
32558,41,4,7883,11,8,6,1,4,0,0,39,39
32559,5,4,12881,11,8,4,1,3,0,0,19,39


In [17]:
fin_data=pd.concat([dummies,var1],axis=1)
fin_data

Unnamed: 0,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age,...,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country
0,0,0,0,0,1,0,1,1,0,22,...,2671,9,12,4,1,1,25,0,39,39
1,0,0,0,0,1,0,1,1,0,33,...,2926,9,12,2,4,0,0,0,12,39
2,0,0,0,0,1,0,1,1,0,21,...,14086,11,8,0,6,1,0,0,39,39
3,0,0,1,0,0,0,1,1,0,36,...,15336,1,6,2,6,0,0,0,39,39
4,0,0,1,0,0,1,0,1,0,11,...,19355,9,12,2,10,5,0,0,39,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,0,1,1,0,1,0,10,...,16528,7,11,2,13,5,0,0,37,39
32557,0,0,0,0,1,0,1,0,1,23,...,8080,11,8,2,7,0,0,0,39,39
32558,0,0,0,0,1,1,0,1,0,41,...,7883,11,8,6,1,4,0,0,39,39
32559,0,0,0,0,1,0,1,1,0,5,...,12881,11,8,4,1,3,0,0,19,39


## Task-3. Feature Engineering:

In [18]:
#CREATING  NEW FEATURES
fin_data['capital-gain-minus-loss'] = fin_data['capital_gain'] - fin_data['capital_loss']
fin_data['age_squared'] = fin_data['age'] ** 2
fin_data['hours_per_month']=fin_data['hours_per_week']*4

In [19]:
fin_data

Unnamed: 0,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age,...,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,capital-gain-minus-loss,age_squared,hours_per_month
0,0,0,0,0,1,0,1,1,0,22,...,4,1,1,25,0,39,39,25,484,156
1,0,0,0,0,1,0,1,1,0,33,...,2,4,0,0,0,12,39,0,1089,48
2,0,0,0,0,1,0,1,1,0,21,...,0,6,1,0,0,39,39,0,441,156
3,0,0,1,0,0,0,1,1,0,36,...,2,6,0,0,0,39,39,0,1296,156
4,0,0,1,0,0,1,0,1,0,11,...,2,10,5,0,0,39,5,0,121,156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,0,1,1,0,1,0,10,...,2,13,5,0,0,37,39,0,100,148
32557,0,0,0,0,1,0,1,0,1,23,...,2,7,0,0,0,39,39,0,529,156
32558,0,0,0,0,1,1,0,1,0,41,...,6,1,4,0,0,39,39,0,1681,156
32559,0,0,0,0,1,0,1,1,0,5,...,4,1,3,0,0,19,39,0,25,76


## Task-4. Feature Selection:

In [20]:
iso= IsolationForest(random_state=10,contamination=0.01)
iso.fit(fin_data)

In [21]:
preds=iso.predict(fin_data)
preds

array([1, 1, 1, ..., 1, 1, 1])

In [22]:
outliers=pd.DataFrame(preds)
outliers

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
32556,1
32557,1
32558,1
32559,1


In [23]:
outliers = outliers.rename(columns={0: 'outliers'})
fin_data = fin_data.join(outliers, how='right')

In [24]:
fin_data

Unnamed: 0,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age,...,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,capital-gain-minus-loss,age_squared,hours_per_month,outliers
0,0,0,0,0,1,0,1,1,0,22,...,1,1,25,0,39,39,25,484,156,1
1,0,0,0,0,1,0,1,1,0,33,...,4,0,0,0,12,39,0,1089,48,1
2,0,0,0,0,1,0,1,1,0,21,...,6,1,0,0,39,39,0,441,156,1
3,0,0,1,0,0,0,1,1,0,36,...,6,0,0,0,39,39,0,1296,156,1
4,0,0,1,0,0,1,0,1,0,11,...,10,5,0,0,39,5,0,121,156,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,0,1,1,0,1,0,10,...,13,5,0,0,37,39,0,100,148,1
32557,0,0,0,0,1,0,1,0,1,23,...,7,0,0,0,39,39,0,529,156,1
32558,0,0,0,0,1,1,0,1,0,41,...,1,4,0,0,39,39,0,1681,156,1
32559,0,0,0,0,1,0,1,1,0,5,...,1,3,0,0,19,39,0,25,76,1


In [30]:
#DETECTING OUTLIERS
fin_data[fin_data['outliers']==-1]

Unnamed: 0,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age,...,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,capital-gain-minus-loss,age_squared,hours_per_month,outliers
27,0,1,0,0,0,0,1,0,1,37,...,0,0,0,0,59,35,0,1369,236,-1
52,0,0,0,0,1,1,0,0,1,30,...,10,5,0,47,59,16,-47,900,236,-1
93,0,1,0,0,0,1,0,1,0,13,...,12,5,0,21,34,0,-21,169,136,-1
157,0,0,1,0,0,0,1,1,0,54,...,12,4,0,41,1,39,-41,2916,4,-1
222,0,0,1,0,0,0,1,1,0,72,...,8,1,0,65,39,39,-65,5184,156,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32341,0,0,0,0,1,0,1,0,1,57,...,3,1,109,0,7,11,109,3249,28,-1
32370,0,1,0,0,0,0,1,0,1,36,...,10,1,115,0,39,30,115,1296,156,-1
32432,0,0,0,1,0,0,1,1,0,19,...,8,1,0,0,69,26,0,361,276,-1
32525,0,0,0,0,1,1,0,1,0,64,...,0,4,0,0,0,0,0,4096,0,-1


In [26]:
pps.score(fin_data,'age','income_ >50K')

{'x': 'age',
 'y': 'income_ >50K',
 'ppscore': 0,
 'case': 'regression',
 'is_valid_score': True,
 'metric': 'mean absolute error',
 'baseline_score': 0.2424,
 'model_score': 0.32837761728313286,
 'model': DecisionTreeRegressor()}

In [29]:
pps.matrix(fin_data)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,race_ Amer-Indian-Eskimo,race_ Amer-Indian-Eskimo,1.0,predict_itself,True,,0.0000,1.000000,
1,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,0.0,regression,True,mean absolute error,0.0338,0.065296,DecisionTreeRegressor()
2,race_ Amer-Indian-Eskimo,race_ Black,0.0,regression,True,mean absolute error,0.0946,0.171097,DecisionTreeRegressor()
3,race_ Amer-Indian-Eskimo,race_ Other,0.0,regression,True,mean absolute error,0.0072,0.014297,DecisionTreeRegressor()
4,race_ Amer-Indian-Eskimo,race_ White,0.0,regression,True,mean absolute error,0.1482,0.234005,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
620,outliers,native_country,0.0,regression,True,mean absolute error,2.3748,4.145379,DecisionTreeRegressor()
621,outliers,capital-gain-minus-loss,0.0,regression,True,mean absolute error,8.7110,12.163696,DecisionTreeRegressor()
622,outliers,age_squared,0.0,regression,True,mean absolute error,516.3844,552.932828,DecisionTreeRegressor()
623,outliers,hours_per_month,0.0,regression,True,mean absolute error,30.3624,31.043605,DecisionTreeRegressor()
