# TOP 10 JUTSUS OF FEATURE ENGINEERING EVERY DATA SCIENTISTS NINJA SHOULD KNOW

### 1.) Techniques of Imputation of numerical and categorical data

##### Technique 1: Dropping rows and columns

In [None]:
threshold = 0.75

# Columns:
data = data[data.columns[data.isnull().mean() < threshold]]

#---------------------------------------------------------------------------------------------------------------

# Rows:
data = data.loc[data.columns[data.isnull().mean(axis=1) < threshold]]

##### Technique 2: Fill the NaN values with Mean, Median, Mode

In [None]:
# fill NaN values with 0
data = data.fillna(0)

#---------------------------------------------------------------------------------------------------------------

# fill NaN values with median
data = data.fillna(data.mean())

#---------------------------------------------------------------------------------------------------------------

# fill NaN values with median: Better than Mean
data = data.fillna(data.median())

#---------------------------------------------------------------------------------------------------------------

# fill NaN values with mode
data = data.fillna(data.mode())

##### Technique 3: Fill the NaN values using Imputer   &emsp;&emsp;&emsp;        (Only for numerical data) 

In [None]:
# using simple imputer
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='median') # can be mean, median, mode

imputer = imputer.fit(x)
data = imputer.transform(x)

print('Imputed Data:',data)

#---------------------------------------------------------------------------------------------------------------

# IterativeImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer_it = IterativeImputer()
imputer_it.fit_transform(x)

#---------------------------------------------------------------------------------------------------------------

# KNNImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer

imputer_knn = KNNImputer(n_neighbors=2)
imputer_knn.fit_transform(x)

##### Technique 4: Fill the NaN values    &emsp;&emsp;&emsp;        (Only for Categorical & Nominal data) 

In [None]:
data['categorical_col'] = data['categorical_col'].fillna(data['categorical_col'].mode()[0])

<br>
_____________________________________________________________________________________________________________________________

### 2.) Detecting outliers and dealing with it

<b>---Detecting Outliers---</b>

#### Using Z-score

In [6]:
import numpy as np

box = [10,20,15,25,11,12,16,26,19,29,3000]

outliers = []

def detect_outliers(data):
    Threshold = 3
    mean = np.mean(data)
    std = np.std(data)
    
    for i in data:
        z_score = (i-mean)/std
        if np.abs(z_score) > Threshold:
            outliers.append(i)
    return outliers

out = detect_outliers(box)
out

[3000]

#### IQR

In [13]:
import numpy as np

box = [10,20,15,25,11,12,16,26,19,29,3000]

box = sorted(box)

Q1,Q3 = np.percentile(box, [25,75])

print(Q1,Q3)

iqr = Q3 - Q1
print(iqr)

# Lower bound and higher bound values

lower_bound_val = Q1 - (1.5*iqr)
higher_bound_val = Q3 + (1.5*iqr)

print(lower_bound_val,higher_bound_val)

13.5 25.5
12.0
-4.5 43.5


#### Box-plot

In [None]:
for feature in data:
    dataset = data.copy()
    
    if 0 in dataset[feature].unique():
        pass
    else:
        dataset[feature] = np.log(dataset[feature])
        dataset.boxplot(column=feature)
        plt.ylabel(feature)
        plt.title(feature)
        plt.show()

<b>---Dealing with outliers---</b>

#### Dropping Outlier rows with standard deviation

In [None]:
fact = 3

upper_bound_val = data['column'].mean() + data['column'].std() * fact
lower_bound_val = data['column'].mean() - data['column'].std() * fact


data = data[(data['column'] < upper_bound_val) & (data['column'] > lower_bound_val)]

#### Dropping the outlier rows with percentile

In [None]:
upper_bound_val = data['column'].quartile(0.95)
lower_bound_val = data['column'].quartile(0.05)

data = data[(data['column'] < upper_bound_val) & (data['column'] > lower_bound_val)]

<br>
_____________________________________________________________________________________________________________________________

### 3.) Binning

In [None]:
#Numerical Binning Example
Value      Bin       

0-30   ->  Fail       
31-70  ->  Average       
71-100 ->  Excelent


#Categorical Binning Example
Value        Bin       

Mumbai   ->  Maharashtra      
Pune     ->  Maharashtra       
Bikaner  ->  Rajasthan
Jaipur   ->  Rajasthan

<b>Numerical Binning Example</b>

In [None]:
data['bin'] = pd.cut(data['value'], bins=[0,30,70,100], labels=["Fail", "Average", "Excelent"])

    value | bin 
0  |&emsp;    2&emsp; &emsp; | Fail<br>
1  |&emsp;45&emsp;&emsp; | Average<br>
2  |&emsp;  7   &nbsp;&emsp;&emsp;| Fail<br>
3  |&emsp;   85&emsp;&emsp;| Excelent<br>
4  |&emsp;   28&emsp;&emsp;| Fail<br>


<b>Categorical Binning Example</b>

In [None]:
conditions = [
    data['State'].str.contains('Mumbai'),
    data['State'].str.contains('Pune'),
    data['State'].str.contains('Bikaner'),
    data['State'].str.contains('Jaipur')]

choices = ['Maharashtra', 'Maharashtra', 'Rajasthan', 'Rajasthan']

data['Continent'] = np.select(conditions, choices, default='Other')

    value    | bin 
0  |&emsp;  Mumbai&emsp;| Maharashtra<br>
1  |&emsp;Pune&emsp;&emsp;&nbsp; | Maharashtra<br>
2  |&emsp;Bikaner  &nbsp;&emsp;| Rajasthan<br>
3  |&emsp;   Delhi&emsp;&emsp;&nbsp;| Other<br>
4  |&emsp;   Jaipur&emsp;&emsp;| Rajasthan<br>


<br>

_____________________________________________________________________________________________________________________________

### 4.) Techniques of dealing with Gaussian-Distribution / Skewness

-Helps to handles skewed data and after transformation, the distribution becomes more approximate to the normal.

-Decreases the effect of the outliers to the normalizatin of magnitude difference and the model become more robust.

NOTE: The dat ayou apply log transform must be positive value, otherwise you receive an error. Also, you can add 1 to your data before transforming it.


<b>Log Transformation</b>

In [None]:
data['log_column'] = np.log(data['column']+1)
diagnostic_plots(data,'log_column')

<b>Reciprocal Transformation</b>

In [None]:
data['Reciprocal_column'] = 1/(data['column']+1)
diagnostic_plots(data,'Reciprocal_column')

<b>Square-Root Transformation</b>

In [None]:
data['sqr_column'] = data['column']**(1/2)

<br>
____________________________________________________________________________________________________________________________

### 5.) OneHotEncoding and OrdinalEncoder

In [None]:
# For single column
encoded_col = pd.get_dummies(data['column'], drop_first = True)

# For multi column
encoded_multi_col = pd.get_dummies(data,columns = ['column1', 'column2', 'columnN'], drop_first=True)

In [None]:
# OneHotEncoding is for unordered (Nominal) data
# Example: Male, Female

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(data[['column']])

In [None]:
# OrdinalEncoder is for ordered (Ordinal) data
# Example: First, Second, Third


from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['First', 'Second', 'Third'],
                                ['O','A','B','C']])

oe.fit_transform(data[['Rank', 'Grade']])

<br>
___________________________________________________________________________________________________________________________

### 6.) Feature splitting & extraction

data.name<br>
0  Arnold Schwarzenegger<br>
1  Natasha Romanova <br>
2  Sylvester Stallone<br>
3  Gal Gadot<br>
4  Dwayne Johnson<br>

<b>--- Feature Splitting ---</b>

In [3]:
import pandas as pd
import numpy as np
data = [('Arnold Schwarzenegger','M'), 
        ('Natasha Romanova','F'), 
        ('Sylvester Stallone','M'), 
        ('Gal Gadot','F'), 
        ('Dwayne Johnson','M')]

df = pd.DataFrame(data, columns=['name', 'gender'])
df

Unnamed: 0,name,gender
0,Arnold Schwarzenegger,M
1,Natasha Romanova,F
2,Sylvester Stallone,M
3,Gal Gadot,F
4,Dwayne Johnson,M


In [4]:
df.name

0    Arnold Schwarzenegger
1         Natasha Romanova
2       Sylvester Stallone
3                Gal Gadot
4           Dwayne Johnson
Name: name, dtype: object

In [5]:
# Extracting First Name
df.name.str.split(" ").map(lambda X : X[0])

0       Arnold
1      Natasha
2    Sylvester
3          Gal
4       Dwayne
Name: name, dtype: object

In [6]:
# Extracting Last Name
df.name.str.split(" ").map(lambda X : X[-1])

0    Schwarzenegger
1          Romanova
2          Stallone
3             Gadot
4           Johnson
Name: name, dtype: object

<b>--- Feature Extraction ---</b>

In [7]:
weather_data = [('1/1/2017', 32, 6, 'Rain'),
               ('1/2/2017', 30, 7, 'Sunny'),
               ('1/3/2017', 32, 2, 'Snow'),
               ('1/4/2017', 34, 6, 'Snow'),
               ('1/5/2017', 32, 4, 'Rain'),
               ('1/6/2017', 32, 2, 'Sunny')
               ]

df = pd.DataFrame(weather_data, columns=['day', 'temp', 'windspeed', 'weather'])
df

Unnamed: 0,day,temp,windspeed,weather
0,1/1/2017,32,6,Rain
1,1/2/2017,30,7,Sunny
2,1/3/2017,32,2,Snow
3,1/4/2017,34,6,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,32,2,Sunny


In [8]:
df['day'][df['weather']=='Rain']

0    1/1/2017
4    1/5/2017
Name: day, dtype: object

In [9]:
df.day[df.temp == df.temp.max()]

3    1/4/2017
Name: day, dtype: object

<br>
______________________________________________________________________________________________________________________________

### 7.) Group By

In [10]:
weather_data = [('1/1/2017', 'Mumbai' , 32, 6, 'Rain'),
                ('1/2/2017', 'Pune' , 30, 7, 'Sunny'),
                ('1/3/2017', 'Mumbai' , 32, 2, 'Snow'),
                ('1/4/2017', 'Pune' , 34, 6, 'Snow'),
                ('1/5/2017', 'Mumbai' , 32, 4, 'Rain'),
                ('1/6/2017', 'Delhi' , 32, 2, 'Sunny')
               ]

data = pd.DataFrame(weather_data, columns=['day', 'city', 'temp', 'windspeed', 'weather'])
data

Unnamed: 0,day,city,temp,windspeed,weather
0,1/1/2017,Mumbai,32,6,Rain
1,1/2/2017,Pune,30,7,Sunny
2,1/3/2017,Mumbai,32,2,Snow
3,1/4/2017,Pune,34,6,Snow
4,1/5/2017,Mumbai,32,4,Rain
5,1/6/2017,Delhi,32,2,Sunny


In [12]:
grp_city = data.groupby('city')
grp_city

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E3E4CA2188>

In [13]:
for city, city_data in grp_city:
    print(city)
    print(city_data)

Delhi
        day   city  temp  windspeed weather
5  1/6/2017  Delhi    32          2   Sunny
Mumbai
        day    city  temp  windspeed weather
0  1/1/2017  Mumbai    32          6    Rain
2  1/3/2017  Mumbai    32          2    Snow
4  1/5/2017  Mumbai    32          4    Rain
Pune
        day  city  temp  windspeed weather
1  1/2/2017  Pune    30          7   Sunny
3  1/4/2017  Pune    34          6    Snow


In [14]:
# Get specific group

grp_city.get_group('Mumbai')

Unnamed: 0,day,city,temp,windspeed,weather
0,1/1/2017,Mumbai,32,6,Rain
2,1/3/2017,Mumbai,32,2,Snow
4,1/5/2017,Mumbai,32,4,Rain


In [15]:
print(grp_city.max())

             day  temp  windspeed weather
city                                     
Delhi   1/6/2017    32          2   Sunny
Mumbai  1/5/2017    32          6    Snow
Pune    1/4/2017    34          7   Sunny


In [16]:
print(grp_city.mean())

        temp  windspeed
city                   
Delhi   32.0        2.0
Mumbai  32.0        4.0
Pune    32.0        6.5


In [17]:
print(grp_city.describe())

        temp                                               windspeed       \
       count  mean       std   min   25%   50%   75%   max     count mean   
city                                                                        
Delhi    1.0  32.0       NaN  32.0  32.0  32.0  32.0  32.0       1.0  2.0   
Mumbai   3.0  32.0  0.000000  32.0  32.0  32.0  32.0  32.0       3.0  4.0   
Pune     2.0  32.0  2.828427  30.0  31.0  32.0  33.0  34.0       2.0  6.5   

                                             
             std  min   25%  50%   75%  max  
city                                         
Delhi        NaN  2.0  2.00  2.0  2.00  2.0  
Mumbai  2.000000  2.0  3.00  4.0  5.00  6.0  
Pune    0.707107  6.0  6.25  6.5  6.75  7.0  


<br>
_____________________________________________________________________________________________________________________________

### 8.) Concat, Merge, Join

In [21]:
Maharashtra_weather_data = pd.DataFrame({
    'city': ['Mumbai', 'Pune', 'Thane'],
    'temperature': [32, 45, 30],
    'humidity': [80, 60, 78]
})

Maharashtra_weather_data

Unnamed: 0,city,temperature,humidity
0,Mumbai,32,80
1,Pune,45,60
2,Thane,30,78


In [22]:
Gujarat_weather_data = pd.DataFrame({
    'city': ['Surat', 'Rajkot', 'Mehsana'],
    'temperature': [21, 24, 35],
    'humidity': [68, 65, 75]
})

Gujarat_weather_data

Unnamed: 0,city,temperature,humidity
0,Surat,21,68
1,Rajkot,24,65
2,Mehsana,35,75


#### * Technique 1: Conat

##### Column Wise Concatination

In [24]:
df = pd.concat([Maharashtra_weather_data, Gujarat_weather_data])
df

Unnamed: 0,city,temperature,humidity
0,Mumbai,32,80
1,Pune,45,60
2,Thane,30,78
0,Surat,21,68
1,Rajkot,24,65
2,Mehsana,35,75


##### Row Wise Concatination

In [25]:
df = pd.concat([Maharashtra_weather_data, Gujarat_weather_data], axis=1)
df

Unnamed: 0,city,temperature,humidity,city.1,temperature.1,humidity.1
0,Mumbai,32,80,Surat,21,68
1,Pune,45,60,Rajkot,24,65
2,Thane,30,78,Mehsana,35,75


#### * Technique 2: Merge

In [26]:
temp_data = pd.DataFrame({
    'city':['Mumbai','Delhi','Banglore','Hydrabad'],
    'temp':[32,45,30,40]
})

temp_data

Unnamed: 0,city,temp
0,Mumbai,32
1,Delhi,45
2,Banglore,30
3,Hydrabad,40


In [29]:
humidity_data = pd.DataFrame({
    'city':['Mumbai','Delhi','Banglore'],
    'humidity':[62,65,70]
})

humidity_data

Unnamed: 0,city,humidity
0,Mumbai,62
1,Delhi,65
2,Banglore,70


Merge 2 dataframe without explicitly mentioning the index

In [30]:
df = pd.merge(temp_data, humidity_data, on='city')
df

Unnamed: 0,city,temp,humidity
0,Mumbai,32,62
1,Delhi,45,65
2,Banglore,30,70


#### * Technique 3: Join

In [31]:
df = pd.merge(temp_data, humidity_data, on='city',how='outer')
df

Unnamed: 0,city,temp,humidity
0,Mumbai,32,62.0
1,Delhi,45,65.0
2,Banglore,30,70.0
3,Hydrabad,40,


<br>
___________________________________________________________________________________________________________________________

### 9.)Scaling

In [34]:
data = pd.DataFrame({
    'name':['Ram', 'Lakhan', 'Shiva', 'Ria', 'Lucy', 'Suraj', 'Rohan', 'Anny', 'Priya', 'Niraj'],
    'age':[25,40,33,26,30,35,28,43,36,50],
    'Salary':[25000, 32000, 50000, 33000, 20000, 29000, 22000, 52000, 23000, 26000],
    'purchased':[1,0,1,1,0,0,0,1,0,1]
})

data

Unnamed: 0,name,age,Salary,purchased
0,Ram,25,25000,1
1,Lakhan,40,32000,0
2,Shiva,33,50000,1
3,Ria,26,33000,1
4,Lucy,30,20000,0
5,Suraj,35,29000,0
6,Rohan,28,22000,0
7,Anny,43,52000,1
8,Priya,36,23000,0
9,Niraj,50,26000,1


In [40]:
from sklearn.model_selection import train_test_split

X = data.drop(['name','purchased'], axis=1)
y = data['purchased']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [42]:
from sklearn.preprocessing import MinMaxScaler

Min_Max_scaler = MinMaxScaler()

Min_Max_X_train = Min_Max_scaler.fit_transform(X_train)
Min_Max_X_test = Min_Max_scaler.transform(X_test)

In [44]:
Min_Max_X_train

array([[0.        , 0.40625   ],
       [0.08333333, 0.0625    ],
       [0.375     , 0.28125   ],
       [1.        , 0.1875    ],
       [0.29166667, 0.9375    ],
       [0.70833333, 1.        ],
       [0.16666667, 0.        ],
       [0.58333333, 0.375     ]])

In [45]:
Min_Max_X_test

array([[ 0.41666667,  0.09375   ],
       [-0.04166667,  0.15625   ]])

In [46]:
from sklearn.preprocessing import StandardScaler

StandardScaler_scaler = StandardScaler()

StandardScaler_X_train = StandardScaler_scaler.fit_transform(X_train)
StandardScaler_X_test = StandardScaler_scaler.transform(X_test)

In [47]:
StandardScaler_X_train

array([[-1.25589856,  0.        ],
       [-0.99493263, -0.98093152],
       [-0.08155185, -0.35670237],
       [ 1.87569265, -0.62422915],
       [-0.34251779,  1.51598507],
       [ 0.96231188,  1.69433626],
       [-0.73396669, -1.1592827 ],
       [ 0.57086298, -0.08917559]])

In [48]:
StandardScaler_X_test

array([[ 0.04893111, -0.89175592],
       [-1.38638153, -0.71340474]])

<br>
_____________________________________________________________________________________________________________________________

### 10.) Extracting Date

In [49]:
from datetime import date

data = pd.DataFrame({
    'date': ['01/01/2017', '04/12/2000', '23/04/2011', '11/02/2008', '08/08/2018']
})

In [50]:
data

Unnamed: 0,date
0,01/01/2017
1,04/12/2000
2,23/04/2011
3,11/02/2008
4,08/08/2018


In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    5 non-null      object
dtypes: object(1)
memory usage: 168.0+ bytes


In [55]:
# Transform String to Date

data['date'] = pd.to_datetime(data.date, format='%d/%m/%Y')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    5 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 168.0 bytes


In [60]:
# Extracat year

data['year'] = data['date'].dt.year
data

Unnamed: 0,date,year,month
0,2017-01-01,2017,1
1,2000-12-04,2000,12
2,2011-04-23,2011,4
3,2008-02-11,2008,2
4,2018-08-08,2018,8


In [58]:
# Extract month

data['month'] = data['date'].dt.month
data

Unnamed: 0,date,year,month
0,2017-01-01,2017,1
1,2000-12-04,2000,12
2,2011-04-23,2011,4
3,2008-02-11,2008,2
4,2018-08-08,2018,8


In [61]:
# Extract day

data['day'] = data['date'].dt.day
data

Unnamed: 0,date,year,month,day
0,2017-01-01,2017,1,1
1,2000-12-04,2000,12,4
2,2011-04-23,2011,4,23
3,2008-02-11,2008,2,11
4,2018-08-08,2018,8,8


In [62]:
# Extracting passed year since the date 

data['passed_years'] = date.today().year - data['date'].dt.year

data

Unnamed: 0,date,year,month,day,passed_years
0,2017-01-01,2017,1,1,3
1,2000-12-04,2000,12,4,20
2,2011-04-23,2011,4,23,9
3,2008-02-11,2008,2,11,12
4,2018-08-08,2018,8,8,2


In [63]:
# Extracting passed month since the date 

data['passed_months'] = (date.today().year - data['date'].dt.year)*12 + (date.today().month - data['date'].dt.month)

data

Unnamed: 0,date,year,month,day,passed_years,passed_months
0,2017-01-01,2017,1,1,3,46
1,2000-12-04,2000,12,4,20,239
2,2011-04-23,2011,4,23,9,115
3,2008-02-11,2008,2,11,12,153
4,2018-08-08,2018,8,8,2,27
