In [24]:
import pandas as pd
import numpy as np
#import wrangle
import warnings
warnings.filterwarnings("ignore")

import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE

from sklearn.linear_model import LinearRegression

import sklearn.preprocessing

In [3]:
tips = pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
tips['tip_percentage'] = tips.tip / tips.total_bill
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [5]:
tips['price_per_person'] = tips.total_bill / tips.size
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,0.008704
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,0.005297
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,0.010763
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,0.012131
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,0.012597


price_per_person and tip_percentage have a high relationship in predicting where there is a high tip

In [10]:
tips.price_per_person

1      0.008704
2      0.005297
3      0.010763
4      0.012131
5      0.012597
         ...   
240    0.014872
241    0.013924
242    0.011614
243    0.009129
244    0.009621
Name: price_per_person, Length: 244, dtype: float64

In [12]:
tips[['price_per_person']]

Unnamed: 0,price_per_person
1,0.008704
2,0.005297
3,0.010763
4,0.012131
5,0.012597
...,...
240,0.014872
241,0.013924
242,0.011614
243,0.009129


## Scaling Data

In [35]:
scaler_standard = sklearn.preprocessing.StandardScaler()


# Fitting object and creating it
scaler_standard.fit(tips[['total_bill', 'size', 'price_per_person']])
tips[['total_bill_scaled', 'size_scaled', 'price_per_person_scaled']] = scaler_standard.transform(tips[['total_bill', 'size', 'price_per_person']])

# Fitting size
#scaler_standard.fit(tips[['size']])
#tips['size_scaled'] = scaler_standard.transform(tips[['size']])

# Fittine price per person
#scaler_standard.fit(tips[['price_per_person']])
#tips['price_per_person_scaled'] = scaler_standard.transform(tips[['price_per_person']])

In [39]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person,total_bill_scaled,size_scaled,price_per_person_scaled
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,0.008704,-0.314711,-0.600193,-0.314711
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,0.005297,-1.063235,0.453383,-1.063235
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,0.010763,0.13778,0.453383,0.13778
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,0.012131,0.438315,-0.600193,0.438315
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,0.012597,0.540745,1.506958,0.540745


In [40]:
tips[['total_bill_scaled', 'size_scaled', 'price_per_person_scaled']]

Unnamed: 0,total_bill_scaled,size_scaled,price_per_person_scaled
1,-0.314711,-0.600193,-0.314711
2,-1.063235,0.453383,-1.063235
3,0.137780,0.453383,0.137780
4,0.438315,-0.600193,0.438315
5,0.540745,1.506958,0.540745
...,...,...,...
240,1.040511,0.453383,1.040511
241,0.832275,-0.600193,0.832275
242,0.324630,-0.600193,0.324630
243,-0.221287,-0.600193,-0.221287


In [41]:
X_scaled_data = tips[['total_bill_scaled', 'size_scaled', 'price_per_person_scaled']]

## Feature engineering using SelectKBest

In [45]:
selector = SelectKBest(f_regression, k = 2)

selector.fit(X_scaled_data, tips.tip_percentage)

# boolean mask of whether the column was selected or not. 
feature_mask = selector.get_support()

# get list of top K features. 
f_feature = X_scaled_data.iloc[:,feature_mask].columns.tolist()

f_feature

['total_bill_scaled', 'price_per_person_scaled']

## Recursive Feature Engineering

In [46]:
lm = LinearRegression()

rfe = RFE(lm, 2)

rfe.fit(X_scaled_data,tips.tip_percentage)

feature_mask = rfe.support_

rfe_feature = X_scaled_data.iloc[:,feature_mask].columns.tolist()

rfe_feature

['total_bill_scaled', 'price_per_person_scaled']

## Creating Functions 

In [47]:
def select_kbest(x, y, k):
    selector = SelectKBest(f_regression, k)
    
    selector.fit(x, y)
    
    # boolean mask of whether the column was selected or not. 
    feature_mask = selector.get_support()

    # get list of top K features. 
    features  = x.iloc[:,feature_mask].columns.tolist()
    return features

In [48]:
select_kbest(X_scaled_data, tips.tip, 2)

['total_bill_scaled', 'price_per_person_scaled']

In [49]:
def rfe(x, y, k):
    lm = LinearRegression()

    rfe = RFE(lm, k)

    rfe.fit(x, y)

    feature_mask = rfe.support_

    rfe_features = x.iloc[:,feature_mask].columns.tolist()

    return rfe_features 

In [50]:
rfe(X_scaled_data, tips.tip, 2)

['total_bill_scaled', 'price_per_person_scaled']

In [51]:
swiss = pydataset.data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [None]:
swiss[['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']]

In [52]:
def scaller_standard(x):
    scaler_standard = sklearn.preprocessing.StandardScaler()
    # Fitting object and creating it
    scaler_standard.fit(x)
    swiss[['Agriculture_scaled', 'Examination_scaled', 'Education_scaled', 'Catholic_Scaled', 'Infanct.Mortality_Scaled']] = scaler_standard.transform(x)
    return swiss[['Agriculture_scaled', 'Examination_scaled', 'Education_scaled', 'Catholic_Scaled', 'Infanct.Mortality_Scaled']]

    

In [55]:
 X_scaled_swiss = scaller_standard(swiss[['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']])

In [54]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality,Agriculture_scaled,Examination_scaled,Education_scaled,Catholic_Scaled,Infanct.Mortality_Scaled
Courtelary,80.2,17.0,15,12,9.96,22.2,-1.498091,-0.188705,0.107361,-0.75581,0.783416
Delemont,83.1,45.1,6,9,84.84,22.2,-0.247441,-1.32902,-0.208012,1.059075,0.783416
Franches-Mnt,92.5,39.7,5,5,93.4,20.2,-0.487779,-1.455721,-0.628508,1.266546,0.089343
Moutier,85.8,36.5,12,7,33.77,20.3,-0.630202,-0.56881,-0.41826,-0.178721,0.124047
Neuveville,76.9,43.5,17,15,5.16,20.6,-0.318652,0.064699,0.422733,-0.872149,0.228158


In [56]:
select_kbest(X_scaled_swiss, swiss.Fertility, 3)

['Examination_scaled', 'Education_scaled', 'Catholic_Scaled']

In [57]:
rfe(X_scaled_swiss, swiss.Fertility, 3)

['Agriculture_scaled', 'Education_scaled', 'Catholic_Scaled']