In [40]:
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
import sklearn.preprocessing

In [165]:
# Load the tips dataset from pydataset.
tips = data('tips')
# Work on a copy so the column additions/removals in later cells never
# mutate the freshly loaded `tips` frame.
df = tips.copy()
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4



### a. Create a column named price_per_person. This should be the total bill divided by the party size.


In [166]:
# Per-person spend: the total bill divided by the party size.
df['price_per_person'] = df['total_bill'].div(df['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [167]:
# Keep only the numeric columns. Reassigning (instead of inplace=True) and
# passing errors='ignore' means re-running this cell does not raise KeyError
# once the columns are already gone.
categorical_cols = ['sex', 'smoker', 'day', 'time']
df = df.drop(columns=categorical_cols, errors='ignore')
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person
1,16.99,1.01,2,8.495
2,10.34,1.66,3,3.446667
3,21.01,3.5,3,7.003333
4,23.68,3.31,2,11.84
5,24.59,3.61,4,6.1475


### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- #### Ans: total_bill


In [168]:
# Split tips into train / validate / test: 20% is held out as test, then 30%
# of the remainder becomes validate. The fixed random_state keeps the split
# reproducible between runs.
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)


In [169]:
# (rows, columns) of each split, in train / validate / test order.
print(*(split.shape for split in (train, validate, test)))

(136, 4) (59, 4) (49, 4)


In [170]:
# Separate the predictors (X) from the target column `tip` (y) for every split.
X_train, y_train = train.drop(columns='tip'), train['tip']
X_validate, y_validate = validate.drop(columns='tip'), validate['tip']
X_test, y_test = test.drop(columns='tip'), test['tip']

In [149]:
# Preview the unscaled training predictors.
X_train.head()

Unnamed: 0,total_bill,size,price_per_person
19,16.97,3,5.656667
173,7.25,2,3.625
119,12.43,2,6.215
29,21.7,2,10.85
238,32.83,2,16.415


In [171]:
# Min-max scale the predictors. The scaler is fitted on the training split
# only and then applied unchanged to validate and test.
scaler = sklearn.preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validate_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_validate, X_test)
)

In [181]:
# Wrap the scaled array in a DataFrame, restoring both the column names and
# the original row labels so the rows still align with y_train by index.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_train_scaled_df.head()

Unnamed: 0,total_bill,size,price_per_person
0,0.307114,0.4,0.150344
1,0.092355,0.2,0.032258
2,0.206805,0.2,0.182796
3,0.411622,0.2,0.452194
4,0.657534,0.2,0.775647


### c. Use SelectKBest to select the top 2 features for predicting tip amount. What are they?


In [173]:
# Fit a SelectKBest selector (f_regression scores, keep 2 features) on the
# scaled training predictors; the fitted selector is shown as the cell output.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest = kbest.fit(X_train_scaled, y_train)
kbest

SelectKBest(k=2, score_func=<function f_regression at 0x7fad2ad14430>)

In [174]:
# Per-feature p-values and scores from the fitted selector, labelled by column.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,7.18647e-20,115.984909
size,1.341642e-12,61.259089
price_per_person,0.001310327,10.777792


In [180]:
# Names of the selected features, read off the selector's boolean mask.
feature_mask = kbest.get_support()
f_feature = list(X_train_scaled_df.columns[feature_mask])
f_feature

['total_bill', 'size']

In [177]:
# Same answer, this time indexing the columns by the selected positions.
X_train.columns[kbest.get_support(indices=True)]

Index(['total_bill', 'size'], dtype='object')

In [178]:
# Reduce the training predictors to the columns the selector kept.
# NOTE(review): kbest was fitted on the scaled array but is applied to the
# unscaled X_train here, so the result holds raw values — confirm intended.
selected_columns = X_train.columns[kbest.get_support()]
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train),
    columns=selected_columns,
    index=X_train.index,
)
X_train_transformed.head()

Unnamed: 0,total_bill,size
19,16.97,3.0
173,7.25,2.0
119,12.43,2.0
29,21.7,2.0
238,32.83,2.0


### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [184]:
# Recursive feature elimination driven by a linear regression, keeping 2 features.
# NOTE: a later cell defines a function that is also named `rfe`; once that
# cell has run, this name no longer refers to the fitted selector.
lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=2)
rfe = rfe.fit(X_train_scaled, y_train)
rfe

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [188]:
# Names of the features RFE kept, read off its boolean support mask.
feature_mask = rfe.support_
rfe_features = list(X_train_scaled_df.columns[feature_mask])
rfe_features

['total_bill', 'price_per_person']

In [189]:
# RFE rank for every feature, labelled by column name.
pd.Series(rfe.ranking_, index=X_train_scaled_df.columns, name='rfe_ranking').to_frame()

Unnamed: 0,rfe_ranking
total_bill,1
size,2
price_per_person,1


### e. Why do you think SelectKBest and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

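- Ans: The top features differ between the two methods. SelectKBest scores each feature on its own against the target, so it does not consider interactions or redundancy between features. RFE fits the model on all remaining features and repeatedly eliminates the least important one according to the model, so its choice depends on how the features work together. The amount of agreement can change with the number of features selected; for example, selecting all three features here would trivially give the same set from both methods.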

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [198]:
def select_kbest(X, y, k, score_func=None):
    """Return the names of the top-k features chosen by SelectKBest.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; the column labels are used as the feature names.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to keep.
    score_func : callable, optional
        Scoring function handed to SelectKBest. Defaults to f_regression,
        which is what this function always used before the parameter existed.

    Returns
    -------
    list
        Labels of the selected columns, in their original column order.
    """
    if score_func is None:
        # Resolved at call time so existing three-argument callers keep
        # the original f_regression behaviour.
        score_func = f_regression
    selector = SelectKBest(score_func, k=k).fit(X, y)
    return X.columns[selector.get_support()].tolist()


    
# Check against the manual result above, using the scaled tips predictors.
select_kbest(X_train_scaled_df, y_train, k=2)

['total_bill', 'size']

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [194]:
def rfe(X, y, k, estimator=None):
    """Return the names of the k features kept by recursive feature elimination.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; the column labels are used as the feature names.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to keep (passed as n_features_to_select).
    estimator : estimator object, optional
        Model handed to RFE. Defaults to a new LinearRegression(), which is
        what this function always used before the parameter existed.

    Returns
    -------
    list
        Labels of the retained columns, in their original column order.
    """
    if estimator is None:
        estimator = LinearRegression()
    # Named `selector` so the local does not shadow this function's own name.
    selector = RFE(estimator, n_features_to_select=k).fit(X, y)
    return X.columns[selector.support_].tolist()
    

In [195]:
# Check against the manual RFE result above, using the scaled tips predictors.
rfe(X_train_scaled_df, y_train, k=2)

['total_bill', 'price_per_person']

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [122]:
# Load the swiss dataset from pydataset and preview the first rows.
swiss= data('swiss')

swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [123]:
# Column dtypes and non-null counts for the swiss frame.
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [124]:
# Split swiss with the same settings as tips: 20% held out as test, then 30%
# of the remainder as validate, with a fixed random_state for reproducibility.
# NOTE: this rebinds train_validate / train / validate / test, so the tips
# splits are no longer reachable under these names after this cell.
train_validate, test = train_test_split(swiss, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)



In [125]:
# (rows, columns) of the swiss training split.
print(train.shape)

(25, 6)


In [126]:
# Separate the predictors (X) from the target column `Fertility` (y) per split.
X_train, y_train = train.drop(columns='Fertility'), train['Fertility']
X_validate, y_validate = validate.drop(columns='Fertility'), validate['Fertility']
X_test, y_test = test.drop(columns='Fertility'), test['Fertility']

In [128]:
# Min-max scale the swiss predictors. The scaler is fitted on the training
# split only and then applied to train, validate and test.
scaler = sklearn.preprocessing.MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

In [130]:
# Rebuild the scaled training predictors as a labelled DataFrame, keeping the
# column names and the original row labels so rows stay aligned with y_train.
# Recomputing from the fitted scaler (rather than wrapping the existing
# X_train_scaled) keeps this cell safe to re-run.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)


In [132]:
# (rows, columns) of the scaled swiss training predictors.
X_train_scaled.shape

(25, 5)

In [133]:
# Top 3 swiss features according to SelectKBest.
select_kbest(X_train_scaled, y_train, k=3)

['Examination', 'Catholic', 'Infant.Mortality']

In [134]:
# Top 3 swiss features according to recursive feature elimination.
rfe(X_train_scaled, y_train, k=3)

['Agriculture', 'Examination', 'Infant.Mortality']