In [1]:
import pandas as pd
import numpy as np
import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# 1. Load the tips dataset.
tips = pydataset.data('tips')
df= tips

In [3]:
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [4]:
# let's change smoker to binary and make time a numeric value
tips['smoker'] = (tips.smoker == 'Yes').astype(int)
tips['dinner'] = (tips.time == 'Dinner').astype(int)

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner
1,16.99,1.01,Female,0,Sun,Dinner,2,1
2,10.34,1.66,Male,0,Sun,Dinner,3,1
3,21.01,3.5,Male,0,Sun,Dinner,3,1
4,23.68,3.31,Male,0,Sun,Dinner,2,1
5,24.59,3.61,Female,0,Sun,Dinner,4,1


In [6]:
X = tips[['total_bill', 'size', 'smoker', 'dinner']]
y = tips.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# a. Create a column named tip_percentage. This should be the tip amount
# divided by the total bill.
df['tip_percentage'] = df['tip']/df['total_bill']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner,tip_percentage
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0.059447
2,10.34,1.66,Male,0,Sun,Dinner,3,1,0.160542
3,21.01,3.5,Male,0,Sun,Dinner,3,1,0.166587
4,23.68,3.31,Male,0,Sun,Dinner,2,1,0.13978
5,24.59,3.61,Female,0,Sun,Dinner,4,1,0.146808


In [8]:
# b. Create a column named price_per_person. This should be the total 
# bill divided by the party size.
df['price_per_person']= df['total_bill']/df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,dinner,tip_percentage,price_per_person
1,16.99,1.01,Female,0,Sun,Dinner,2,1,0.059447,8.495
2,10.34,1.66,Male,0,Sun,Dinner,3,1,0.160542,3.446667
3,21.01,3.5,Male,0,Sun,Dinner,3,1,0.166587,7.003333
4,23.68,3.31,Male,0,Sun,Dinner,2,1,0.13978,11.84
5,24.59,3.61,Female,0,Sun,Dinner,4,1,0.146808,6.1475


In [9]:
# c. Before using any of the methods discussed in the lesson, which features do you think 
# would be most important for predicting the tip amount? The tip percentage?

# total_bill, party size and time of meal

In [10]:
#d. Use select k best and recursive feature elimination to select the top 2 features for 
# predicting tip amount. What are they?

#kbest
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fc003255940>)

In [11]:
kbest.get_support()

array([ True,  True, False, False])

In [12]:
kbest.transform

<bound method SelectorMixin.transform of SelectKBest(k=2, score_func=<function f_regression at 0x7fc003255940>)>

In [None]:
# so total_bill, and size are the top 2 features in predicting tip amount.

In [13]:
# Recursive Feature Elimination

rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
rfe.get_support()

array([ True,  True, False, False])

In [14]:
# get ranking
rfe.ranking_

array([1, 1, 2, 3])

In [34]:
pd.Series(rfe.ranking_, index=X_train.columns)

total_bill    1
size          1
smoker        2
dinner        3
dtype: int64

In [15]:
# so recursive feature elimination ranks total_bill and size as the most important features also


In [17]:
# e. Use select k best and recursive feature elimination to select the 
# top 2 features for predicting tip percentage. What are they?

In [28]:
# kbest
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fc003255940>)

In [29]:
kbest.get_support()

array([ True,  True, False, False])

In [30]:
kbest.transform

<bound method SelectorMixin.transform of SelectKBest(k=2, score_func=<function f_regression at 0x7fc003255940>)>

In [22]:
# so total_bill and tip are the top 2 features to predict tip_percentage

In [31]:
# Recursive Feature Elimination

rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
rfe.get_support()

array([ True,  True, False, False])

In [32]:
# get ranking
rfe.ranking_

array([1, 1, 2, 3])

In [None]:
# recursive feature elimination ranks the same two as most important total_bill and size

In [33]:
pd.Series(rfe.ranking_, index=X_train.columns)

total_bill    1
size          1
smoker        2
dinner        3
dtype: int64

In [23]:
# f. Why do you think select k best and recursive feature elimination 
# might give different answers for the top features? Does this change 
# as you change the number of features your are selecting?

# They might have different answers because kbest looks at the features
# in isolation whereas recursive feature elimination looks at all the features
# together. Yes the output changes if you use additional features.


In [24]:
# 2. Write a function named select_kbest that takes in the predictors 
# (X), the target (y), and the number of features to select (k) and 
# returns the names of the top k selected features based on the 
# SelectKBest class. Test your function with the tips dataset. You 
# should see the same results as when you did the process manually.

In [37]:
k = number_of_features = 2
def select_kbest(X, y, k):
    # use kbest
    kbest = SelectKBest(f_regression, k)
    kbest.fit(X_train_scaled, y_train)
    kbest.get_support()
    kbest.transform
    return print(pd.Series(kbest.get_support(), index=X_train.columns))
    

In [25]:
# 3. Write a function named rfe that takes in the predictors, the target, 
# and the number of features to select. It should return the top k 
# features based on the RFE class. Test your function with the tips 
# dataset. You should see the same results as when you did the process 
# manually.

In [38]:
k = number_of_features =2
def rfe(pred, target,k):
    rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
    rfe.fit(X_train_scaled, y_train)
    rfe.get_support()
    rfe.ranking_
    return print(pd.Series(rfe.ranking_, index=X_train.columns))
    

In [26]:
# 4. Load the swiss dataset and use all the other features to predict 
# Fertility. Find the top 3 features using both select k best and 
# recursive feature elimination (use the functions you just built to help 
# you out).

In [39]:
# load the dataset
df = pydataset.data('swiss')

In [40]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [41]:
# use all other features to predict fertility
X = df[['Agriculture','Examination','Education','Catholic','Infant.Mortality']]
y = df.Fertility

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
# kbest
kbest = SelectKBest(f_regression, k=3)
kbest.fit(X_train_scaled, y_train)

SelectKBest(k=3, score_func=<function f_regression at 0x7fc003255940>)

In [43]:
kbest.get_support()

array([False,  True,  True,  True, False])

In [44]:
X_train.columns[kbest.get_support()]

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [45]:
X_kbest = kbest.transform(X_train_scaled)
X_kbest.shape

(37, 3)

In [49]:
pd.Series(kbest.get_support(), index=X_train.columns)

Agriculture         False
Examination          True
Education            True
Catholic             True
Infant.Mortality    False
dtype: bool

In [46]:
# recursive elimination

rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
rfe.get_support()

array([False, False,  True,  True, False])

In [47]:
# get ranking
rfe.ranking_

array([2, 4, 1, 1, 3])

In [48]:
pd.Series(rfe.ranking_, index=X_train.columns)

Agriculture         2
Examination         4
Education           1
Catholic            1
Infant.Mortality    3
dtype: int64