In [8]:
# imports
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression
from pydataset import data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

### Load the tips dataset.

In [2]:
# loading the tips sample dataset through seaborn
df = sns.load_dataset(name="tips")

# previewing the first five rows
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
# tip_percentage is the tip divided by the total bill
# (stored as a fraction of the bill, e.g. 0.16, not multiplied by 100)
df['tip_percentage'] = df['tip'].div(df['total_bill'])

# previewing the frame with the new column
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


### Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
# price_per_person is the total bill divided by the party size
# ('size' must be accessed with brackets because DataFrame.size is an attribute)
df['price_per_person'] = df['total_bill'].div(df['size'])

# previewing the frame with the new column
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [26]:
# recording initial guesses before running any feature selection (one statement per output line)
print(
    'Tip percentage may be the best feature for predicting tip amount because it can be used to calculate the exact tip based on the total.',
    'Outside of tip percentage, I think size may have a strong correlation since people tend to tip more as the size of their party grows.',
    sep='\n',
)

Tip percentage may be the best feature for predicting tip amount because it can be used to calculate the exact tip based on the total.
Outside of tip percentage, I think size may have a strong correlation since people tend to tip more as the size of their party grows.


### Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

#### Using k best to find top 2 features for predicting tip amount

In [46]:
# creating feature selector object
# f_regression scores each feature on its own with a univariate linear F-test against the target
# k = 2 means we only want the top 2 features at output
f_selector_for_tip = SelectKBest(f_regression, k=2)

# setting x to feature columns
# setting y to the target as a 1-D Series: f_regression expects a 1-D target, and a
# single-column DataFrame (df[['tip']]) is only accepted through a DataConversionWarning
# that the notebook-wide warnings filter would hide
x = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df['tip']

# fitting selector to data
f_selector_for_tip.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the two selected
# false otherwise
f_support = f_selector_for_tip.get_support()

# creating list of the top 2 feature names using boolean mask
f_feature = x.columns[f_support].tolist()

# printing results
print(len(f_feature), 'k best selected features for predicting tip amount')
print(f_feature)

2 k best selected features for predicting tip amount
['total_bill', 'size']


#### Using recursive feature elimination to find top 2 features for predicting tip amount

In [45]:
# setting x to feature columns
# setting y to the target variable as a 1-D Series
x = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df['tip']

# creating linear regression object
# RFE ranks features by the magnitude of this model's coefficients
lm = LinearRegression()

# creating RFE object
# specifying to use our linear regression object and only pick top 2 features
# n_features_to_select is passed by keyword: the positional form was deprecated in
# scikit-learn 0.23 and raises a TypeError from 1.0 onwards
# named rfe_selector so it cannot clobber the rfe() helper function defined later in this notebook
rfe_selector = RFE(lm, n_features_to_select=2)

# fitting the rfe object to the data
# RFE fits its own clones of lm internally, so lm does not need a separate fit for the selection
rfe_selector.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the two selected
# false otherwise
mask = rfe_selector.support_

# creating list of the top 2 feature names using boolean mask
rfe_features = x.columns[mask].tolist()

# displaying results
print(len(rfe_features), 'RFE selected features for predicting tip amount')
print(rfe_features)

2 RFE selected features for predicting tip amount
['total_bill', 'tip_percentage']


### Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

#### Using k best to find top 2 features for predicting tip_percentage

In [47]:
# creating feature selector object
# f_regression scores each feature on its own with a univariate linear F-test against the target
# k = 2 means we only want the top 2 features at output
f_selector_for_tip_percentage = SelectKBest(f_regression, k=2)

# setting x to feature columns
# setting y to the target as a 1-D Series: f_regression expects a 1-D target, and a
# single-column DataFrame (df[['tip_percentage']]) is only accepted through a
# DataConversionWarning that the notebook-wide warnings filter would hide
x = df[['total_bill', 'size', 'tip', 'price_per_person']]
y = df['tip_percentage']

# fitting selector to data
f_selector_for_tip_percentage.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the two selected
# false otherwise
f_support = f_selector_for_tip_percentage.get_support()

# creating list of the top 2 feature names using boolean mask
f_feature = x.columns[f_support].tolist()

# printing results
print(len(f_feature), 'k best selected features for predicting tip percentage')
print(f_feature)

2 k best selected features for predicting tip percentage
['total_bill', 'tip']


#### Using recursive feature elimination to find top 2 features for predicting tip percentage

In [5]:
# setting x to feature columns
# setting y to the target variable as a 1-D Series
x = df[['total_bill', 'size', 'tip', 'price_per_person']]
y = df['tip_percentage']

# creating linear regression object
# RFE ranks features by the magnitude of this model's coefficients
lm = LinearRegression()

# creating RFE object
# specifying to use our linear regression object and only pick top 2 features
# n_features_to_select is passed by keyword: the positional form was deprecated in
# scikit-learn 0.23 and raises a TypeError from 1.0 onwards
# named rfe_selector so it cannot clobber the rfe() helper function defined later in this notebook
rfe_selector = RFE(lm, n_features_to_select=2)

# fitting the rfe object to the data
# RFE fits its own clones of lm internally, so lm does not need a separate fit for the selection
rfe_selector.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the two selected
# false otherwise
mask = rfe_selector.support_

# creating list of the top 2 feature names using boolean mask
rfe_features = x.columns[mask].tolist()

# displaying results
print(len(rfe_features), 'RFE selected features for predicting tip percentage')
print(rfe_features)

2 RFE selected features for predicting tip percentage
['size', 'tip']


#### Why do you think select k best and recursive feature elimination might give different answers for the top features? 

They give different answers because their methods for determining best features are fairly different.

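K best calculates a specified test statistic (such as chi square or F-regression) for each provided, non-target variable on its own. It then selects the k variables with the highest scores, i.e. the strongest individual relationship with the target. Because every variable is scored independently, K best does not account for correlation between the variables themselves, so it can select several features that carry largely the same information.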

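RFE instead fits a model (here, linear regression) on all of the provided non-target variables, removes the variable that is least important to that model (the one with the smallest coefficient magnitude), and repeats on the remaining variables until only the requested number is left. In effect, because it's actually fitting models and judging features together as its means of feature selection, it can find a feature set that works better in combination, although its ranking depends on the model used and on the scale of each feature.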

### Does this change as you change the number of features you are selecting?

#### Using k best to find top 3 features for predicting tip amount

In [15]:
# creating feature selector object
# f_regression scores each feature on its own with a univariate linear F-test against the target
# k = 3 means we only want the top 3 features at output
f_selector_for_tip = SelectKBest(f_regression, k=3)

# setting x to feature columns
# setting y to the target as a 1-D Series: f_regression expects a 1-D target, and a
# single-column DataFrame (df[['tip']]) is only accepted through a DataConversionWarning
# that the notebook-wide warnings filter would hide
x = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df['tip']

# fitting selector to data
f_selector_for_tip.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the three selected
# false otherwise
f_support = f_selector_for_tip.get_support()

# creating list of the top 3 feature names using boolean mask
f_feature = x.columns[f_support].tolist()

# printing results
print(len(f_feature), 'k best selected features for predicting tip amount')
print(f_feature)

3 k best selected features for predicting tip amount
['total_bill', 'size', 'price_per_person']


#### Using recursive feature elimination to find top 3 features for predicting tip amount

In [17]:
# setting x to feature columns
# setting y to the target variable as a 1-D Series
x = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df['tip']

# creating linear regression object
# RFE ranks features by the magnitude of this model's coefficients
lm = LinearRegression()

# creating RFE object
# specifying to use our linear regression object and only pick top 3 features
# n_features_to_select is passed by keyword: the positional form was deprecated in
# scikit-learn 0.23 and raises a TypeError from 1.0 onwards
# named rfe_selector so it cannot clobber the rfe() helper function defined later in this notebook
rfe_selector = RFE(lm, n_features_to_select=3)

# fitting the rfe object to the data
# RFE fits its own clones of lm internally, so lm does not need a separate fit for the selection
rfe_selector.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the three selected
# false otherwise
mask = rfe_selector.support_

# creating list of the top 3 feature names using boolean mask
rfe_features = x.columns[mask].tolist()

# display results
print(len(rfe_features), 'RFE selected features for predicting tip amount')
print(rfe_features)

3 RFE selected features for predicting tip amount
['total_bill', 'size', 'tip_percentage']


#### Using k best to find top 3 features for predicting tip_percentage

In [48]:
# creating feature selector object
# f_regression scores each feature on its own with a univariate linear F-test against the target
# k = 3 means we only want the top 3 features at output
f_selector_for_tip_percentage = SelectKBest(f_regression, k=3)

# setting x to feature columns
# setting y to the target as a 1-D Series: f_regression expects a 1-D target, and a
# single-column DataFrame (df[['tip_percentage']]) is only accepted through a
# DataConversionWarning that the notebook-wide warnings filter would hide
x = df[['total_bill', 'size', 'tip', 'price_per_person']]
y = df['tip_percentage']

# fitting selector to data
f_selector_for_tip_percentage.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the three selected
# false otherwise
f_support = f_selector_for_tip_percentage.get_support()

# creating list of the top 3 feature names using boolean mask
f_feature = x.columns[f_support].tolist()

# displaying results
print(len(f_feature), 'k best selected features for predicting tip percentage')
print(f_feature)

3 k best selected features for predicting tip percentage
['total_bill', 'tip', 'price_per_person']


#### Using recursive feature elimination to find top 3 features for predicting tip percentage

In [20]:
# setting x to feature columns
# setting y to the target variable as a 1-D Series
x = df[['total_bill', 'size', 'tip', 'price_per_person']]
y = df['tip_percentage']

# creating linear regression object
# RFE ranks features by the magnitude of this model's coefficients
lm = LinearRegression()

# creating RFE object
# specifying to use our linear regression object and only pick top 3 features
# n_features_to_select is passed by keyword: the positional form was deprecated in
# scikit-learn 0.23 and raises a TypeError from 1.0 onwards
# named rfe_selector so it cannot clobber the rfe() helper function defined later in this notebook
rfe_selector = RFE(lm, n_features_to_select=3)

# fitting the rfe object to the data
# RFE fits its own clones of lm internally, so lm does not need a separate fit for the selection
rfe_selector.fit(x, y)

# storing array of boolean values that reflect true if a feature was one of the three selected
# false otherwise
mask = rfe_selector.support_

# creating list of the top 3 feature names using boolean mask
rfe_features = x.columns[mask].tolist()

# display results
print(len(rfe_features), 'RFE selected features for predicting tip percentage')
print(rfe_features)

3 RFE selected features for predicting tip percentage
['size', 'tip', 'price_per_person']


 <font size="4">  Answer: No. After changing the number of features selected to 3 for both methods and both target variables, K Best and RFE still chose different feature sets.</font>

### Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [50]:
# creating function with 3 parameters
def select_kbest(x, y, kf):
    """
    Return the names of the kf best features in x for predicting y, using SelectKBest.

    Each feature is scored individually with f_regression and the kf highest-scoring
    features are kept.

    Parameters
    ----------
    x : pandas DataFrame holding the candidate feature columns
    y : target values; a Series, a 1-D array, or a single-column DataFrame
    kf : number of features to select (passed to SelectKBest as k)

    Returns
    -------
    list of str : names of the selected columns, in the order they appear in x
    """
    # f_regression expects a 1-D target; a single-column 2-D target (e.g. df[['tip']])
    # is flattened here so the function does not depend on a suppressed DataConversionWarning
    target = np.asarray(y)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    # creating feature selector object
    # specifying to use f-regression as test for determining feature correlation to target variable
    # number of features to select as best features will be specified by user (kf)
    f_selector = SelectKBest(f_regression, k=kf)

    # fitting selector to x and the flattened target
    f_selector.fit(x, target)

    # boolean mask that is true for each of the kf selected features and false otherwise
    f_support = f_selector.get_support()

    # returning selected feature names as a list
    return x.columns[f_support].tolist()

In [51]:
# testing function to see if results are same as before
x = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df[['tip']]

select_kbest(x,y,2)

['total_bill', 'size']

In [52]:
# testing function to see if results are same as before
x = df[['total_bill', 'size', 'tip', 'price_per_person']]
y = df[['tip_percentage']]

select_kbest(x,y,2)

['total_bill', 'tip']

 <font size="4"> Answer: Function works and produces the same features as its manual counterpart.</font>

### Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [54]:
# creating function with 3 parameters
def rfe(x, y, kf):
    """
    Return the names of the kf features in x chosen by recursive feature elimination.

    A LinearRegression estimator is wrapped in scikit-learn's RFE, which is fit on x and y
    and reports which kf features it kept.

    Parameters
    ----------
    x : pandas DataFrame holding the candidate feature columns
    y : target values (Series or single-column DataFrame)
    kf : number of features to select (passed to RFE as n_features_to_select)

    Returns
    -------
    list of str : names of the selected columns, in the order they appear in x
    """
    # creating rfe object around a linear regression estimator
    # n_features_to_select is passed by keyword: the positional form was deprecated in
    # scikit-learn 0.23 and raises a TypeError from 1.0 onwards
    # the local is named selector so it does not shadow this function's own name
    selector = RFE(LinearRegression(), n_features_to_select=kf)

    # fitting rfe object to provided data
    # RFE fits its own clones of the estimator, so no separate LinearRegression fit is needed
    selector.fit(x, y)

    # support_ is a boolean mask that is true for each feature that was kept
    # returning the matching column names as a list
    return x.columns[selector.support_].tolist()

In [55]:
# testing function to confirm its results match the previous ones
x = df[['total_bill', 'size', 'tip_percentage', 'price_per_person']]
y = df[['tip']]

rfe(x, y, 2)

['total_bill', 'tip_percentage']

In [56]:
# testing function to confirm its results match the previous ones
x = df[['total_bill', 'size', 'tip', 'price_per_person']]
y = df[['tip_percentage']]

rfe(x, y, 2)

['size', 'tip']

 <font size="4"> Answer: Function works and produces the same features as its manual counterpart. </font>

### Load the swiss dataset and use all the other features to predict Fertility. 

In [12]:
# acquiring data
swiss = data('swiss')

swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


### Find the top 3 features using select k best.

In [59]:
# x holds every column except the target
# y holds the target variable, Fertility
x = swiss.drop(columns=['Fertility'])
y = swiss.loc[:, 'Fertility']

# reusing select_kbest from the earlier question to find the top 3 features
kbest_fertility = select_kbest(x, y, kf=3)

# displaying results
print(f'The 3 best features for modeling selected by kbest are {kbest_fertility}.')

The 3 best features for modeling selected by kbest are ['Examination', 'Education', 'Catholic'].


### Find the top 3 features using recursive feature elimination.

In [60]:
# x holds every column except the target
# y holds the target variable, Fertility
x = swiss.drop(columns=['Fertility'])
y = swiss.loc[:, 'Fertility']

# reusing the rfe function from the earlier question to find the top 3 features
rfe_fertility = rfe(x, y, kf=3)

# displaying results
print(f'The 3 best features for modeling selected by RFE are {rfe_fertility}.')

The 3 best features for modeling selected by RFE are ['Examination', 'Education', 'Infant.Mortality'].
