In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from datetime import date
import seaborn as sns
from pydataset import data
from env import host, user, password
import os
from sklearn.feature_selection import SelectKBest, f_regression
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [2]:
# Load the `tips` dataset through pydataset's `data` helper.
tips = data('tips')
# Bare last expression: the notebook renders the frame as this cell's output.
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Create a column named tip_percentage. This should be the tip amount divided by the total bill.

# The value is kept as a plain ratio (tip / total_bill); it is not multiplied by 100.
tips = tips.assign(tip_percentage=tips['tip'].div(tips['total_bill']))

In [4]:
# Create a column named price_per_person. This should be the total bill divided by the party size.

# Bracket access is required for 'size': attribute access would return DataFrame.size instead of the column.
tips = tips.assign(price_per_person=tips['total_bill'].div(tips['size']))

In [22]:
# Summarize dtypes and non-null counts; used here to confirm the two derived columns are present.
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   tip_percentage    244 non-null    float64
 8   price_per_person  244 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 19.1+ KB


In [6]:
# Before using any of the methods discussed in the lesson, which features do you think would be most important 
# for predicting the tip amount? The tip percentage?

# I predict that total_bill will be the best indicator of tip amount

In [7]:
# Keep only the numeric columns for scaling and feature selection.
# Selecting by dtype (instead of naming the categorical columns to drop) matches what
# select_kbest does further down and keeps working if the set of text columns changes.
num_tips = tips.select_dtypes(include='number')

In [8]:
# Display the numeric-only frame to check which columns remain.
num_tips

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
1,16.99,1.01,2,0.059447,8.495000
2,10.34,1.66,3,0.160542,3.446667
3,21.01,3.50,3,0.166587,7.003333
4,23.68,3.31,2,0.139780,11.840000
5,24.59,3.61,4,0.146808,6.147500
...,...,...,...,...,...
240,29.03,5.92,3,0.203927,9.676667
241,27.18,2.00,2,0.073584,13.590000
242,22.67,2.00,2,0.088222,11.335000
243,17.82,1.75,2,0.098204,8.910000


In [9]:
# Fit a min-max scaler on every column of num_tips (targets included; they are dropped after scaling).
scaler_minmax = sklearn.preprocessing.MinMaxScaler().fit(num_tips)
# Echo the fitted scaler so the cell still shows its repr as output.
scaler_minmax

MinMaxScaler()

In [10]:
# Scale the numeric columns and wrap the resulting array back into a DataFrame.
# Column labels and the row index are taken from num_tips itself rather than from a
# hand-written {position: name} mapping, so labels cannot drift out of step with the
# data and the rows stay index-aligned with `tips`.
tips_minmax = pd.DataFrame(
    scaler_minmax.transform(num_tips),
    columns=num_tips.columns,
    index=num_tips.index,
)

In [11]:
# 'tip' is the target in the next two cells, so take it out of the predictor frame.
tips_minmax = tips_minmax.drop(columns='tip')

In [12]:
# Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination 
# to select the top 2 features. What are they?
# "total_bill" and "size"

# Score each scaled predictor against the tip amount with f_regression and keep the best two.
f_selector = SelectKBest(score_func=f_regression, k=2).fit(tips_minmax, tips['tip'])
# Boolean mask over the predictor columns: True where the column was kept.
feature_mask = f_selector.get_support()
f_feature = list(tips_minmax.columns[feature_mask])
f_feature

['total_bill', 'size']

In [13]:
# Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination 
# to select the top 2 features. What are they?
# "total_bill" and "tip_percentage"

lm = LinearRegression()
# n_features_to_select is passed by keyword: current scikit-learn releases accept only the
# estimator positionally, so RFE(lm, 2) raises a TypeError there.
rfe = RFE(lm, n_features_to_select=2)
rfe.fit(tips_minmax, tips['tip'])
# Boolean mask over the predictor columns: True for the features RFE kept.
feature_mask_rfe = rfe.support_
rfe_feature = tips_minmax.columns[feature_mask_rfe].tolist()
rfe_feature



['total_bill', 'tip_percentage']

In [14]:
# Rebuild the full scaled frame, because the previous tips_minmax had 'tip' removed.
# Column labels and the row index come from num_tips itself rather than from a
# hand-written {position: name} mapping, so labels cannot drift out of step with the
# data and the rows stay index-aligned with `tips`.
tips_minmax = pd.DataFrame(
    scaler_minmax.transform(num_tips),
    columns=num_tips.columns,
    index=num_tips.index,
)

In [15]:
# 'tip_percentage' is the target in the next two cells, so take it out of the predictor frame.
tips_minmax = tips_minmax.drop(columns='tip_percentage')

In [16]:
# Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination
# to select the top 2 features. What are they?
# "total_bill" and "tip"

# Score each scaled predictor against the tip percentage with f_regression and keep the best two.
f_selector = SelectKBest(score_func=f_regression, k=2).fit(tips_minmax, tips['tip_percentage'])
# Boolean mask over the predictor columns: True where the column was kept.
feature_mask = f_selector.get_support()
f_feature = list(tips_minmax.columns[feature_mask])
f_feature

['total_bill', 'tip']

In [17]:
# Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination
# to select the top 2 features. What are they?
# "total_bill" and "tip"

lm = LinearRegression()
# n_features_to_select is passed by keyword: current scikit-learn releases accept only the
# estimator positionally, so RFE(lm, 2) raises a TypeError there.
rfe = RFE(lm, n_features_to_select=2)
rfe.fit(tips_minmax, tips['tip_percentage'])
# Boolean mask over the predictor columns: True for the features RFE kept.
feature_mask_rfe = rfe.support_
rfe_feature = tips_minmax.columns[feature_mask_rfe].tolist()
rfe_feature



['total_bill', 'tip']

In [18]:
# Why do you think select k best and recursive feature elimination might give different answers for the top 
# features? Does this change as you change the number of features you are selecting?

# They have different outputs because of the way each operates. K best scores each feature on its own with a
# univariate statistical test against the target (f_regression here) and keeps the k highest-scoring features;
# it does not look at how the features relate to each other. Recursive elimination repeatedly fits a model and
# drops the feature that is weighed the least in that model until the requested number of features is left.

In [27]:
# Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of 
# features to select (k) and returns the names of the top k selected features based on the SelectKBest class.
# Test your function with the tips dataset. You should see the same results as when you did the process manually.

def select_kbest(db, target, num_features):
    """Return the names of the top ``num_features`` numeric predictors of ``target``.

    The numeric columns of ``db`` other than ``target`` are min-max scaled and
    then ranked with ``SelectKBest`` using the ``f_regression`` score function.

    Parameters
    ----------
    db : pandas.DataFrame
        Frame holding both the candidate predictors and the target column.
        Non-numeric columns are ignored; ``db`` itself is not modified.
    target : str
        Name of the numeric column in ``db`` to predict.
    num_features : int or "all"
        Number of features to keep; passed straight through as ``k`` to
        ``SelectKBest``.

    Returns
    -------
    list of str
        Names of the selected columns, in the order they appear in ``db``.

    Raises
    ------
    KeyError
        If ``target`` is not one of the numeric columns of ``db``.
    """
    # 'number' covers every numeric width, not only float64/int64, so e.g. int32 or
    # float32 predictors are no longer silently left out of the ranking.
    numeric = db.select_dtypes(include='number')
    if target not in numeric.columns:
        raise KeyError(f"target column {target!r} is not a numeric column of db")

    # Only the predictors need scaling; min-max scaling is applied per column, so
    # leaving the target out does not change the scaled predictor values.
    predictors = numeric.drop(columns=[target])
    scaled = pd.DataFrame(
        sklearn.preprocessing.MinMaxScaler().fit_transform(predictors),
        columns=predictors.columns,  # labels come from the data, not from positions
        index=predictors.index,
    )

    selector = SelectKBest(f_regression, k=num_features).fit(scaled, numeric[target])
    # get_support() is a boolean mask over the predictor columns.
    return scaled.columns[selector.get_support()].tolist()

In [28]:
# Check the function against the manual SelectKBest run for 'tip' above; both should list the same two features.
select_kbest(tips, 'tip', 2)

['total_bill', 'size']