# 1. Load the tips dataset.

In [42]:
import prepare

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings("ignore")

In [2]:
tips = sns.load_dataset('tips')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [5]:
tips['tip_percentage'] = tips.tip / tips.total_bill

### Create a column named price_per_person. This should be the total bill divided by the party size.

In [6]:
tips['price_per_person'] = tips.total_bill / tips.size

### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

- amount of total bill: the higher the total, the bigger the tip
- day of the week: people more likely to tip more on a friday or saturday
- size: more people means more that can tip

In [21]:
print(tips.sex.value_counts())
print(tips.day.value_counts())
print(tips.time.value_counts())
print(tips.smoker.value_counts())

Male      157
Female     87
Name: sex, dtype: int64
Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64
Dinner    176
Lunch      68
Name: time, dtype: int64
No     151
Yes     93
Name: smoker, dtype: int64


In [24]:
# converting sex to numeric
tips = tips.replace({'Female':1, 'Male':0,
              'Yes':1, 'No':0,
              'Dinner':1, 'Lunch':0,
              'Thur':0,'Fri':1,'Sat':2,'Sun':3})

In [25]:
# Splitting the data
train, validate, test = prepare.train_validate_test(tips)

train shape:  (175, 9) , validate shape:  (44, 9) , test shape:  (25, 9)

train percent:  72.0 , validate percent:  18.0 , test percent:  10.0


In [26]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,1,0,3,1,2,0.059447,0.008704
1,10.34,1.66,0,0,3,1,3,0.160542,0.005297
2,21.01,3.5,0,0,3,1,3,0.166587,0.010763
3,23.68,3.31,0,0,3,1,2,0.13978,0.012131
4,24.59,3.61,1,0,3,1,4,0.146808,0.012597


In [27]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,-2.527302,-0.621178,-5.199338,-5.199338,-5.199338,-5.199338,-0.446588,1.899709,-2.527302
1,-0.94467,-1.3116,-5.199338,5.199338,-0.578068,5.199338,-0.446588,-0.797282,-0.94467
2,-5.199338,-5.199338,5.199338,5.199338,-0.007203,5.199338,-5.199338,2.527302,-5.199338
3,0.879168,1.441564,-5.199338,5.199338,-0.007203,5.199338,1.215598,0.922396,0.879168
4,-0.028816,-0.086543,-5.199338,-5.199338,5.199338,5.199338,-0.446588,-0.292225,-0.028816


### Use all the other numeric features to predict tip amount. Use select k best to select the top 2 features. What are they?

In [33]:
X_train_tip = train.drop(columns='tip')
y_train_tip = train.tip

In [35]:
f_selector = SelectKBest(f_regression, k=2)
f_selector.fit(X_train_tip, y_train_tip)

X_reduced = f_selector.transform(X_train_tip)

print('Shape of X Train:', X_train_tip.shape)
print('Shape of y Train:', X_reduced.shape)

f_support = f_selector.get_support()

print('\nF-support:', f_support) 

f_feature = X_train_tip.loc[:,f_support].columns.tolist()

print('\n',str(len(f_feature)), 'selected features:')
print(f_feature)

Shape of X Train: (175, 8)
Shape of y Train: (175, 2)

F-support: [ True False False False False False False  True]

 2 selected features:
['total_bill', 'price_per_person']


### Use RSE

In [44]:
lm = LinearRegression()
rfe = RFE(lm, 2)

# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train_tip,y_train_tip)

#Fitting the data to model
lm.fit(X_rfe,y_train_tip)

mask = rfe.support_

rfe_features = X_train_tip.loc[:,mask].columns.tolist()

print(str(len(rfe_features)), 'selected features')
print(rfe_features)

var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()

pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

2 selected features
['total_bill', 'price_per_person']


Unnamed: 0,Var,Rank
0,total_bill,1
1,sex,7
2,smoker,4
3,day,6
4,time,5
5,size,3
6,tip_percentage,2
7,price_per_person,1


### Use all the other numeric features to predict tip percentage. Use select k to select the top 2 features. What are they?

In [39]:
X_train_pcnt = tips.drop(columns='tip_percentage')
y_train_pcnt = tips.tip_percentage

In [41]:
f_selector = SelectKBest(f_regression, k=2)
f_selector.fit(X_train_pcnt, y_train_pcnt)

X_reduced = f_selector.transform(X_train_pcnt)

print('Shape of X Train:', X_train_pcnt.shape)
print('Shape of y Train:', X_reduced.shape)

f_support = f_selector.get_support()

print('\nF-support:', f_support) 

f_feature = X_train_pcnt.loc[:,f_support].columns.tolist()

print('\n',str(len(f_feature)), 'selected features:')
print(f_feature)

Shape of X Train: (244, 8)
Shape of y Train: (244, 2)

F-support: [False  True False False False False False  True]

 2 selected features:
['tip', 'price_per_person']


### Use RSE

In [45]:
lm = LinearRegression()
rfe = RFE(lm, 2)

# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train_pcnt,y_train_pcnt)

#Fitting the data to model
lm.fit(X_rfe,y_train_pcnt)

mask = rfe.support_

rfe_features = X_train_pcnt.loc[:,mask].columns.tolist()

print(str(len(rfe_features)), 'selected features')
print(rfe_features)

var_ranks = rfe.ranking_
var_names = X_train.columns.tolist()

pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

2 selected features
['tip', 'time']


Unnamed: 0,Var,Rank
0,total_bill,4
1,sex,1
2,smoker,5
3,day,3
4,time,2
5,size,1
6,tip_percentage,6
7,price_per_person,7


### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

### Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function 

# 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

# 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

# 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).