In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import pydataset
from sklearn.feature_selection import f_regression, SelectKBest, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Load the tips dataset

In [5]:
tips = pydataset.data('tips')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.009947
2,10.34,1.66,Male,No,Sun,Dinner,3,0.006054
3,21.01,3.5,Male,No,Sun,Dinner,3,0.012301
4,23.68,3.31,Male,No,Sun,Dinner,2,0.013864
5,24.59,3.61,Female,No,Sun,Dinner,4,0.014397


### a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [6]:
tips['price_per_person'] = tips.total_bill / tips.size
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.008704
2,10.34,1.66,Male,No,Sun,Dinner,3,0.005297
3,21.01,3.5,Male,No,Sun,Dinner,3,0.010763
4,23.68,3.31,Male,No,Sun,Dinner,2,0.012131
5,24.59,3.61,Female,No,Sun,Dinner,4,0.012597


### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? 

I'd say that the features that most effect the target is the total_bill and tip amounts.

### c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [8]:
X = tips[['size', 'total_bill', 'price_per_person']]
y = tips.tip

In [9]:
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X, y)
print('Top 2 features from k-best:')
X.columns[kbest.get_support()]

Top 2 features from k-best:


Index(['total_bill', 'price_per_person'], dtype='object')

### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [10]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2).fit(X, y)
print('Top 2 features according to RFE:')
X.columns[rfe.get_support()]

Top 2 features according to RFE:


Index(['size', 'total_bill'], dtype='object')

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [13]:
def select_kbest(X, y, k):
    kbest = SelectKBest(k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]

In [14]:
select_kbest(tips[['total_bill', 'price_per_person', 'size']], tips.tip, 2)

Index(['total_bill', 'price_per_person'], dtype='object')

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [15]:
def rfe(X, y, k, model=LinearRegression()):
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    mask = rfe.get_support()
    return X.columns[mask]

In [16]:
rfe(tips[['total_bill', 'price_per_person', 'size']], tips.tip, 2)

Index(['total_bill', 'size'], dtype='object')

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [17]:
swiss = pydataset.data('swiss')
train_validate, test = train_test_split(swiss, random_state=123, test_size=.2)
train, validate = train_test_split(train_validate, random_state=123, test_size=.2)

print(f'train: {train.shape[0]} | validate: {validate.shape[0]} | test: {test.shape[0]}')

train: 29 | validate: 8 | test: 10


In [18]:
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

In [19]:
select_kbest(X_train, y_train, 3)

Index(['Education', 'Catholic', 'Infant.Mortality'], dtype='object')

In [20]:
rfe(X_train, y_train, 3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')