In [1]:
from pydataset import data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing

In [2]:
# 1. Load the tips dataset.

df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


## 1a
Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
df["tip_percentage"] = df.tip / df.total_bill
df.drop(columns=['sex','smoker','day','time'], inplace = True)

## 1b.
Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
- Tip amount: total price
- Tip percentage: party size

In [4]:
df["price_per_person"] = df.total_bill / df.size

## 1c
Use select k best to select the top 2 features for predicting tip amount. What are they?

In [7]:
features = "tip"

X = df.drop(columns=[features])
y = df[features]

In [5]:
k = 2

kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=2)

kbest.fit(X, y)

kbest_features = X.columns[kbest.get_support()].tolist()

kbest_features

## 1d.
Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [8]:

k = 2

kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=2)

kbest.fit(X, y)

kbest_features = X.columns[kbest.get_support()].tolist()

kbest_features

['total_bill', 'price_per_person']

## RFE 

In [9]:
k = 2

kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=2)

kbest.fit(X, y)

kbest_features = X.columns[kbest.get_support()].tolist()

kbest_features

['total_bill', 'price_per_person']

## 1E 
Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

In [None]:
features = "tip_percentage"

X = df.drop(columns=[features])
y = df[features]

In [None]:
k = 2

kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=2)

kbest.fit(X, y)

kbest_features = X.columns[kbest.get_support()].tolist()

kbest_features

## RFE

In [None]:
## Efeatures = "tip_percentage"

eatures = "tip_percentage"

X = df.drop(columns=[features])
y = df[features]

lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=2)

rfe.fit(X, y)

rfe_columns = X.columns[rfe.support_].tolist()
rfe_columns

RFE eliminates the weakest features while kbest is looking for the strongest relationships for x and y

# 2
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [10]:
def select_kbest(X, y, k):
    '''
    Takes predictors, target, and number to select and returns the names of the top k selected features.
    '''
    #Make the object
    kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)
    #Fit the object
    kbest.fit(X, y)
    #Return the resulting list
    return X.columns[kbest.get_support()].tolist()

# 3
Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [17]:
def features_rankings(X_train, rfe):
    """
    Takes in a dataframe and a fit RFE object in order to output the rank of all features
    """
    # rfe here is reference rfe from cell 15
    var_ranks = rfe.ranking_
    var_names = X_train.columns.tolist()
    ranks = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
    ranks = ranks.sort_values(by="Rank", ascending=True)
    return ranks

In [18]:
def select_rfe(X, y, k):
    '''
    Takes predicotrs, target, and number to select and returns the top k features based on RFE class and
    a ranking for features
    '''
    #Make the model and rfe objects
    lm = sklearn.linear_model.LinearRegression()
    rfe = sklearn.feature_selection.RFE(lm, n_features_to_select=k)

    #Fit the rfe object to features
    rfe.fit(X, y)
    
    # use the rfe object
    features = X.columns[rfe.support_].tolist()
    
    #create ranking
    rankings = features_rankings(X, rfe)
    
    return features, rankings

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [19]:
# swiss dataset 

swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [20]:
features = 'Fertility'
X = swiss.drop(columns=[features])
y = swiss[features]

In [21]:
select_kbest(X, y, 3)

['Examination', 'Education', 'Catholic']

In [22]:
features, rankings = select_rfe(X, y, 3)
print(features)
rankings

['Examination', 'Education', 'Infant.Mortality']


Unnamed: 0,Var,Rank
1,Examination,1
2,Education,1
4,Infant.Mortality,1
0,Agriculture,2
3,Catholic,3
