In [165]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pydataset import data
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression

# Feature Engineering

#### 1)
Load the tips dataset.

In [166]:
# Fetch the 'tips' sample dataset through pydataset's `data` loader
# and bind it to the working DataFrame `df`.
df = data('tips')

# Preview the first rows to confirm which columns were loaded.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


#### a)
Create a column named price_per_person. This should be the total bill divided by the party size.

In [167]:
# Per-person cost: total bill divided by party size, rounded to two decimals.
# Bracket access is required for 'size' because `df.size` is the DataFrame's
# own element-count attribute, not the column.
df['price_per_person'] = df['total_bill'].div(df['size']).round(2)

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15


#### b) 
Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

total_bill, price_per_person and time

#### c)
Use Select K Best to select the top 2 features for predicting tip amount. What are they?

In [168]:
# One-hot encode the categorical columns in a single chain:
#   * 'sex', 'smoker' and 'time' use drop_first=True, leaving one 0/1 column each;
#   * 'day' keeps an indicator column for every category.
# NOTE(review): keeping every 'day' indicator means they sum to 1 on each row, which
# is collinear with a fitted intercept - consider drop_first=True here as well.
# This cell consumes the original categorical columns, so reload `df` before
# running it a second time.
df = (
    df
    .pipe(pd.get_dummies, columns=['sex', 'smoker', 'time'], drop_first=True)
    .pipe(pd.get_dummies, columns=['day'])
)

In [169]:
# Min-max scale every predictor; the target 'tip' stays in its original units.
mms = MinMaxScaler()

# Labels of every column except the target.
to_scale = df.columns.drop('tip')

# Fit the scaler on the predictors and overwrite them with their scaled values.
df[to_scale] = mms.fit_transform(df[to_scale])

df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Male,smoker_Yes,time_Lunch,day_Fri,day_Sat,day_Sun,day_Thur
1,0.291579,1.01,0.2,0.322599,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.152283,1.66,0.4,0.032777,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.375786,3.5,0.4,0.236918,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.431713,3.31,0.2,0.515239,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.450775,3.61,0.6,0.188039,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [170]:
# Predictors: every column of the encoded, scaled frame except the target 'tip'.
X = df.drop(columns=['tip'])

# Target: the (unscaled) 'tip' column.
y = df['tip']

In [171]:
# Configure SelectKBest to score features with f_regression and keep the top 2.
skb = SelectKBest(score_func=f_regression, k=2)

# Fit the selector on the predictors (X) and the target (y).
# As the last expression in the cell, the fitted selector is also displayed.
skb.fit(X, y)

In [172]:
# Boolean mask from the fitted selector: one entry per column of X, True where the feature was kept.
skb_mask = skb.get_support()

# Index the column labels with the mask to display the names of the selected features.
X.columns[skb_mask]

Index(['total_bill', 'size'], dtype='object')

#### d)
Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?

In [173]:
# Linear regression is the estimator RFE will refit while eliminating features.
lm = LinearRegression()

# Wrap the estimator in Recursive Feature Elimination (RFE), keeping the top 2 features.
# NOTE: a later cell defines a function that is also called `rfe`, which rebinds this
# name - running these cells out of order will clobber one or the other.
rfe = RFE(estimator=lm, n_features_to_select=2)

# Fit RFE on the predictors (X) and the target (y).
# As the last expression in the cell, the fitted selector is also displayed.
rfe.fit(X, y)

In [174]:
# Boolean mask from the fitted RFE object: one entry per column of X, True where the feature was kept.
rfe_mask = rfe.get_support()

# Index the column labels with the mask to display the names of the selected features.
X.columns[rfe_mask]

Index(['total_bill', 'price_per_person'], dtype='object')

#### e)
Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

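SelectKBest scores each feature on its own (here with the univariate `f_regression` test against the target) and keeps the k features with the highest scores, so it ignores how the features relate to one another. RFE, on the other hand, fits a model on all of the features and recursively eliminates the weakest one (for a linear model, the feature with the smallest coefficient magnitude) until only the requested number remain, so each feature is judged in the presence of the others. Because of this, the two selections can agree or disagree depending on how many features are requested; when every feature is requested, both methods necessarily return the full set.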

#### 2)
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [175]:
def select_kbest(X, y, k):
    """Return the names of the k predictors chosen by SelectKBest.

    A SelectKBest selector using the f_regression scoring function is fitted
    on X and y, and the labels of the columns it keeps are returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; their labels are what gets reported.
    y : array-like
        Target values, passed straight to ``SelectKBest.fit``.
    k : int
        Number of features to keep (forwarded as SelectKBest's ``k``).

    Returns
    -------
    pandas.DataFrame
        A single column (labelled 0) listing the selected column names in
        the order they appear in X.
    """
    selector = SelectKBest(score_func=f_regression, k=k).fit(X, y)

    # get_support() gives one boolean per column of X: True means "kept".
    chosen = X.columns[selector.get_support()]

    return pd.DataFrame(chosen.tolist())

In [176]:
# Run select_kbest on the tips predictors with k=2; this should match the manual SelectKBest result above.
select_kbest(X, y, k=2)

Unnamed: 0,0
0,total_bill
1,size


#### 3) 
Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [177]:
def rfe(X, y, n):
    """Return the names of the n predictors kept by recursive feature elimination.

    An RFE selector wrapping a fresh LinearRegression estimator is fitted on
    X and y, and the labels of the columns it keeps are returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; their labels are what gets reported.
    y : array-like
        Target values, passed straight to ``RFE.fit``.
    n : int
        Number of features to keep (forwarded as ``n_features_to_select``).

    Returns
    -------
    pandas.DataFrame
        A single column (labelled 0) listing the selected column names in
        the order they appear in X.
    """
    # The local is called `selector` so it does not shadow this function's own name.
    selector = RFE(estimator=LinearRegression(), n_features_to_select=n)
    selector.fit(X, y)

    # get_support() gives one boolean per column of X: True means "kept".
    keep = selector.get_support()

    return pd.DataFrame(X.columns[keep])

In [178]:
# Run the rfe helper on the tips predictors with n=2; this should match the manual RFE result above.
rfe(X, y, n=2)

Unnamed: 0,0
0,total_bill
1,price_per_person


#### 4)
  Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [179]:
# Fetch the 'swiss' sample dataset through pydataset's `data` loader.
# NOTE: this rebinds `df`, so the tips frame is no longer reachable under that name.
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


#### SelectKBest

In [180]:
# Predictors: every swiss column except the target 'Fertility'.
X=df.drop(columns=['Fertility'])

# Target: the 'Fertility' column.
y=df['Fertility']

In [181]:
# Top 3 swiss predictors of Fertility according to SelectKBest.
select_kbest(X, y, k=3)

Unnamed: 0,0
0,Examination
1,Education
2,Catholic


#### Recursive Feature Elimination

In [182]:
# Same predictor/target split as in the SelectKBest section, repeated here
# (presumably so the RFE section can be run without the cells above it).
# NOTE(review): unlike the tips predictors, these columns are not min-max scaled.
# RFE with LinearRegression ranks features by coefficient size, which depends on
# each column's units - consider scaling before comparing. TODO confirm.
X=df.drop(columns=['Fertility'])

# Target: the 'Fertility' column.
y=df['Fertility']

In [183]:
# Top 3 swiss predictors of Fertility according to the rfe helper (n=3).
rfe(X, y, 3)

Unnamed: 0,0
0,Examination
1,Education
2,Infant.Mortality
