## Feature Engineering

In [1]:
#tabular data imports :
import pandas as pd
import numpy as np
import env
from env import user, password, host
from pydataset import data

# visualization imports:
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

from scipy.stats import pearsonr, spearmanr
from scipy.stats import shapiro

import warnings
warnings.filterwarnings("ignore")
import wrangle as w
import explore as e
import os
directory = os.getcwd()

#### 1. Load the `tips` dataset.
- a. Create a column named `price_per_person`. This should be the total bill divided by the party size.

In [2]:
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [4]:
# Per-person share of the bill: total bill divided by the party size.
tips['price_per_person'] = tips['total_bill'].div(tips['size'])

In [5]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


### - b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
### I think the most important features are `total_bill`, `size`, and the new `price_per_person` column.

In [6]:
train, validate, test = w.split_data(tips)


    train -> (146, 8)
    validate -> (49, 8)
    test -> (49, 8)


In [7]:
# Columns whose min and max the scaler should learn.
tips_features = ['total_bill', 'price_per_person', 'size']

# Build the min-max scaler, then fit it on the training split only
# (validate and test are transformed later with these learned ranges).
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(train[tips_features])

In [8]:
# Apply the already-fitted scaler to the same three columns of every split.
tips_features = ['total_bill', 'price_per_person', 'size']
train_scaled, validate_scaled, test_scaled = (
    scaler.transform(split[tips_features]) for split in (train, validate, test)
)

In [9]:
# Attach the scaled values as new columns on each split (train, validate and test).
tips_scaled_cols = ['total_bill_scaled', 'price_per_person_scaled', 'size_scaled']

train[tips_scaled_cols] = train_scaled
validate[tips_scaled_cols] = validate_scaled
test[tips_scaled_cols] = test_scaled

In [10]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,total_bill_scaled,price_per_person_scaled,size_scaled
37,16.31,2.0,Male,No,Sat,Dinner,3,5.436667,0.29292,0.177401,0.4
218,11.59,1.5,Male,Yes,Sat,Dinner,2,5.795,0.188496,0.202216,0.2
64,18.29,3.76,Male,Yes,Sat,Dinner,4,4.5725,0.336726,0.117555,0.6
29,21.7,4.3,Male,No,Sat,Dinner,2,10.85,0.412168,0.552285,0.2
14,18.43,3.0,Male,No,Sun,Dinner,4,4.6075,0.339823,0.119979,0.6


In [11]:
# Predictors are the scaled columns; the target is the raw `tip` column.
tips_scaled_cols = ['total_bill_scaled', 'price_per_person_scaled', 'size_scaled']

X_train, X_validate, X_test = train[tips_scaled_cols], validate[tips_scaled_cols], test[tips_scaled_cols]
y_train, y_validate, y_test = train['tip'], validate['tip'], test['tip']

## Select K Best
### - c. Use Select K Best to select the top 2 features for predicting tip amount. What are they?

In [12]:
# Make the selector: score every feature with f_regression and keep the best 2.
kbest = SelectKBest(score_func=f_regression, k=2)

# Fit it on the training predictors and target.
kbest.fit(X_train, y_train)

In [13]:
# statistical f-value / feature's scores:
kbest.scores_

array([91.43963955, 25.67556929, 29.299741  ])

In [14]:
# p value: 
kbest.pvalues_

array([4.46252410e-17, 1.21961941e-06, 2.52960400e-07])

In [15]:
kbest.feature_names_in_

array(['total_bill_scaled', 'price_per_person_scaled', 'size_scaled'],
      dtype=object)

In [16]:
# One row per feature: its p-value and F-score from the fitted selector.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill_scaled,4.4625240000000004e-17,91.43964
price_per_person_scaled,1.219619e-06,25.675569
size_scaled,2.529604e-07,29.299741


In [17]:
# we can apply this mask to the columns in our original dataframe
X_train.columns[kbest.get_support()]

Index(['total_bill_scaled', 'size_scaled'], dtype='object')

### Takeaways:
- `total_bill` is the best feature, with by far the highest F-score (about 91.4).
- `size` is the second-best feature (F-score about 29.3), narrowly ahead of `price_per_person` (about 25.7).

## RFE
### - d. Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?

In [18]:
# RFE needs an estimator to judge feature importance; a plain linear
# regression is used here, and RFE recursively drops features until only
# the requested number remains.
model = LinearRegression()

# Make the selector, asking it to keep 2 features.
# NOTE: the name `rfe` is rebound to a function by the `def rfe(...)` cell
# further down, so this cell and the two after it must run before that one.
rfe = RFE(estimator=model, n_features_to_select=2)

# Fit it on the training predictors and target.
rfe.fit(X_train, y_train)

In [19]:
# Get feature ranking
# Selected features are assigned a rank 1

rfe.ranking_

array([1, 2, 1])

In [20]:
# Label each RFE rank with the feature it belongs to.
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=X_train.columns)

Unnamed: 0,rfe_ranking
total_bill_scaled,1
price_per_person_scaled,2
size_scaled,1


### Takeaways:
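- `total_bill` and `size` are the two selected features: both receive an RFE ranking of 1 (rank 1 marks a selected feature; it is not a score), while `price_per_person` is ranked 2 and was eliminated.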


### - e. Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? 
#### Does this change as you change the number of features you are selecting?
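  - They can differ because the two methods work differently behind the scenes. Select K Best scores each feature on its own against the target (here with the `f_regression` F-test), whereas RFE fits a model on all the features together and repeatedly drops the weakest one, so it accounts for how the features behave in combination. Whether they agree is not fixed: on `tips` with 2 features both methods pick the same pair, while on `swiss` with 3 features (question 4 below) they pick different sets.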

#### 2. Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (`k`) and returns the names of the top `k` selected features based on the `SelectKBest` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [21]:
def select_kbest(X, y, k):
    '''
    Return the names of the k features that SelectKBest scores highest.

    Every column of X is scored against y with f_regression and the k
    best-scoring columns are kept.

    X: pd.DataFrame; predictor columns (their names are what gets returned)
    y: pd.Series; target values, one per row of X
    k: int; number of features to return

    Returns a list of the column names that are the k best features,
    in the order those columns appear in X
    '''
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)

    # get_support() is a boolean mask over X's columns; index the column
    # labels directly instead of slicing the whole frame just for its names.
    return X.columns[selector.get_support()].tolist()

In [22]:
select_kbest(X_train, y_train, 2)

['total_bill_scaled', 'size_scaled']

#### 3. Write a function named `rfe` that takes in the predictors, the target, and the number of features to select. It should return the top `n` features based on the `RFE` class. Test your function with the `tips` dataset. You should see the same results as when you did the process manually.

In [24]:
def rfe(x, y, k):
    '''
    Return the names of the k features kept by Recursive Feature Elimination.

    A LinearRegression estimator is used by RFE to eliminate features until
    only k remain.

    x: pd.DataFrame; predictor columns (their names are what gets returned)
    y: pd.Series; target values, one per row of x
    k: int; number of features to select

    Returns a list of the column names that are the k best features,
    in the order those columns appear in x
    '''
    # Called `selector`, not `rfe`, so the local does not shadow this
    # function's own name.
    selector = RFE(estimator=LinearRegression(), n_features_to_select=k)
    selector.fit(x, y)

    # support_ is a boolean mask over x's columns marking the kept features.
    return x.columns[selector.support_].tolist()

In [25]:
rfe(X_train, y_train, 2)

['total_bill_scaled', 'size_scaled']

#### 4. Load the `swiss` dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [27]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [28]:
swiss.shape

(47, 6)

In [29]:
train, validate, test = w.split_data(swiss)


    train -> (28, 6)
    validate -> (9, 6)
    test -> (10, 6)


In [30]:
# Predictor columns of the swiss data (every column except Fertility).
swiss_features = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']

# Build a fresh min-max scaler and learn each column's min and max
# from the training split only.
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(train[swiss_features])

In [31]:
# Apply the already-fitted scaler to the same predictor columns of every split.
swiss_features = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']
train_scaled, validate_scaled, test_scaled = (
    scaler.transform(split[swiss_features]) for split in (train, validate, test)
)

In [32]:
# Attach the scaled values as new columns on each split (train, validate and test).
swiss_scaled_cols = [
    'Agriculture_scaled',
    'Examination_scaled',
    'Education_scaled',
    'Catholic_scaled',
    'Infant.Mortality_scaled',
]

train[swiss_scaled_cols] = train_scaled
validate[swiss_scaled_cols] = validate_scaled
test[swiss_scaled_cols] = test_scaled

In [33]:
# Predictors are the scaled columns; the target is the raw `Fertility` column.
swiss_scaled_cols = [
    'Agriculture_scaled',
    'Examination_scaled',
    'Education_scaled',
    'Catholic_scaled',
    'Infant.Mortality_scaled',
]

X_train, X_validate, X_test = train[swiss_scaled_cols], validate[swiss_scaled_cols], test[swiss_scaled_cols]
y_train, y_validate, y_test = train['Fertility'], validate['Fertility'], test['Fertility']

In [34]:
select_kbest(X_train, y_train, 3)

['Examination_scaled', 'Education_scaled', 'Catholic_scaled']

In [35]:
rfe(X_train, y_train, 3)

['Examination_scaled', 'Education_scaled', 'Infant.Mortality_scaled']