# Feature Engineering Exercises

In [64]:
import pandas as pd
import numpy as np
import wrangle
import warnings
from pydataset import data
warnings.filterwarnings("ignore")
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

#### 1. Load the `tips` dataset 
    a. Create a column named price_per_person. This should be the total bill divided by the party size. \
    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? \
    c. Use select k best to select the top 2 features for predicting tip amount. What are they? \
    d. Use recursive feature elimination to select the top 2 features for tip amount. What are they? \
    e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [3]:
# Load the `tips` dataset through pydataset's `data` helper and display it.
tips = data('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


a. Create a column named price_per_person. This should be the total bill divided by the party size. 

In [17]:
# Average spend per diner: the bill total split evenly across the party.
tips = tips.assign(price_per_person=tips["total_bill"] / tips["size"])

In [35]:
# Per-column value mappings that turn the two binary text columns into 0/1 flags.
cleanup_nums = {
    "sex": {"Male": 1, "Female": 0},
    "smoker": {"Yes": 1, "No": 0},
}

In [36]:
# Apply the per-column 0/1 recoding defined in `cleanup_nums` (`sex` and `smoker`).
tips = tips.replace(cleanup_nums)

In [72]:
# One-hot encode the remaining categorical columns. `size` is encoded as a
# category as well, so every party size gets its own indicator column.
# NOTE: this overwrites `tips` and consumes `day`, `time`, and `size`, so the
# cell is not re-runnable on its own -- re-run the cells above first.
tips = pd.get_dummies(tips, columns=['day','time','size'])

In [73]:
# Preview the first rows of the encoded frame.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,price_per_person,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
1,16.99,1.01,0,0,8.495,0,0,1,0,1,0,0,1,0,0,0,0
2,10.34,1.66,1,0,3.446667,0,0,1,0,1,0,0,0,1,0,0,0
3,21.01,3.5,1,0,7.003333,0,0,1,0,1,0,0,0,1,0,0,0
4,23.68,3.31,1,0,11.84,0,0,1,0,1,0,0,1,0,0,0,0
5,24.59,3.61,0,0,6.1475,0,0,1,0,1,0,0,0,0,1,0,0


b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? 

I would predict `total_bill`, `sex`, and `size` would be most important for predicting the tip amount.

c. Use select k best to select the top 2 features for predicting tip amount. What are they? 

In [34]:
# Row count for each distinct value of `smoker`.
# NOTE(review): the saved output still shows "No"/"Yes" labels because this cell
# was executed (In [34]) before the 0/1 recode (In [36]); on a fresh
# top-to-bottom run the labels will be 0 and 1.
tips["smoker"].value_counts()

No     151
Yes     93
Name: smoker, dtype: int64

In [19]:
# (rows, columns) of the working frame.
# NOTE(review): the saved output (244, 8) was captured (In [19]) before the
# one-hot encoding cell ran; the encoded frame previewed above has 17 columns.
tips.shape

(244, 8)

In [74]:
# Target is the tip amount; every other column is a candidate predictor.
# (`columns=` already names the axis, so no separate `axis` argument is needed.)
y_train = tips["tip"]
x_train = tips.drop(columns="tip")

In [75]:
x_train.head()

Unnamed: 0,total_bill,sex,smoker,price_per_person,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size_1,size_2,size_3,size_4,size_5,size_6
1,16.99,0,0,8.495,0,0,1,0,1,0,0,1,0,0,0,0
2,10.34,1,0,3.446667,0,0,1,0,1,0,0,0,1,0,0,0
3,21.01,1,0,7.003333,0,0,1,0,1,0,0,0,1,0,0,0
4,23.68,1,0,11.84,0,0,1,0,1,0,0,1,0,0,0,0
5,24.59,0,0,6.1475,0,0,1,0,1,0,0,0,0,1,0,0


In [76]:
# Score each predictor against the target with a univariate F-test and keep
# the two highest-scoring ones. `fit` returns the selector itself, so the
# construction and fit can be chained.
kbest = SelectKBest(score_func=f_regression, k=2).fit(x_train, y_train)
kbest

SelectKBest(k=2, score_func=<function f_regression at 0x7fa3a4ad59d0>)

In [77]:
# Tabulate the p-value and F-statistic SelectKBest computed for every predictor,
# labelled by column name.
kbest_results = pd.DataFrame(
    {"p": kbest.pvalues_, "f": kbest.scores_},
    index=x_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,6.692471e-34,203.357723
sex,0.1664562,1.926155
smoker,0.9265932,0.008506
price_per_person,2.502102e-08,33.213257
day_Fri,0.38837,0.746727
day_Sat,0.9654161,0.001884
day_Sun,0.05094012,3.84839
day_Thur,0.135324,2.245302
time_Dinner,0.05780153,3.633815
time_Lunch,0.05780153,3.633815


In [78]:
# `transform` returns a bare array containing only the selected columns, so
# re-attach the original row index and the names of the columns that were kept.
x_train_transformed = pd.DataFrame(
    data=kbest.transform(x_train),
    columns=x_train.columns[kbest.get_support()],
    index=x_train.index,
)
x_train_transformed.head()

Unnamed: 0,total_bill,size_2
1,16.99,1.0
2,10.34,0.0
3,21.01,0.0
4,23.68,1.0
5,24.59,0.0


The top 2 features selected by KBest are:
- `total_bill`
- `size_2`

d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [79]:
# Recursive feature elimination: repeatedly fit a linear regression and discard
# the weakest predictor until only two remain.
# NOTE(review): RFE judges predictors by the fitted coefficients, and these
# columns are on different scales (dollar amounts vs 0/1 indicators) -- consider
# scaling before comparing this result with SelectKBest.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2).fit(x_train, y_train)
rfe

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [80]:
# RFE rank of every predictor, labelled by column name (rank 1 = selected).
pd.Series(rfe.ranking_, index=x_train.columns, name="rfe_ranking").to_frame()

Unnamed: 0,rfe_ranking
total_bill,4
sex,13
smoker,8
price_per_person,6
day_Fri,5
day_Sat,11
day_Sun,3
day_Thur,10
time_Dinner,9
time_Lunch,12


In [81]:
# Keep only the columns RFE selected, restoring the row index and column names
# that `transform` drops. This rebinds `x_train_transformed`, replacing the
# SelectKBest version built earlier.
x_train_transformed = pd.DataFrame(
    data=rfe.transform(x_train),
    columns=x_train.columns[rfe.support_],
    index=x_train.index,
)
x_train_transformed.head()

Unnamed: 0,size_2,size_6
1,1.0,0.0
2,0.0,0.0
3,0.0,0.0
4,1.0,0.0
5,0.0,0.0


The top 2 features selected by RFE are:
- `size_2`
- `size_6`

e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

#### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. 

##### Creating `for` loops that iterate through values of k for kbest and rfe

In [None]:
def kbest_loop(x_train, y_train, max_k=None):
    """Run SelectKBest for k = 1..max_k and summarise what each k selects.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Numeric predictor columns.
    y_train : array-like
        Target values aligned with the rows of ``x_train``.
    max_k : int, optional
        Largest number of features to select. Defaults to the number of
        columns in ``x_train``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``k``, with one row per value of ``k`` and the columns
        ``features`` (names of the selected columns), ``p_value`` and
        ``f_value`` (the selected columns' p-values and F-statistics, in the
        same order as ``features``).

    Raises
    ------
    ValueError
        If ``max_k`` is not between 1 and the number of columns in ``x_train``.
    """
    n_features = x_train.shape[1]
    if max_k is None:
        max_k = n_features
    if not 1 <= max_k <= n_features:
        raise ValueError(
            f"max_k must be between 1 and {n_features}, got {max_k}"
        )

    records = []
    for k in range(1, max_k + 1):
        selector = SelectKBest(f_regression, k=k).fit(x_train, y_train)
        mask = selector.get_support()
        records.append(
            {
                "k": k,
                "features": list(x_train.columns[mask]),
                "p_value": list(selector.pvalues_[mask]),
                "f_value": list(selector.scores_[mask]),
            }
        )

    # Build the frame once from the collected records rather than growing it.
    return pd.DataFrame(records).set_index("k")

In [None]:
# Build one unfitted SelectKBest selector for each k from 1 through 9.
feature_sets_to_model = []

for i in range(1, 10):
    # Create kbest object
    feature_set = SelectKBest(f_regression, k=i)
    feature_sets_to_model.append(feature_set)
        
        
        

In [None]:
# Fit every selector in `feature_sets_to_model` and record, per k, the
# p-values and F-statistics of the features that selector keeps.
eval_dict = {}

for feature_set in feature_sets_to_model:
    # Fit this loop's selector (not the earlier standalone k=2 `kbest`).
    feature_set.fit(x_train, y_train)

    # Scores for every candidate predictor, labelled by column name.
    scores = pd.DataFrame(
        dict(p=feature_set.pvalues_, f=feature_set.scores_),
        index=x_train.columns,
    )

    # Keep only the rows for the features this selector chose.
    selected = scores[feature_set.get_support()]

    # Read k from the selector itself instead of a leftover loop counter.
    output = {
        "k": feature_set.k,
        "p_value": selected["p"].to_dict(),
        "f_value": selected["f"].to_dict(),
    }
    eval_dict[feature_set.k] = output

eval_dict

In [None]:
# Rebind `feature_sets_to_model` to a fresh empty list.
feature_sets_to_model = []