# Feature_Engineering Exercises
Do your work for this exercise in a jupyter notebook named ```feature_engineering``` within the ```regression-exercises``` repo. Add, commit, and push your work.

In [70]:
# standardized modules
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Decision Tree, Model & Feature Evaluation Imports
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

#my modules
import QMCBT_wrangle as w
import QMCBT_explore_evaluate as ee
#import QMCBT_[00]quick_tips as tips
#import QMCBT_[01]acquire as acquire
#import QMCBT_[02]prepare as prepare
#import QMCBT_[03]explore as explore
#import QMCBT_[04]evaluate as evaluate

# 1. Load the tips dataset.

In [2]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [3]:
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   party_size  244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


<div class="alert alert-warning">


## Check WhiteSpace
    
</div>

In [33]:
# Return (row count)
row_count = df.shape[0]
row_count

244

In [35]:
# creates list of columns
column_list = df.columns
column_list

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'party_size',
       'price_per_person'],
      dtype='object')

In [36]:
# Count the rows that DataFrame.value_counts() keeps. By default (dropna=True) it
# leaves out any row that contains a missing value (NaN/None).
# NOTE(review): a whitespace-only string is a real value and is still counted here --
# confirm whether a separate check for blank strings is needed.
row_value_count = df[column_list].value_counts().sum()
row_value_count

244

In [37]:
# total rows minus the rows counted by value_counts() = rows that value_counts() dropped
# (i.e. rows with a missing value; whitespace-only strings are not detected this way)
whitespace_count = row_count - row_value_count
whitespace_count

0

<div class="alert alert-warning">


## Convert Categorical
* sex
* smoker
* day
* time
    
</div>

In [38]:
df.sex.unique()

['Female', 'Male']
Categories (2, object): ['Male', 'Female']

In [39]:
df.smoker.unique()

['No', 'Yes']
Categories (2, object): ['Yes', 'No']

In [40]:
df.day.unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [41]:
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [42]:
# Encode the two-level categorical columns as 0/1 flags by renaming their categories.
# Using Series.replace to change the categories of a categorical column is deprecated
# in recent pandas; cat.rename_categories is the supported way and yields the same
# categories. Keys that are not current categories are ignored, so re-running is safe
# while the columns are still categorical.
df.sex = df.sex.cat.rename_categories({'Male': 1, 'Female': 0})
df.smoker = df.smoker.cat.rename_categories({'Yes': 1, 'No': 0})
df.time = df.time.cat.rename_categories({'Dinner': 1, 'Lunch': 0})

In [43]:
df.head().T

Unnamed: 0,0,1,2,3,4
total_bill,16.99,10.34,21.01,23.68,24.59
tip,1.01,1.66,3.5,3.31,3.61
sex,0,1,1,1,0
smoker,0,0,0,0,0
day,Sun,Sun,Sun,Sun,Sun
time,1,1,1,1,1
party_size,2,3,3,2,4
price_per_person,8.49,3.45,7.0,11.84,6.15


<div class="alert alert-warning">

    
## Get Dummies

</div>

In [44]:
dummy_df = pd.get_dummies(data=df[['day']], drop_first=False)
dummy_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,234,235,236,237,238,239,240,241,242,243
day_Thur,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
day_Fri,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
day_Sat,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
day_Sun,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [45]:
df = pd.concat([df, dummy_df], axis=1)

<div class="alert alert-warning">

    
## Rename Columns

</div>

In [46]:
df = df.rename(columns={"size": "party_size", "sex": "male_female", "time": "dinner_lunch"})

<div class="alert alert-warning">

    
## Drop Columns

</div>

In [48]:
df = df.drop(columns=['day'])

In [49]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,234,235,236,237,238,239,240,241,242,243
total_bill,16.99,10.34,21.01,23.68,24.59,25.29,8.77,26.88,15.04,14.78,...,15.53,10.07,12.6,32.83,35.83,29.03,27.18,22.67,17.82,18.78
tip,1.01,1.66,3.5,3.31,3.61,4.71,2.0,3.12,1.96,3.23,...,3.0,1.25,1.0,1.17,4.67,5.92,2.0,2.0,1.75,3.0
male_female,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
smoker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
dinner_lunch,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
party_size,2.0,3.0,3.0,2.0,4.0,4.0,2.0,4.0,2.0,2.0,...,2.0,2.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0,2.0
price_per_person,8.49,3.45,7.0,11.84,6.15,6.32,4.38,6.72,7.52,7.39,...,7.76,5.04,6.3,16.42,11.94,9.68,13.59,11.34,8.91,9.39
day_Thur,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
day_Fri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
day_Sat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   total_bill        244 non-null    float64 
 1   tip               244 non-null    float64 
 2   male_female       244 non-null    category
 3   smoker            244 non-null    category
 4   dinner_lunch      244 non-null    category
 5   party_size        244 non-null    int64   
 6   price_per_person  244 non-null    float64 
 7   day_Thur          244 non-null    uint8   
 8   day_Fri           244 non-null    uint8   
 9   day_Sat           244 non-null    uint8   
 10  day_Sun           244 non-null    uint8   
dtypes: category(3), float64(3), int64(1), uint8(4)
memory usage: 9.8 KB


<div class="alert alert-warning">

    
## Change dtype

</div>

In [52]:
df = df.astype({"male_female": int, "smoker": int, "dinner_lunch": int})

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   male_female       244 non-null    int64  
 3   smoker            244 non-null    int64  
 4   dinner_lunch      244 non-null    int64  
 5   party_size        244 non-null    int64  
 6   price_per_person  244 non-null    float64
 7   day_Thur          244 non-null    uint8  
 8   day_Fri           244 non-null    uint8  
 9   day_Sat           244 non-null    uint8  
 10  day_Sun           244 non-null    uint8  
dtypes: float64(3), int64(4), uint8(4)
memory usage: 14.4 KB


* ## a. Create a column named ```price_per_person```. This should be the ```total bill``` divided by the ```party size```.

#### Feature Engineer

In [6]:
# Engineer price_per_person: each diner's share of the bill, rounded to the cent
df['price_per_person'] = df.total_bill.div(df.party_size).round(2)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91


* ## b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the ```tip amount```?

<div class="alert alert-info">


### ANSWER:
* ### sex and party_size

</div>

* ## c. Use ```select k best``` to select the top 2 features for predicting ```tip amount```. What are they?

In [54]:
mycols = ['col1', 'col2', 'col3', 'col4']

In [55]:
######################### SPLIT DATA #########################

def split(df, test_size=.2, validate_size=.25, random_state=1992):
    """
    Split a DataFrame into train, validate, and test subsets and print a mini report.

    The report shows a layout reminder, a short list of modelling steps, and the
    shape of the original DataFrame next to the shape and share of each subset.

    Parameters
    ----------
    df : pandas.DataFrame
        The prepared DataFrame to split.
    test_size : float, default .2
        Share of ``df`` held out as the test set.
    validate_size : float, default .25
        Share of the remaining (non-test) rows held out as the validate set.
        With the defaults this is .25 * .8 = .2 of ``df``.
    random_state : int, default 1992
        Seed passed to both ``train_test_split`` calls so the split is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``, each re-indexed from 0.
    """

    # First carve the test set off the full DataFrame
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    # Then carve the validate set off what is left
    # Do NOT stratify on continuous data
    train, validate = train_test_split(train, test_size=validate_size, random_state=random_state)

    # Give each subset a clean 0..n-1 index (reassign rather than mutate in place)
    train = train.reset_index(drop=True)
    validate = validate.reset_index(drop=True)
    test = test.reset_index(drop=True)

    print('_______________________________________________________________')
    print('|                              DF                             |')
    print('|-------------------:-------------------:---------------------|')
    print('|       Train       |       Validate    |          Test       |')
    print('|-------------------:-------------------:---------------------|')
    print('| x_train | y_train |   x_val  |  y_val |   x_test  |  y_test |')
    print(':-------------------------------------------------------------:')
    print('')
    print('* 1. tree_1 = DecisionTreeClassifier(max_depth = 5)')
    print('* 2. tree_1.fit(x_train, y_train)')
    print('* 3. predictions = tree_1.predict(x_train)')
    print('* 4. pd.crosstab(y_train, y_preds)')
    print('* 5. val_predictions = tree_1.predict(x_val)')
    print('* 6. pd.crosstab(y_val, y_preds)')

    # Shares are formatted as real percentages (e.g. 60%); the previous report
    # printed the raw fraction followed by a percent sign (0.6%).
    total_rows = df.shape[0]
    print()
    print(f'Prepared df: {df.shape}')
    print()
    print(f'      Train: {train.shape} - {train.shape[0] / total_rows:.0%}')
    print(f'   Validate: {validate.shape} - {validate.shape[0] / total_rows:.0%}')
    print(f'       Test: {test.shape} - {test.shape[0] / total_rows:.0%}')

    return train, validate, test

In [56]:
train, validate, test = split(df)

_______________________________________________________________
|                              DF                             |
|-------------------:-------------------:---------------------|
|       Train       |       Validate    |          Test       |
|-------------------:-------------------:---------------------|
| x_train | y_train |   x_val  |  y_val |   x_test  |  y_test |
:-------------------------------------------------------------:

* 1. tree_1 = DecisionTreeClassifier(max_depth = 5)
* 2. tree_1.fit(x_train, y_train)
* 3. predictions = tree_1.predict(x_train)
* 4. pd.crosstab(y_train, y_preds)
* 5. val_predictions = tree_1.predict(x_val)
* 6. pd.crosstab(y_val, y_preds)

Prepared df: (244, 11)

      Train: (146, 11) - 0.6%
   Validate: (49, 11) - 0.2%
       Test: (49, 11) - 0.2%


In [57]:
train.head().T

Unnamed: 0,0,1,2,3,4
total_bill,23.33,16.32,18.64,14.15,15.42
tip,5.65,4.3,1.36,2.0,1.57
male_female,1.0,0.0,0.0,0.0,1.0
smoker,1.0,1.0,0.0,0.0,0.0
dinner_lunch,1.0,1.0,0.0,0.0,1.0
party_size,2.0,2.0,3.0,2.0,2.0
price_per_person,11.66,8.16,6.21,7.08,7.71
day_Thur,0.0,0.0,1.0,1.0,0.0
day_Fri,0.0,1.0,0.0,0.0,0.0
day_Sat,0.0,0.0,0.0,0.0,0.0


In [59]:
cols = train.columns
cols

Index(['total_bill', 'tip', 'male_female', 'smoker', 'dinner_lunch',
       'party_size', 'price_per_person', 'day_Thur', 'day_Fri', 'day_Sat',
       'day_Sun'],
      dtype='object')

In [60]:
# Name the target and the predictor columns once so train, validate, and test
# are always sliced with the same features in the same order.
target_col = 'tip'
feature_cols = ['total_bill',
                'male_female',
                'smoker',
                'dinner_lunch',
                'party_size',
                'price_per_person',
                'day_Thur',
                'day_Fri',
                'day_Sat',
                'day_Sun']

X_train, y_train = train[feature_cols], train[target_col]

X_validate, y_validate = validate[feature_cols], validate[target_col]

X_test, y_test = test[feature_cols], test[target_col]

<div class="alert alert-warning">


## Select K Best

- looks at each feature in isolation against the target based on correlation
- fastest of all approaches covered in this lesson
- doesn't consider feature interactions
- After fitting: `.scores_`, `.pvalues_`, `.get_support()`, and `.transform()`

</div>

In [110]:
# make the thing
kbest = SelectKBest(f_regression, k=2)
# fit the thing
_ = kbest.fit(X_train, y_train)

In [111]:
# F-statistic for each feature (a bare expression that is not the last line of a cell is not displayed)
kbest.scores_
# p-value for each feature -- the only result this cell displays
kbest.pvalues_

array([1.50592117e-16, 1.94705239e-01, 4.08889909e-01, 1.30330476e-01,
       4.09415406e-13, 1.65905928e-02, 2.08021754e-01, 9.55202509e-01,
       3.09523547e-02, 9.80962486e-04])

In [112]:
kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

In [113]:
kbest_results

Unnamed: 0,p,f
total_bill,1.505921e-16,87.537608
male_female,0.1947052,1.697421
smoker,0.4088899,0.686026
dinner_lunch,0.1303305,2.314923
party_size,4.094154e-13,63.743687
price_per_person,0.01659059,5.875783
day_Thur,0.2080218,1.599467
day_Fri,0.9552025,0.003167
day_Sat,0.03095235,4.748361
day_Sun,0.0009809625,11.323902


In [114]:
# get_support() returns a boolean mask marking which features were selected;
# indexing the original column Index with that mask gives the selected feature names
X_train.columns[kbest.get_support()]

Index(['total_bill', 'party_size'], dtype='object')

In [115]:
# transform() keeps only the selected feature columns,
# but it returns a plain NumPy array, so the column names are lost
kbest.transform(X_train)[:5]

array([[23.33,  2.  ],
       [16.32,  2.  ],
       [18.64,  3.  ],
       [14.15,  2.  ],
       [15.42,  2.  ]])

In [116]:
X_train_transformed = pd.DataFrame(kbest.transform(X_train),
                                   columns=X_train.columns[kbest.get_support()],
                                   index=X_train.index)

In [117]:
X_train_transformed.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,136,137,138,139,140,141,142,143,144,145
total_bill,23.33,16.32,18.64,14.15,15.42,13.81,32.83,30.4,25.29,12.76,...,48.17,20.65,38.07,31.71,23.68,8.51,14.83,10.33,21.01,16.45
party_size,2.0,2.0,3.0,2.0,2.0,2.0,2.0,4.0,4.0,2.0,...,6.0,3.0,3.0,4.0,2.0,2.0,2.0,3.0,2.0,2.0


<div class="alert alert-info">


## ANSWER:
* total_bill & party_size
* Consider engineering features for tip_percent and tip_per_person

</div>

* ## d. Use ```recursive feature elimination``` to select the top 2 features for ```tip amount```. What are they?

<div class="alert alert-warning">


## RFE

- Recursive Feature Elimination
- Progressively eliminate features based on importance to the model
- Requires a model with either a `.coef_` or `.feature_importances_` property
- After fitting: `.ranking_`, `.get_support()`, and `.transform()`

</div>

In [118]:
# make a model object to use in RFE process.
# The model is here to give us metrics on feature importance and model score
# allowing us to recursively reduce the number of features to reach our desired space
model = LinearRegression()

In [119]:
# make thing
rfe = RFE(model, n_features_to_select=2)
# fit thing
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [120]:
rfe.ranking_

array([7, 6, 5, 1, 1, 4, 9, 3, 2, 8])

In [121]:
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index = X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,7
male_female,6
smoker,5
dinner_lunch,1
party_size,1
price_per_person,4
day_Thur,9
day_Fri,3
day_Sat,2
day_Sun,8


In [122]:
rfe.get_support()

array([False, False, False,  True,  True, False, False, False, False,
       False])

In [123]:
X_train_transformed = pd.DataFrame(rfe.transform(X_train),
                                   index = X_train.index,
                                   columns = X_train.columns[rfe.support_])

In [124]:
X_train_transformed.head()

Unnamed: 0,dinner_lunch,party_size
0,1.0,2.0
1,1.0,2.0
2,0.0,3.0
3,0.0,2.0
4,1.0,2.0


<div class="alert alert-info">


## ANSWER:
* dinner_lunch & party_size

</div>

* ## e. Why do you think ```select k best``` and ```recursive feature elimination``` might give different answers for the top features? Does this change as you change the number of features you are selecting?

<div class="alert alert-info">


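# ANSWER:
* `SelectKBest` scores each feature on its own with a univariate F-test against the target, so it favors the features that are individually most correlated with `tip` (`total_bill`, `party_size`).
* `RFE` fits a `LinearRegression` on all features together and repeatedly drops the feature with the smallest coefficient magnitude. Because the features are unscaled, a 0/1 flag such as `dinner_lunch` can carry a larger coefficient than a dollar-valued column such as `total_bill`, and correlated features share importance, so its ranking differs.
* Yes — the selections can change with the number of features requested, and the two methods tend to overlap more as `k` grows.

</div>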

# 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

# 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

# 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).