In [57]:
import gen_wrangle as wrg
import scipy.stats as stats
import pandas as pd
import os
import numpy as np

# Data viz:
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn stuff:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression

import env

In [58]:
# NOTE(review): this import lives outside the notebook's main import cell;
# consider moving it to the top with the other imports.
from pydataset import data
# Load the "tips" example dataset (one row per restaurant bill).
tips = data("tips")


In [59]:
# Preview the first rows of the raw tips data.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [60]:
#size and total_bill

In [61]:
# Engineered feature: share of the bill attributable to each diner.
tips["price_per_person"] = tips["total_bill"].div(tips["size"])

In [62]:
# One-hot encode the categorical day/time columns, dropping the first level of
# each so the dummies are not perfectly collinear.
# dtype is pinned explicitly: pandas >= 2.0 changed the default dummy dtype
# from uint8 to bool, and the rest of the notebook expects 0/1 integer columns.
tips_dummy = pd.get_dummies(tips[["day", "time"]], drop_first=True, dtype="uint8")


In [63]:
# Attach the dummy columns to the main frame.
# Dummy columns that are already present are dropped first, so re-running this
# cell does not append duplicate columns to `tips`.
# TODO: make a function to dummy everything
tips = pd.concat(
    [tips.drop(columns=tips_dummy.columns, errors="ignore"), tips_dummy],
    axis=1,
)

In [64]:
# Encode the two binary categoricals as 0/1 integers.
tips["smoker_encoded"] = tips["smoker"].map({"No": 0, "Yes": 1})
tips["sex_encoded"] = tips["sex"].map({"Male": 1, "Female": 0})

In [65]:
# Split the prepared frame into three subsets using the project helper.
# NOTE(review): split proportions and any random seed are set inside
# gen_wrangle.split_function_cont_target and are not visible here — confirm
# the split is seeded so the results below are reproducible.
train_tips, validate_tips, test_tips = wrg.split_function_cont_target(tips)

In [66]:
# Preview the training split, including the engineered and encoded columns.
train_tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,day_Sat,day_Sun,day_Thur,time_Lunch,smoker_encoded,sex_encoded
225,13.42,1.58,Male,Yes,Fri,Lunch,2,6.71,0,0,0,1,1,1
182,23.33,5.65,Male,Yes,Sun,Dinner,2,11.665,0,1,0,0,1,1
103,44.3,2.5,Female,Yes,Sat,Dinner,3,14.766667,1,0,0,0,1,0
165,17.51,3.0,Female,Yes,Sun,Dinner,2,8.755,0,1,0,0,1,0
74,25.28,5.0,Female,Yes,Sat,Dinner,2,12.64,1,0,0,0,1,0


In [67]:
# Feature columns used for modeling, defined once so the train and validate
# selections cannot drift apart.
feature_cols = [
    "total_bill",
    "size",
    "day_Sat",
    "day_Sun",
    "day_Thur",
    "time_Lunch",
    "smoker_encoded",
    "sex_encoded",
    "price_per_person",
]

# Feature matrices for each split.
x_train, x_validate = train_tips[feature_cols], validate_tips[feature_cols]

# Target: the tip amount.
y_train, y_validate = train_tips.tip, validate_tips.tip

In [68]:
# Check the feature-matrix dimensions (rows, columns).
x_train.shape

(146, 9)

In [69]:
#scaling

In [70]:
# function to scale

In [71]:
def scale_data(train, validate, to_scale, test=None):
    """Min-max scale selected columns using statistics learned from ``train`` only.

    Parameters
    ----------
    train : pandas.DataFrame
        Training split; the scaler is fit on ``train[to_scale]``.
    validate : pandas.DataFrame
        Validation split, transformed with the train-fitted scaler.
    to_scale : list of str
        Names of the columns to scale. Other columns are passed through as-is.
    test : pandas.DataFrame, optional
        Test split. When given, it is transformed with the same scaler and
        returned as a third element.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scaled, validate_scaled)`` when ``test`` is None, otherwise
        ``(train_scaled, validate_scaled, test_scaled)``. All returned frames
        are copies; the inputs are not modified.
    """
    # Fit on train only so validate/test statistics never leak into the scaler.
    scaler = MinMaxScaler()
    scaler.fit(train[to_scale])

    def _apply(frame):
        # Work on a copy so the caller's frame is left untouched.
        scaled = frame.copy()
        scaled[to_scale] = scaler.transform(frame[to_scale])
        return scaled

    train_scaled = _apply(train)
    validate_scaled = _apply(validate)

    # Keep the original two-element return for existing callers.
    if test is None:
        return train_scaled, validate_scaled
    return train_scaled, validate_scaled, _apply(test)


In [72]:
# Scale every feature column.
to_scale = list(x_train.columns)

In [73]:
# call the scaling function

In [74]:
# Scale the train/validate features with a scaler fit on train only.
x_train_scaled, x_validate_scaled = scale_data(x_train, x_validate, to_scale)

# Backward-compatible alias: the validate frame was originally bound with a
# capital "X", inconsistent with x_train_scaled. Kept in case other cells use it.
X_validate_scaled = x_validate_scaled

In [75]:
# Preview the scaled training features.
x_train_scaled.head()

Unnamed: 0,total_bill,size,day_Sat,day_Sun,day_Thur,time_Lunch,smoker_encoded,sex_encoded,price_per_person
225,0.228679,0.2,0.0,0.0,0.0,1.0,1.0,1.0,0.211566
182,0.447636,0.2,0.0,1.0,0.0,0.0,1.0,1.0,0.499564
103,0.910959,0.4,1.0,0.0,0.0,0.0,1.0,0.0,0.679841
165,0.319046,0.2,0.0,1.0,0.0,0.0,1.0,0.0,0.330427
74,0.49072,0.2,1.0,0.0,0.0,0.0,1.0,0.0,0.556234


In [76]:
# k best fit feature selection

In [77]:
# Univariate feature selection: keep the 2 features with the highest
# f_regression score against the target.
kbest = SelectKBest(score_func=f_regression, k=2)

# Fit on the scaled training features and the training target.
kbest.fit(X=x_train_scaled, y=y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fb1408f1ca0>)

In [78]:
# Collect each feature's p-value and F-score into a frame indexed by the
# feature (column) names of x_train_scaled.
kbest_results = pd.DataFrame(
    {
        "p_value": kbest.pvalues_,
        "f_score": kbest.scores_,
    },
    index=x_train_scaled.columns,
)

In [79]:
# Rank features from highest to lowest F-score.
kbest_results.sort_values("f_score", ascending=False)


Unnamed: 0,p_value,f_score
total_bill,1.30562e-19,111.115028
size,3.669012e-12,57.607134
price_per_person,0.000285709,13.831878
day_Sun,0.07197303,3.285602
time_Lunch,0.1411336,2.189558
day_Thur,0.3241839,0.978682
sex_encoded,0.4413546,0.596049
smoker_encoded,0.7716981,0.084507
day_Sat,0.8745567,0.025013


In [None]:
#recursive feature elimination 

In [80]:
# Estimator whose fitted coefficients drive the elimination: linear regression.
model = LinearRegression()

# Recursive feature elimination down to the 2 strongest features.
rfe = RFE(estimator=model, n_features_to_select=2)

# Fit on the scaled training features and the training target.
rfe.fit(X=x_train_scaled, y=y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [81]:
# Tabulate the RFE rank of every feature (1 = selected), indexed by feature name.
rfe_ranking = pd.DataFrame(
    data=rfe.ranking_,
    index=x_train_scaled.columns,
    columns=["rfe_ranking"],
)
# Show the best-ranked features first.
rfe_ranking.sort_values("rfe_ranking")

Unnamed: 0,rfe_ranking
total_bill,1
price_per_person,1
day_Sat,2
day_Thur,3
sex_encoded,4
time_Lunch,5
size,6
smoker_encoded,7
day_Sun,8
