In [40]:
import numpy as np
import pandas as pd
import wrangle
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from pydataset import data

In [2]:
df = pd.read_csv('tips.csv')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [4]:
# 1a. Add price_per_person: total bill divided by party size, to 2 decimals.
# 'size' is read with brackets because df.size is a built-in DataFrame attribute.
df['price_per_person'] = df['total_bill'].div(df['size']).round(2)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15


In [5]:
# 1b. The four object-dtype columns (sex, smoker, day, time) must be numeric
# before feature selection, so one-hot encode them into indicator columns
# named <prefix>_<category>.
dummy_df = pd.get_dummies(df[['sex','smoker','day','time']],prefix=['sex','smoker','day','time'])

In [6]:
df = pd.concat([df,dummy_df],axis=1)

In [8]:
df.head()
#df.info()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,0,1,1,0,0,0,1,0,1,0
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,1,0,1,0,0,0,1,0,1,0


In [11]:
# Rebind instead of mutating in place, so the frame shown by earlier cells is
# not silently changed underneath them.
df = df.drop(columns=[
    'sex', 'smoker', 'day', 'time',                      # raw categoricals, now dummy-encoded
    'sex_Male', 'smoker_No', 'day_Thur', 'time_Lunch',   # one dummy per category kept as the baseline
])

In [13]:
# Hypothesis (before running feature selection): the most important predictors of tip will likely be total_bill, size (party size), day_Sat, and time_Dinner.
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,smoker_Yes,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,8.49,1,0,0,0,1,1
1,10.34,1.66,3,3.45,0,0,0,0,1,1
2,21.01,3.5,3,7.0,0,0,0,0,1,1
3,23.68,3.31,2,11.84,0,0,0,0,1,1
4,24.59,3.61,4,6.15,1,0,0,0,1,1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   size              244 non-null    int64  
 3   price_per_person  244 non-null    float64
 4   sex_Female        244 non-null    uint8  
 5   smoker_Yes        244 non-null    uint8  
 6   day_Fri           244 non-null    uint8  
 7   day_Sat           244 non-null    uint8  
 8   day_Sun           244 non-null    uint8  
 9   time_Dinner       244 non-null    uint8  
dtypes: float64(3), int64(1), uint8(6)
memory usage: 9.2 KB


In [14]:
# Split into train / validate / test partitions with the project-local wrangle helper.
# NOTE(review): split_zillow is defined outside this notebook; its split ratios and
# whether it fixes a random seed cannot be seen from here — confirm it is seeded so
# the feature selections below are reproducible.
train, validate, test = wrangle.split_zillow(df)

In [15]:
train.shape,validate.shape,test.shape

((136, 10), (34, 10), (74, 10))

In [16]:
# Separate the training partition into the target (tip) and its predictors.
y_train = train['tip']
X_train = train.drop(columns=['tip'])

In [19]:
X_train.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Female,smoker_Yes,day_Fri,day_Sat,day_Sun,time_Dinner
15,21.58,2,10.79,0,0,0,0,1,1
52,34.81,4,8.7,1,0,0,0,1,1
205,16.47,3,5.49,1,1,0,0,0,0
162,16.21,3,5.4,1,0,0,0,1,1
17,16.29,3,5.43,0,0,0,0,1,1


In [20]:
# Build a univariate selector that keeps the 2 features with the highest
# f_regression F-scores, and fit it on the training data.
# fit() returns the selector itself, so chaining avoids binding the result to
# `_`, which IPython uses for the previous cell's output.
kbest = SelectKBest(f_regression, k=2).fit(X_train, y_train)

In [21]:
# F-statistic and p-value for each column, in X_train column order.
kbest.scores_,kbest.pvalues_
# The two largest F-scores (the first two entries) belong to total_bill and size,
# so those are the k=2 features the selector keeps.

(array([113.35781848,  60.16250283,  14.14766124,   2.18152947,
          0.53365166,   2.0241628 ,   0.55205051,   4.92345035,
          1.11103418]),
 array([1.46730557e-19, 1.96830700e-12, 2.51642148e-04, 1.42021941e-01,
        4.66350959e-01, 1.57138044e-01, 4.58781680e-01, 2.81770186e-02,
        2.93754123e-01]))

In [23]:
# Tabulate each feature's F-score and p-value next to its column name.
pd.DataFrame({'column':X_train.columns,
            'f_score':kbest.scores_,
             'p_value':kbest.pvalues_})

# By F-score, the top three features are total_bill, size, and price_per_person.

Unnamed: 0,column,f_score,p_value
0,total_bill,113.357818,1.4673059999999998e-19
1,size,60.162503,1.968307e-12
2,price_per_person,14.147661,0.0002516421
3,sex_Female,2.181529,0.1420219
4,smoker_Yes,0.533652,0.466351
5,day_Fri,2.024163,0.157138
6,day_Sat,0.552051,0.4587817
7,day_Sun,4.92345,0.02817702
8,time_Dinner,1.111034,0.2937541


In [24]:
# 1c.
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [25]:
# Preview the training rows restricted to the columns SelectKBest kept.
# (A bare kbest.get_support() before this line displayed nothing, because only
# a cell's last expression is rendered.)
X_train.iloc[:, kbest.get_support()].head()

Unnamed: 0,total_bill,size
15,21.58,2
52,34.81,4
205,16.47,3
162,16.21,3
17,16.29,3


In [27]:
# establish a model for RFE to use

model = LinearRegression()

In [28]:
# Build a recursive-feature-elimination selector around the linear model,
# asking it to keep 2 features.
rfe = RFE(model, n_features_to_select=2)

In [29]:
# Fit the RFE selector on the training predictors and target.
rfe.fit(X_train,y_train)

In [30]:
rfe.ranking_

array([7, 1, 8, 1, 3, 6, 5, 2, 4])

In [31]:
# Label each RFE rank with its column name. Rank 1 marks the features RFE kept
# (here size and sex_Female); larger ranks were eliminated earlier.
# NOTE(review): RFE with LinearRegression ranks by coefficient magnitude, which
# depends on each feature's scale, and X_train is unscaled here — that may be why
# total_bill ranks 7th despite having the highest F-score. Consider scaling first.
pd.DataFrame({
    'rfe_ranking':rfe.ranking_
}, index=X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,7
size,1
price_per_person,8
sex_Female,1
smoker_Yes,3
day_Fri,6
day_Sat,5
day_Sun,2
time_Dinner,4


In [32]:
rfe.support_  #or rfe.get_support()

array([False,  True, False,  True, False, False, False, False, False])

In [33]:
# rfe.transform returns a bare array by default, so re-attach the kept column
# names and the original row index before previewing the selected features.
pd.DataFrame(rfe.transform(X_train),
            columns=X_train.columns[rfe.get_support()],index=X_train.index).head()

Unnamed: 0,size,sex_Female
15,2.0,0.0
52,4.0,1.0
205,3.0,1.0
162,3.0,1.0
17,3.0,0.0


#### 2. Write a function that selects the top k features with SelectKBest

In [34]:
def select_kbest(X_train, y_train, top_k=3):
    """Return the names of the `top_k` columns scored highest by f_regression.

    Parameters
    ----------
    X_train : DataFrame
        Candidate predictor columns.
    y_train : Series or array-like
        Target values aligned with the rows of `X_train`.
    top_k : int, default 3
        Number of columns to keep.

    Returns
    -------
    pandas.Index
        The selected column names, in their original `X_train` column order.
    """
    selector = SelectKBest(score_func=f_regression, k=top_k).fit(X_train, y_train)
    keep_mask = selector.get_support()
    return X_train.columns[keep_mask]

In [35]:
select_kbest(X_train,y_train,top_k=2)

Index(['total_bill', 'size'], dtype='object')

#### 3. Write another function that selects the top k features with RFE

In [38]:
def rfe(X_train, y_train, top_k=3):
    """Return the names of the `top_k` columns kept by recursive feature elimination.

    A LinearRegression estimator is wrapped in RFE and fitted on the given
    training data; the columns RFE keeps are returned.

    Parameters
    ----------
    X_train : DataFrame
        Candidate predictor columns.
    y_train : Series or array-like
        Target values aligned with the rows of `X_train`.
    top_k : int, default 3
        Number of columns RFE should keep.

    Returns
    -------
    pandas.Index
        The selected column names, in their original `X_train` column order.

    Notes
    -----
    Defining this function rebinds the notebook-level name `rfe`, which an
    earlier cell assigns to a fitted RFE instance. Cells that read attributes
    such as `rfe.ranking_` will fail if re-run after this definition.
    """
    # Use a distinct local name so the selector does not shadow this function.
    # LinearRegression and RFE come from the notebook's top import cell.
    selector = RFE(LinearRegression(), n_features_to_select=top_k)
    selector.fit(X_train, y_train)
    return X_train.columns[selector.get_support()]

In [39]:
rfe(X_train,y_train,top_k=2)

Index(['size', 'sex_Female'], dtype='object')

#### 4. Apply both functions to the Swiss data set

In [41]:
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [43]:
train, validate, test = wrangle.split_zillow(df)

In [44]:
train.shape,validate.shape,test.shape

((25, 6), (7, 6), (15, 6))

In [45]:
X_train = train.drop(columns='Fertility')
y_train = train['Fertility']

In [46]:
X_train.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Conthey,85.9,3,2,99.71,15.1
La Vallee,15.2,31,20,2.15,10.8
Rive Gauche,27.7,22,29,58.33,19.3
ValdeTravers,18.7,25,7,8.65,19.5
Le Locle,16.7,22,13,11.22,18.9


In [47]:
select_kbest(X_train,y_train,top_k=3)

Index(['Agriculture', 'Examination', 'Education'], dtype='object')

In [48]:
rfe(X_train,y_train,top_k=3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')