In [23]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_selection import RFE
from pydataset import data
from math import sqrt
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings("ignore")
from  wrangle import wrangle_zillow

pd.set_option('display.max_columns', None)

# 1.Select a dataset with a continuous target variable.

In [2]:
def split_data(df, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test samples.

    Parameters
    ----------
    df : the dataframe to split.
    seed : integer used as random_state for both splits, so the same
        dataframe always yields the same partition.
    test_size : share of df held out as the test sample.
    validate_size : share of the remaining (non-test) rows held out as
        the validate sample.

    With the default sizes, test is 20% of the original dataset, validate
    is .30*.80 = 24% of the original dataset, and train is .70*.80 = 56%
    of the original dataset. The split is purely random; no stratification
    on a target variable is performed.

    Returns
    -------
    train, validate and test dataframes, in this order.
    '''
    # First carve off the test sample from the full dataframe
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=seed)

    # Then divide what is left into train and validate
    train, validate = train_test_split(train_validate,
                                       test_size=validate_size,
                                       random_state=seed)

    return train, validate, test
    


In [3]:
# import the tips data from pydataset
df= data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# check whether there are any null values
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [5]:
# Encode the two-valued categorical columns as 0/1 integers.
binary_encodings = {
    'sex': {'Female': 0, 'Male': 1},
    'smoker': {'No': 0, 'Yes': 1},
    'time': {'Lunch': 0, 'Dinner': 1},
}

for column, codes in binary_encodings.items():
    df[column] = df[column].map(codes)

In [6]:
# One-hot encode the day column, dropping the first level to avoid a
# redundant column. dtype=int pins the dummy columns to 0/1 integers so the
# result does not depend on the pandas default (boolean in recent releases).
df = pd.get_dummies(df, columns=['day'], drop_first=True, dtype=int)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Sat,day_Sun,day_Thur
1,16.99,1.01,0,0,1,2,0,1,0
2,10.34,1.66,1,0,1,3,0,1,0
3,21.01,3.5,1,0,1,3,0,1,0
4,23.68,3.31,1,0,1,2,0,1,0
5,24.59,3.61,0,0,1,4,0,1,0


In [10]:
# Min-max scale the total_bill and size columns (df is modified in place).
# The scaler is fit on the training rows only, so the validate and test
# samples do not leak into the preprocessing step. split_data is seeded, so
# the rows it returns as train here are the rows a later split_data(df) call
# with the same seed puts in train, provided the row order is unchanged.
columns_to_scale = ['total_bill', 'size']
train_rows = split_data(df)[0]

scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler on the training rows only
scaler.fit(train_rows[columns_to_scale])

# Transform every row with the train-fitted scaler; validate/test values may
# therefore fall slightly outside the 0-1 range
df[columns_to_scale] = scaler.transform(df[columns_to_scale])
df.head()


 


Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Sat,day_Sun,day_Thur
1,0.291579,1.01,0,0,1,0.2,0,1,0
2,0.152283,1.66,1,0,1,0.4,0,1,0
3,0.375786,3.5,1,0,1,0.4,0,1,0
4,0.431713,3.31,1,0,1,0.2,0,1,0
5,0.450775,3.61,0,0,1,0.6,0,1,0


In [12]:
# split the data using the split_data function
train, validate, test = split_data(df)
print(train.shape, validate.shape,test.shape)

(136, 9) (59, 9) (49, 9)


In [48]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Sat,day_Sun,day_Thur
19,0.29116,3.5,0,0,1,0.4,0,1,0
173,0.087558,5.15,1,1,1,0.2,0,1,0
119,0.196062,1.8,0,0,0,0.2,0,0,1
29,0.390239,4.3,1,0,1,0.2,1,0,0
238,0.623377,1.17,1,1,1,0.2,1,0,0


In [49]:
train.shape

(136, 9)

In [50]:
# Separate the feature columns (X) from the target column (y) in each sample.
target = 'tip'

X_train, y_train = train.drop(columns=[target]), train[target]

X_validate, y_validate = validate.drop(columns=[target]), validate[target]

X_test, y_test = test.drop(columns=[target]), test[target]



In [34]:
X_train

Unnamed: 0,total_bill,sex,smoker,time,size,day_Sat,day_Sun,day_Thur
19,0.29116,0,0,1,0.4,0,1,0
173,0.087558,1,1,1,0.2,0,1,0
119,0.196062,0,0,0,0.2,0,0,1
29,0.390239,1,0,1,0.2,1,0,0
238,0.623377,1,1,1,0.2,1,0,0
208,0.746963,1,1,1,0.6,1,0,0
184,0.421031,1,1,1,0.6,0,1,0
61,0.360704,1,1,1,0.2,1,0,0
42,0.301424,1,0,1,0.2,0,1,0
161,0.386049,1,0,1,0.6,0,1,0


In [51]:
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape

((136, 8), (136,), (59, 8), (59,))

In [52]:
X_train.shape, X_validate.shape, X_test.shape, y_validate.shape

((136, 8), (59, 8), (49, 8), (59,))

In [18]:
# baseline = train.tip.mean()
# train['baseline'] = baseline
# validate['baseline'] = baseline


# # root mean square error for the train and validate sample with baseline
# train_rmse = sqrt(mean_squared_error(train.tip, train.baseline))
# valid_rmse = sqrt(mean_squared_error(validate.tip, validate.baseline))

# print(f'Train baseline RMSE: {train_rmse}')
# print('Validation baseline RMSE: {}.'.format(valid_rmse))



In [53]:
# fit the linear regression model to the scaled train sample
lm = LinearRegression()

lm.fit(X_train, y_train)


LinearRegression()

In [54]:
# predictions and RMSE of the linear regression model on the train sample
y_pred_lm = lm.predict(X_train)
rmse = sqrt(mean_squared_error(y_train, y_pred_lm))
print(f'train RMSE {rmse}')

# predictions and RMSE of the linear regression model on the validate sample
y_pred = lm.predict(X_validate)
rmse = sqrt(mean_squared_error(y_validate, y_pred))
print(f'validate RMSE {rmse}')

train RMSE1.020076250050266
validate RMSE 1.1562209094697196


In [55]:
# predictions and RMSE of the linear regression model on the test sample
# NOTE(review): the test sample is scored here before the LassoLars and
# Tweedie models below are compared; consider scoring test only once, on the
# model finally selected.
y_pred_test = lm.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred_test))
print(f'test RMSE {rmse}')

test RMSE0.8503868072402969


In [44]:
# create the LassoLars model and fit it on the train sample
lars = LassoLars(alpha=1.0)

lars.fit(X_train, y_train)

LassoLars()

In [45]:
#fit TweedieRegressor model
glm = TweedieRegressor(power=0, alpha=1)

glm.fit(X_train, y_train)

TweedieRegressor(alpha=1, power=0)

In [57]:
# evaluate all three models fitted above on the train and validate samples
models = [lm, lars, glm]

for model in models:
    # Keep the predictions in their own arrays: y_train and y_validate are
    # Series, so assigning into them with a string key would append a new
    # element to the target instead of adding a prediction column.
    train_predictions = model.predict(X_train)
    rmse_train = sqrt(mean_squared_error(y_train, train_predictions))

    validate_predictions = model.predict(X_validate)
    rmse_validate = sqrt(mean_squared_error(y_validate, validate_predictions))

    print('RMSE for {} model on the train dataset: {}.'.format(model, round(rmse_train, 2)))
    print('RMSE for {} model on the validate dataset: {}.'.format(model, round(rmse_validate, 2)))
    print()