# TLC Trip Record Data Prediction
---

### Import packages

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

import random

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the first trip-record CSV and display the resulting frame.
first_csv = 'green_tripdata-one-.csv'
df1 = pd.read_csv(first_csv)
df1

In [None]:
# Read the second trip-record CSV and display the resulting frame.
second_csv = 'green_tripdata_two-.csv'
df2 = pd.read_csv(second_csv)
df2

In [None]:
# Stack the two files into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it the row labels of df1 and df2 repeat, which makes any
# later label-based selection or alignment ambiguous.
con_data = pd.concat([df1, df2], ignore_index=True)
con_data.head(10)

In [None]:
con_data.shape

In [None]:
con_data.info()

In [None]:
# con_data = con_data.sample(frac=1,random_state=79).reset_index(drop=True)

In [None]:
# Boolean Series marking rows that repeat an earlier row in every column.
duplicate = con_data.duplicated()
# Number of such repeated rows.
print(duplicate.sum())
# Show the repeated rows themselves.
con_data[duplicate]

In [None]:
# Simple validation method: hold out 20% of the rows as the test set.
# random_state is fixed so the split is reproducible.
con_data_train , con_data_test = train_test_split(con_data, test_size=0.2, random_state=199)

In [None]:
# Split the remaining training rows again (80/20) into the final training set
# and a validation set, with the same fixed random_state.
con_data_train2, con_data_val =  train_test_split(con_data_train, test_size=0.2, random_state=199)

## Relation Between Features

In [None]:
plt.rcParams["figure.figsize"] = (10, 8)

# Pairwise correlation of the numeric columns only. Selecting them explicitly
# keeps this working on pandas versions where DataFrame.corr() raises on
# non-numeric columns instead of silently skipping them.
con_data_corr = con_data.select_dtypes(include='number').corr()

# Mask the upper triangle (including the diagonal) so each pair is shown once.
# The builtin bool replaces np.bool, which was removed from NumPy.
mask = np.triu(np.ones_like(con_data_corr, dtype=bool))

# The first row and the last column are fully masked; trim them from both the
# mask and the matrix so the heatmap has no empty border.
mask = mask[1:, :-1]
corr = con_data_corr.iloc[1:, :-1].copy()

sns.heatmap(corr, cmap='icefire', annot=True, vmin=-1, vmax=1,
            linewidths=1.5, fmt='.2f', mask=mask);
plt.title('CORRELATION MATRIX\n', loc='left', fontsize=18);

In [None]:
# Parse the pickup and drop-off timestamp columns of the train, validation and
# test frames into datetime values. Every column uses the same explicit format.
_timestamp_format = "%Y-%m-%d %H:%M:%S"
_timestamp_columns = ('lpep_pickup_datetime', 'lpep_dropoff_datetime')

for _frame in (con_data_train2, con_data_val, con_data_test):
    for _column in _timestamp_columns:
        _frame[_column] = pd.to_datetime(_frame[_column], format=_timestamp_format)

In [None]:
con_data_train2.dtypes

In [None]:
con_data_val.dtypes

In [None]:
con_data_test.dtypes

In [None]:
#### Pickup datetime convert ####

def _add_pickup_parts(frame):
    """Add month, weekday, day and hour columns taken from the pickup time.

    The new columns are written into *frame* itself; the returned frame is
    the same data sorted by pickup time, oldest first.
    """
    pickup = frame["lpep_pickup_datetime"]
    frame["month_pickup"] = pd.DatetimeIndex(pickup).month
    frame["week_day_pickup"] = pickup.dt.weekday
    frame["day_pickup"] = pickup.dt.day
    frame["hour_pickup"] = pickup.dt.hour
    return frame.sort_values(by="lpep_pickup_datetime", ascending=True)


# Apply the same extraction to the train, validation and test splits.
con_data_train2 = _add_pickup_parts(con_data_train2)
con_data_val = _add_pickup_parts(con_data_val)
con_data_test = _add_pickup_parts(con_data_test)

In [None]:
#### Dropoff datetime convert ####

def _add_dropoff_parts(frame):
    """Add month, weekday, day and hour columns taken from the drop-off time.

    The new columns are written into *frame* itself; the returned frame is
    the same data sorted by drop-off time, oldest first.
    """
    dropoff = frame["lpep_dropoff_datetime"]
    frame["month_dropoff"] = pd.DatetimeIndex(dropoff).month
    frame["week_day_dropoff"] = dropoff.dt.weekday
    frame["day_dropoff"] = dropoff.dt.day
    frame["hour_dropoff"] = dropoff.dt.hour
    return frame.sort_values(by="lpep_dropoff_datetime", ascending=True)


# Apply the same extraction to the train, validation and test splits.
con_data_train2 = _add_dropoff_parts(con_data_train2)
con_data_val = _add_dropoff_parts(con_data_val)
con_data_test = _add_dropoff_parts(con_data_test)

In [None]:
con_data_train2.head()

In [None]:
con_data_val.head()

In [None]:
con_data_test.head()

In [None]:
con_data_train2.columns

In [None]:
con_data_val.columns

In [None]:
con_data_test.columns

In [None]:
#### Clean outliers for time ####

# Keep only trips picked up in January or February 2021.
# The upper bound is exclusive at 1 March. The previous test,
# `<= Timestamp(2021, 2, 28)`, compared against midnight at the start of
# 28 February and therefore discarded every trip picked up during that day.
# NOTE(review): assumes the data is meant to cover all of Jan-Feb 2021 - confirm.
period_start = pd.Timestamp(2021, 1, 1)
period_end = pd.Timestamp(2021, 3, 1)


def _within_period(frame):
    """Return the rows of *frame* whose pickup time is in [period_start, period_end)."""
    pickup = frame['lpep_pickup_datetime']
    return frame[(pickup >= period_start) & (pickup < period_end)]


con_data_train2 = _within_period(con_data_train2)
con_data_val = _within_period(con_data_val)
con_data_test = _within_period(con_data_test)

In [None]:
# The raw timestamp columns are removed from every split; the month / weekday /
# day / hour columns derived from them stay.
_raw_timestamp_columns = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']

con_data_train2 = con_data_train2.drop(columns=_raw_timestamp_columns)
con_data_val = con_data_val.drop(columns=_raw_timestamp_columns)
con_data_test = con_data_test.drop(columns=_raw_timestamp_columns)

In [None]:
con_data_train2.isna().sum()

In [None]:
con_data_val.isna().sum()

In [None]:
con_data_test.isna().sum()

In [None]:
# Remove the same three columns, in place, from the train, validation and
# test frames.
_unused_columns = ['ehail_fee', 'congestion_surcharge', 'RatecodeID']

for _frame in (con_data_train2, con_data_val, con_data_test):
    _frame.drop(columns=_unused_columns, inplace=True)

In [None]:
con_data_val.isna().sum()

In [None]:
#### for train ####

duplicate = con_data_train2.duplicated()
print(duplicate.sum())
con_data_train2[duplicate]

In [None]:
#### for train ####

# drop duplicates rows
con_data_train2.drop_duplicates(inplace=True)

In [None]:
#### for val ####

duplicate = con_data_val.duplicated()
print(duplicate.sum())
con_data_val[duplicate]

In [None]:
#### for val ####

# drop duplicates rows
con_data_val.drop_duplicates(inplace=True)

In [None]:
#### for test ####

duplicate = con_data_test.duplicated()
print(duplicate.sum())
con_data_test[duplicate]

In [None]:
#### for test ####

# drop duplicates rows
con_data_test.drop_duplicates(inplace=True)

In [None]:
#### drop nulls ####

# No value is imputed here: any row that still has a missing value is removed
# from each split.
con_data_train2, con_data_val, con_data_test = (
    _frame.dropna()
    for _frame in (con_data_train2, con_data_val, con_data_test)
)

In [None]:
con_data_train2.describe().transpose()

In [None]:
#### passenger_count outliers / zero distance trips / zero or negative fares ####

def _drop_invalid_trips(frame):
    """Return only the plausible trips in *frame*.

    A row is kept when it has more than 0 and fewer than 9 passengers, a
    positive trip distance and a positive fare. The fare test is strict
    (`> 0`): the earlier `>= 0` kept zero-fare trips even though the stated
    intent was to remove zero and negative fares.
    """
    passengers = frame['passenger_count']
    keep = (
        (passengers > 0)
        & (passengers < 9)
        & (frame['trip_distance'] > 0)
        & (frame['fare_amount'] > 0)
    )
    return frame[keep]


# Apply the same rules to the train, validation and test splits.
con_data_train2 = _drop_invalid_trips(con_data_train2)
con_data_val = _drop_invalid_trips(con_data_val)
con_data_test = _drop_invalid_trips(con_data_test)

In [None]:
con_data_train2.info()

In [None]:
con_data_val.info()

In [None]:
con_data_test.info()

---
### Encoding the categorical columns (VendorID, store_and_fwd_flag, payment_type, trip_type) as dummy variables

In [None]:
con_data_train2.shape

In [None]:
con_data_val.shape

In [None]:
con_data_test.shape

In [None]:
con_data_train2['VendorID'].dtypes

In [None]:
con_data_val['VendorID'].dtypes

In [None]:
con_data_test['VendorID'].dtypes

In [None]:
# # VendorID type conversion to get dummies for train
# con_data_train2['VendorID'] = con_data_train2.VendorID.astype('category')
# # VendorID type conversion to get dummies for val
# con_data_val['VendorID'] = con_data_val.VendorID.astype('category')
# # VendorID type conversion to get dummies for test
# con_data_test['VendorID'] = con_data_test.VendorID.astype('category')

In [None]:
# con_data_train2['store_and_fwd_flag'].dtypes

In [None]:
# con_data_val['store_and_fwd_flag'].dtypes

In [None]:
# con_data_test['store_and_fwd_flag'].dtypes

In [None]:
# con_data_train2['payment_type'].dtypes

In [None]:
# con_data_val['payment_type'].dtypes

In [None]:
# con_data_test['payment_type'].dtypes

In [None]:
# # payment_type type conversion to get dummies for train
# con_data_train2['payment_type'] = con_data_train2.payment_type.astype('category')
# # payment_type type conversion to get dummies for val
# con_data_val['payment_type'] = con_data_val.payment_type.astype('category')
# # payment_type type conversion to get dummies for test
# con_data_test['payment_type'] = con_data_test.payment_type.astype('category')

In [None]:
# con_data_train2['trip_type'].dtypes

In [None]:
# con_data_val['trip_type'].dtypes

In [None]:
# con_data_test['trip_type'].dtypes

In [None]:
# # trip_type type conversion to get dummies for train 
# con_data_train2['trip_type'] = con_data_train2.trip_type.astype('category')
# # trip_type type conversion to get dummies for val
# con_data_val['trip_type'] = con_data_val.trip_type.astype('category')
# # trip_type type conversion to get dummies for test
# con_data_test['trip_type'] = con_data_test.trip_type.astype('category')

In [None]:
# One-hot encode the non-numeric columns of the training frame.
con_data_train2 = pd.get_dummies(con_data_train2)

# Encode validation and test separately, then force them onto exactly the
# training columns, in the training order. A category missing from a split
# becomes an all-zero column; a category never seen in training is dropped.
# Without this the splits can end up with different columns, which breaks
# scaler.transform() and model.predict() later on.
_train_columns = con_data_train2.columns
con_data_val = pd.get_dummies(con_data_val).reindex(columns=_train_columns, fill_value=0)
con_data_test = pd.get_dummies(con_data_test).reindex(columns=_train_columns, fill_value=0)

In [None]:
con_data_train2.shape

In [None]:
con_data_val.shape

In [None]:
con_data_test.shape

In [None]:
con_data_train2.columns

In [None]:
con_data_val.columns

In [None]:
con_data_test.columns

In [None]:
# # rename the columns for train
# con_data_train2.rename(columns={'VendorID_1.0': 'creative_mobile_technologies_LLC',
#                          'VendorID_2.0': 'verifone_inc',
#                          'store_and_fwd_flag_N': 'not_a_store_and_forward_trip',
#                          'store_and_fwd_flag_Y': 'store_and_forward_trip',
# #                          'RatecodeID_1.0': 'standard_rate',
# #                          'RatecodeID_2.0': 'JFK',
# #                          'RatecodeID_3.0': 'newark',
# #                          'RatecodeID_4.0': 'nassau_or_westchester',
# #                          'RatecodeID_5.0': 'negotiated_fare',
#                          'payment_type_1.0': 'credit_card',
#                          'payment_type_2.0': 'cash',
#                          'payment_type_3.0': 'no_charge',
#                          'payment_type_4.0': 'dispute',
#                          'payment_type_5.0': 'unknown',
#                          'trip_type_1.0': 'street-hail',
#                          'trip_type_2.0': 'dispatch',}, inplace= True)



# # rename the columns for val
# con_data_val.rename(columns={'VendorID_1.0': 'creative_mobile_technologies_LLC',
#                          'VendorID_2.0': 'verifone_inc',
#                          'store_and_fwd_flag_N': 'not_a_store_and_forward_trip',
#                          'store_and_fwd_flag_Y': 'store_and_forward_trip',
# #                          'RatecodeID_1.0': 'standard_rate',
# #                          'RatecodeID_2.0': 'JFK',
# #                          'RatecodeID_3.0': 'newark',
# #                          'RatecodeID_4.0': 'nassau_or_westchester',
# #                          'RatecodeID_5.0': 'negotiated_fare',
#                          'payment_type_1.0': 'credit_card',
#                          'payment_type_2.0': 'cash',
#                          'payment_type_3.0': 'no_charge',
#                          'payment_type_4.0': 'dispute',
#                          'payment_type_5.0': 'unknown',
#                          'trip_type_1.0': 'street-hail',
#                          'trip_type_2.0': 'dispatch',}, inplace= True)



# # rename the columns for test
# con_data_test.rename(columns={'VendorID_1.0': 'creative_mobile_technologies_LLC',
#                          'VendorID_2.0': 'verifone_inc',
#                          'store_and_fwd_flag_N': 'not_a_store_and_forward_trip',
#                          'store_and_fwd_flag_Y': 'store_and_forward_trip',
# #                          'RatecodeID_1.0': 'standard_rate',
# #                          'RatecodeID_2.0': 'JFK',
# #                          'RatecodeID_3.0': 'newark',
# #                          'RatecodeID_4.0': 'nassau_or_westchester',
# #                          'RatecodeID_5.0': 'negotiated_fare',
#                          'payment_type_1.0': 'credit_card',
#                          'payment_type_2.0': 'cash',
#                          'payment_type_3.0': 'no_charge',
#                          'payment_type_4.0': 'dispute',
#                          'payment_type_5.0': 'unknown',
#                          'trip_type_1.0': 'street-hail',
#                          'trip_type_2.0': 'dispatch',}, inplace= True)

In [None]:
con_data_train2.sample(7)

In [None]:
con_data_val.sample(7)

In [None]:
con_data_test.sample(7)

---
## Visualize data

In [None]:
# sns.pairplot(con_data_train2);

In [None]:
# Density histogram of the training fares (75 bins) with light horizontal
# grid lines.
_fig, _ax = plt.subplots(figsize=(8, 6))
sns.histplot(con_data_train2['fare_amount'], bins=75, color='#c15a3a',
             stat='density', ax=_ax);
_ax.set_xlabel('Fare amount');
_ax.grid(axis='y', lw=0.25);

In [None]:
# Figure 1: fare against trip distance.
fig1 = plt.figure(figsize=(8, 6))
ax5 = fig1.add_subplot(1, 1, 1)
ax5.scatter(con_data_train2.trip_distance, con_data_train2.fare_amount,
            color='#533e98', alpha=0.2)
ax5.set_title('The graph of payment depending on the trip distance')
# The x axis shows trip distance; it was previously labelled 'Day of the week'.
ax5.set_xlabel('Trip distance')
ax5.set_ylabel('Payment');

# Figure 2: box plots of passenger count and fare amount, side by side.
# The figure gets its own axes so its title and labels no longer overwrite
# those of figure 1 (they used to be set on ax5).
fig2 = plt.figure(figsize=(8, 6))
ax6 = fig2.add_subplot(1, 1, 1)

data = [con_data_train2.passenger_count, con_data_train2.fare_amount]
ax6.boxplot(data)
ax6.set_xticklabels(['Number of passengers', 'Payment'])
ax6.set_title('Distribution of the number of passengers and of the payment')
ax6.set_ylabel('Value')
plt.show()

In [None]:
con_data_train2['passenger_count'].value_counts().reset_index()

In [None]:
con_data_val['passenger_count'].value_counts().reset_index()

In [None]:
con_data_test['passenger_count'].value_counts().reset_index()

In [None]:
# # # relationship between hour_dropoff and fare_amount
# # # plot(x = con_data_train2['hour_dropoff'], y = con_data_train2['fare_amount'])
# con_data_train2.plot(x='fare_amount', y=['hour_pickup','hour_dropoff'], kind='barh');

In [None]:
# con_data_train2.plot(x='fare_amount', y=['not_a_store_and_forward_trip','store_and_forward_trip'], kind='barh');

In [None]:
# Number of training trips per pickup hour (count plot over hour_pickup).
sns.catplot(x='hour_pickup', kind='count', palette='icefire', data=con_data_train2, height=3, aspect=3);
plt.title('Hour of Day');

In [None]:
# # taxi trip repartition by pickup month
# sns.catplot(x=['month_pickup','month_dropoff'], kind='count', palette='icefire', data=con_data_train2, height=3, aspect=3);
# plt.title('Pickup Month');

1. Does the number of passengers affect the fare?

In [None]:
# How often each passenger count occurs in the training data.
_fig, _ax = plt.subplots(figsize=(8, 6))
_ax.hist(con_data_train2['passenger_count'], bins=100, color='#533e98')
_ax.set_xlabel('No. of Passengers')
_ax.set_ylabel('Frequency');

In [None]:
# Fare against passenger count, one translucent dot per training trip.
_fig, _ax = plt.subplots(figsize=(8, 6))
_ax.scatter(con_data_train2['passenger_count'], con_data_train2['fare_amount'],
            s=10, color='#c15a3a', alpha=0.2)
_ax.set_xlabel('No. of Passengers')
_ax.set_ylabel('Fare');

2. Does the time of pickup affect the fare?

In [None]:
# How often each pickup hour occurs in the training data.
_fig, _ax = plt.subplots(figsize=(8, 6))
_ax.hist(con_data_train2['hour_pickup'], bins=100, color='#533e98')
_ax.set_xlabel('Hour Pickup')
_ax.set_ylabel('Frequency');

In [None]:
# Fare against pickup hour, one translucent dot per training trip.
_fig, _ax = plt.subplots(figsize=(8, 6))
_ax.scatter(con_data_train2['hour_pickup'], con_data_train2['fare_amount'],
            s=10, color='#c15a3a', alpha=0.2)
_ax.set_xlabel('Hour Pickup')
_ax.set_ylabel('Fare');

3. Does the day of the week affect the fare?

In [None]:
# How often each pickup weekday occurs in the training data.
_fig, _ax = plt.subplots(figsize=(8, 6))
_ax.hist(con_data_train2['week_day_pickup'], bins=100, color='#533e98')
_ax.set_xlabel('Day pickup of Week')
_ax.set_ylabel('Frequency');

The day of the week does not seem to have much influence on the number of cab rides.

In [None]:
# Fare against pickup weekday, one translucent dot per training trip.
_fig, _ax = plt.subplots(figsize=(8, 6))
_ax.scatter(con_data_train2['week_day_pickup'], con_data_train2['fare_amount'],
            s=10, color='#c15a3a', alpha=0.2)
_ax.set_xlabel('Day pickup of Week')
_ax.set_ylabel('Fare');

---
## Scaling
Scaling the features puts the regression coefficients on a comparable scale, which makes them easier to interpret.

In [None]:
con_data_val.shape

In [None]:
con_data_train2.shape

In [None]:
con_data_test.shape

In [None]:
# Standardise every column. The scaler learns its mean and standard deviation
# from the training frame only and is then reused, unchanged, on validation.
# Note: fare_amount (the target) is standardised with the features, so all
# scores and errors computed later are in standardised units.
scaler = StandardScaler()

_train_cols = con_data_train2.columns
_val_cols = con_data_val.columns
con_data_train2[_train_cols] = scaler.fit_transform(con_data_train2[_train_cols])
con_data_val[_val_cols] = scaler.transform(con_data_val[_val_cols])
# con_data_test[con_data_test.columns]=scaler.transform(con_data_test[con_data_test.columns])

# Split each frame into the target (fare_amount) and the remaining features.
y_train = con_data_train2['fare_amount']
X_train = con_data_train2.drop(columns='fare_amount')
y_val = con_data_val['fare_amount']
X_val = con_data_val.drop(columns='fare_amount')
# X_test=con_data_test.drop('fare_amount',axis=1)
# y_test=con_data_test['fare_amount']

In [None]:
# Row counts of the feature / target pairs; each pair must match.
print("Length of the X_train = ",len(X_train))
print("Length of the y_train = ",len(y_train))
# print("Length of the X_test = ",len(X_test))
# print("Length of the y_test = ",len(y_test))
# This line used to print len(y_val) a second time, so X_val was never shown.
print("Length of the X_val = ",len(X_val))
print("Length of the y_val = ",len(y_val))

---
### Model Building
Model with all features

In [None]:
seed = 199  # defined here but not used by anything in this cell

# Ordinary least squares on all features. fit() returns the estimator itself,
# so lm and lm1 refer to the same fitted model.
lm = LinearRegression()
lm1 = lm.fit(X_train, y_train)

# Predictions on both splits, then the R^2 on the training data.
y_pred_train = lm1.predict(X_train)
y_pred_val = lm1.predict(X_val)
# y_pred_test = lm1.predict(X_test)
train_r_squared = lm1.score(X_train, y_train)
print("R-sq of training set = ", train_r_squared)

In [None]:
print("R-sq of validation set = ",lm1.score(X_val,y_val))

In [None]:
# plot
# Training-set predictions of the fitted linear model, kept for the joint plot
# below (currently commented out).
pred = lm.predict(X_train) 
#sns.jointplot(x= pred, y= y_train, kind='reg', color='#c15a3a');

In [None]:
# from sklearn import metrics
#print('\nLinear Regression Performance Metrics')
#print('R^2=',metrics.explained_variance_score(y_test,y_pred_lm))
# print('MAE:',metrics.mean_absolute_error(y_test,y_pred_lm))
# print('MSE:',metrics.mean_squared_error(y_test,y_pred_lm))
# print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test,y_pred_lm)))

In [None]:
y_pred_train.shape

In [None]:
y_train.shape

In [None]:
X_train.shape 

In [None]:
X_val.shape

In [None]:
y_val.shape

---
## Ridge Regularization

In [None]:
#  train data: fit a ridge model (alpha = 100) and report its R^2 on the
#  training set.
lr_model_ridge = Ridge(alpha = 100)
lr_model_ridge.fit(X_train, y_train)
# Score the model fitted just above. This line used to reference
# lm_model_ridge, which is only created in the next cell (NameError on a
# fresh run).
lr_model_ridge.score(X_train, y_train)




In [None]:
#  val data: the same ridge model (alpha = 100) is fitted on the training set
#  and its R^2 is reported on the validation set.
_ridge_alpha = 100
lm_model_ridge = Ridge(alpha=_ridge_alpha).fit(X_train, y_train)
lm_model_ridge.score(X_val, y_val)

In [None]:
# Mean Absolute Error (MAE)
def mae(y_true, y_pred):
    """Return the mean absolute difference between *y_true* and *y_pred*.

    Both inputs are converted to float NumPy arrays first. Plain Python
    sequences are therefore accepted, and two pandas Series are compared
    position by position instead of being aligned on their index labels
    (label alignment would yield NaN for any labels that do not match).

    Args:
        y_true: observed values (sequence, array or Series).
        y_pred: predicted values, same length as *y_true*.

    Returns:
        The mean of ``|y_pred - y_true|`` as a NumPy float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_pred - y_true))


In [None]:
# 200 candidate ridge penalties, evenly spaced on a log scale from 1e-2 to 1e2.
alphalist = 10 ** np.linspace(-2, 2, 200)
n_alphas = len(alphalist)
err_vec_val = np.zeros(n_alphas)
err_vec_train = np.zeros(n_alphas)  # allocated but never filled by this loop

# For each penalty: standardise, fit a ridge model on the training data and
# record the validation MAE.
for position, alpha in enumerate(alphalist):
    pipe = Pipeline([
        ('standardize', StandardScaler()),
        ('Ridge', Ridge(alpha=alpha)),
    ])
    pipe.fit(X_train.to_numpy(), y_train)

    val_set_pred = pipe.predict(X_val.to_numpy())
    err_vec_val[position] = mae(y_val, val_set_pred)

In [None]:
# Validation MAE as a function of log10(alpha).
plt.figure(figsize=(8,6))
plt.plot(np.log10(alphalist), err_vec_val, color='#c15a3a');

In [None]:
np.min(err_vec_val)

In [None]:
alphalist[np.argmin(err_vec_val)]

## Polynomial

---

In [None]:
# for train: expand the training features with every degree-2 term and no
# constant (bias) column.
_poly_train = PolynomialFeatures(degree=2, include_bias=False)
x_ = _poly_train.fit_transform(X_train)

In [None]:
print(x_)

In [None]:
model = LinearRegression().fit(x_, y_train)

In [None]:
r_sq = model.score(x_, y_train)

In [None]:
print('coefficient of determination:', r_sq)

In [None]:
print('intercept:', model.intercept_)

In [None]:
# for val: the same degree-2 expansion (no bias column) applied to the
# validation features; x_ is overwritten with the result.
_poly_val = PolynomialFeatures(degree=2, include_bias=False)
x_ = _poly_val.fit_transform(X_val)

In [None]:
# Fit the degree-2 model on the TRAINING data only. Fitting it on
# (x_, y_val) and then scoring it on the same data, as was done before,
# reports how well the model memorises the validation set rather than how
# well it generalises. x_ currently holds the expanded validation features,
# so the score computed next is a true validation R^2.
model = LinearRegression().fit(
    PolynomialFeatures(degree=2, include_bias=False).fit_transform(X_train),
    y_train,
)

In [None]:
r_sq = model.score(x_, y_val)

In [None]:
print('coefficient of determination:', r_sq)

In [None]:
print('intercept:', model.intercept_)