# TLC Trip Record Data Prediction
---

---
## Import packages 

In [None]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
%config InlineBackend.figure_format = 'svg'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')
import random
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RidgeCV
from yellowbrick.regressor import prediction_error

In [None]:
# Load the first trip-record CSV into a DataFrame and display it.
df1= pd.read_csv('green_tripdata-one-.csv')
df1

In [None]:
# Load the second trip-record CSV into a DataFrame and display it.
df2= pd.read_csv('green_tripdata_two-.csv')
df2

In [None]:
# Stack the two frames row-wise (each keeps its own row labels at this point)
# and preview the first ten rows.
con_data= pd.concat([df1,df2])   # concatenate the two datasets
con_data.head(10)

In [None]:
# Replace the row labels with a fresh 0..n-1 index and discard the old labels.
con_data= con_data.reset_index(drop= True)

In [None]:
# (rows, columns) of the combined frame.
con_data.shape

In [None]:
# Column dtypes and non-null counts of the combined frame.
con_data.info()

In [None]:
# Flag rows that repeat an earlier row in every column, print how many there
# are, and display them.
duplicate = con_data.duplicated()
print(duplicate.sum())
con_data[duplicate]

In [None]:
# Count repeated row labels in the index (note: this rebinds `duplicate`).
duplicate = con_data.index.duplicated()
print(duplicate.sum())

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle reproducible.
con_data_train , con_data_test = train_test_split(con_data, test_size=0.2, random_state=199)

In [None]:
# Split the remaining 80% again: 20% of it becomes the validation set
# (16% of all rows) and the rest (64% of all rows) is the final training set.
con_data_train2, con_data_val =  train_test_split(con_data_train, test_size=0.2, random_state=199)

---
## Feature Engineering on Time

In [None]:
# Parse the pickup and drop-off timestamp strings of every split (train, val,
# test) into datetime values, using one explicit format for all of them.
stamp_format = "%Y-%m-%d %H:%M:%S"
for split_frame in (con_data_train2, con_data_val, con_data_test):
    for stamp_col in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
        split_frame[stamp_col] = pd.to_datetime(split_frame[stamp_col], format=stamp_format)

In [None]:
# Column dtypes of the training split.
con_data_train2.dtypes

In [None]:
# Column dtypes of the validation split.
con_data_val.dtypes

In [None]:
# Column dtypes of the test split.
con_data_test.dtypes

In [None]:
#### Pickup datetime features ####

def _with_pickup_parts(frame):
    """Add month/weekday/day/hour pickup columns to ``frame`` in place.

    Returns a copy of the frame sorted by ascending pickup time.
    """
    pickup = frame["lpep_pickup_datetime"]
    frame["month_pickup"] = pd.DatetimeIndex(pickup).month
    frame["week_day_pickup"] = pickup.dt.weekday   # 0 = Monday ... 6 = Sunday
    frame["day_pickup"] = pickup.dt.day
    frame["hour_pickup"] = pickup.dt.hour
    return frame.sort_values(by="lpep_pickup_datetime", ascending=True)


# Same feature extraction for each split.
con_data_train2 = _with_pickup_parts(con_data_train2)
con_data_val = _with_pickup_parts(con_data_val)
con_data_test = _with_pickup_parts(con_data_test)

In [None]:
#### Drop-off datetime features ####

def _with_dropoff_parts(frame):
    """Add month/weekday/day/hour drop-off columns to ``frame`` in place.

    Returns a copy of the frame sorted by ascending drop-off time.
    """
    dropoff = frame["lpep_dropoff_datetime"]
    frame["month_dropoff"] = pd.DatetimeIndex(dropoff).month
    frame["week_day_dropoff"] = dropoff.dt.weekday   # 0 = Monday ... 6 = Sunday
    frame["day_dropoff"] = dropoff.dt.day
    frame["hour_dropoff"] = dropoff.dt.hour
    return frame.sort_values(by="lpep_dropoff_datetime", ascending=True)


# Same feature extraction for each split.
con_data_train2 = _with_dropoff_parts(con_data_train2)
con_data_val = _with_dropoff_parts(con_data_val)
con_data_test = _with_dropoff_parts(con_data_test)

In [None]:
# Distinct pickup hours present in the training split.
con_data_train2.hour_pickup.unique()

In [None]:
# Distinct pickup weekdays present in the training split.
con_data_train2.week_day_pickup.unique()

In [None]:
# adding rush hours
def rushhour(hour):
    """Return 1 if ``hour`` is one of the rush hours (5-8 or 17-19), else 0."""
    rush_hours = (5, 6, 7, 8, 17, 18, 19)
    return 1 if hour in rush_hours else 0
 
# apply method
# Flag every trip of every split by whether its pickup hour is a rush hour.
for split_frame in (con_data_train2, con_data_val, con_data_test):
    split_frame['rush_hour'] = split_frame['hour_pickup'].apply(rushhour)

#adding work days
def workday(day):
    """Return 1 if ``day`` is a weekday number 0-4 (Monday-Friday), else 0."""
    working_days = (0, 1, 2, 3, 4)
    return 1 if day in working_days else 0

  # apply method
# Flag every trip of every split by whether its pickup weekday is a work day.
for split_frame in (con_data_train2, con_data_val, con_data_test):
    split_frame['work_day'] = split_frame['week_day_pickup'].apply(workday)


In [None]:
# Ten randomly chosen rows of the training split (different on every run).
con_data_train2.sample(10)

In [None]:
# First rows of the validation split.
con_data_val.head()

In [None]:
# First rows of the test split.
con_data_test.head()

In [None]:
# Column names of the training split.
con_data_train2.columns

In [None]:
# Column names of the validation split.
con_data_val.columns

In [None]:
# Column names of the test split.
con_data_test.columns

In [None]:
#### Clean outliers for time ####

# Keep only trips picked up in January or February 2021.
# The upper bound is exclusive (< 1 March): pd.Timestamp(2021, 2, 28) is
# midnight at the START of 28 February, so comparing with `<=` against it
# would discard every trip picked up later on that day.
window_start = pd.Timestamp(2021, 1, 1)
window_end = pd.Timestamp(2021, 3, 1)


def _within_pickup_window(frame):
    """Return the rows of ``frame`` picked up in [window_start, window_end)."""
    pickup = frame['lpep_pickup_datetime']
    return frame[(pickup >= window_start) & (pickup < window_end)]


con_data_train2 = _within_pickup_window(con_data_train2)
con_data_val = _within_pickup_window(con_data_val)
con_data_test = _within_pickup_window(con_data_test)

In [None]:
# Remove the raw pickup/drop-off timestamp columns from every split.
stamp_cols = ['lpep_pickup_datetime', 'lpep_dropoff_datetime']
con_data_train2 = con_data_train2.drop(columns=stamp_cols)
con_data_val = con_data_val.drop(columns=stamp_cols)
con_data_test = con_data_test.drop(columns=stamp_cols)

---
## Data Pre-Processing

In [None]:
# Missing-value count per column in the training split.
con_data_train2.isna().sum()

In [None]:
# Missing-value count per column in the validation split.
con_data_val.isna().sum()

In [None]:
# Missing-value count per column in the test split.
con_data_test.isna().sum()

In [None]:
# Remove the 'ehail_fee' column from each split, modifying the frames in place.
for split_frame in (con_data_train2, con_data_val, con_data_test):
    split_frame.drop(columns=['ehail_fee'], inplace=True)

In [None]:
# Re-check the missing-value count per column in the validation split.
con_data_val.isna().sum()

In [None]:
#### for train ####

# Count the fully duplicated rows of the training split and display them.
duplicate = con_data_train2.duplicated()
print(duplicate.sum())
con_data_train2[duplicate]

In [None]:
#### for train ####

# Drop duplicated rows in place, keeping the first occurrence of each.
con_data_train2.drop_duplicates(inplace=True)

In [None]:
#### for val ####

# Count the fully duplicated rows of the validation split and display them.
duplicate = con_data_val.duplicated()
print(duplicate.sum())
con_data_val[duplicate]

In [None]:
#### for val ####

# Drop duplicated rows in place, keeping the first occurrence of each.
con_data_val.drop_duplicates(inplace=True)

In [None]:
#### for test ####

# Count the fully duplicated rows of the test split and display them.
duplicate = con_data_test.duplicated()
print(duplicate.sum())
con_data_test[duplicate]

In [None]:
#### for test ####

# Drop duplicated rows in place, keeping the first occurrence of each.
con_data_test.drop_duplicates(inplace=True)

In [None]:
#### drop rows with nulls ####

# Nothing is imputed here: every row that still contains at least one missing
# value is discarded, separately for each split.
con_data_train2, con_data_val, con_data_test = [
    split_frame.dropna()
    for split_frame in (con_data_train2, con_data_val, con_data_test)
]

In [None]:
# Summary statistics of the training split, one row per column.
con_data_train2.describe().transpose()

In [None]:
# Frequency of each passenger_count value in the training split.
con_data_train2['passenger_count'].value_counts()

In [None]:
# Frequency of each trip_distance value in the training split.
con_data_train2['trip_distance'].value_counts()

In [None]:
# Ten largest trip distances in the training split (used to spot outliers).
con_data_train2['trip_distance'].nlargest(10)

In [None]:
# Frequency of each fare_amount value in the training split.
con_data_train2['fare_amount'].value_counts()

In [None]:
# Frequency of each payment_type value in the training split.
con_data_train2['payment_type'].value_counts()

In [None]:
# Frequency of each total_amount value in the training split.
con_data_train2['total_amount'].value_counts()

In [None]:
#### Remove implausible trips ####

def _keep_plausible_trips(frame):
    """Return only the rows of ``frame`` that pass every sanity filter.

    A row is kept when passenger_count is strictly between 0 and 7,
    trip_distance is in (0, 200], fare_amount and total_amount are both
    positive, and payment_type is at most 4.
    """
    passengers_ok = (frame['passenger_count'] > 0) & (frame['passenger_count'] < 7)
    distance_ok = (frame['trip_distance'] > 0) & (frame['trip_distance'] <= 200)
    amounts_ok = (frame['fare_amount'] > 0) & (frame['total_amount'] > 0)
    payment_ok = frame['payment_type'] <= 4
    return frame[passengers_ok & distance_ok & amounts_ok & payment_ok]


# The same filters are applied to each split.
con_data_train2 = _keep_plausible_trips(con_data_train2)
con_data_val = _keep_plausible_trips(con_data_val)
con_data_test = _keep_plausible_trips(con_data_test)

In [None]:
# Column dtypes and non-null counts of the training split after cleaning.
con_data_train2.info()

In [None]:
# Column dtypes and non-null counts of the validation split after cleaning.
con_data_val.info()

In [None]:
# Column dtypes and non-null counts of the test split after cleaning.
con_data_test.info()

---
## Get Dummies 

In [None]:
# (rows, columns) of the training split before one-hot encoding.
con_data_train2.shape

In [None]:
# (rows, columns) of the validation split before one-hot encoding.
con_data_val.shape

In [None]:
# (rows, columns) of the test split before one-hot encoding.
con_data_test.shape

In [None]:
# dtype of the store_and_fwd_flag column in the training split.
con_data_train2['store_and_fwd_flag'].dtypes

In [None]:
# Frequency of each RatecodeID value in the training split.
con_data_train2['RatecodeID'].value_counts()

In [None]:
# Frequency of each RatecodeID value in the validation split.
con_data_val['RatecodeID'].value_counts()

In [None]:
# Frequency of each RatecodeID value in the test split.
con_data_test['RatecodeID'].value_counts()

In [None]:
# Cast RatecodeID to a categorical dtype in every split so that
# pd.get_dummies will one-hot encode it instead of treating it as numeric.
for split_frame in (con_data_train2, con_data_val, con_data_test):
    split_frame['RatecodeID'] = split_frame['RatecodeID'].astype('category')

In [None]:
# Frequency of each payment_type value in the training split.
con_data_train2['payment_type'].value_counts()

In [None]:
# Frequency of each payment_type value in the validation split.
con_data_val['payment_type'].value_counts()

In [None]:
# Frequency of each payment_type value in the test split.
con_data_test['payment_type'].value_counts()

In [None]:
# Cast payment_type to a categorical dtype in every split so that
# pd.get_dummies will one-hot encode it instead of treating it as numeric.
for split_frame in (con_data_train2, con_data_val, con_data_test):
    split_frame['payment_type'] = split_frame['payment_type'].astype('category')

In [None]:
# One-hot encode every split, using the training split's columns as the
# reference layout. Encoding each split independently produces different
# column sets whenever a category is absent from one split, which later breaks
# the shared feature selection and the model's expected input. Reindexing
# val/test to the training columns adds such missing dummies as all-zero
# columns and drops dummies the training split never produced.

# get dummies for train
con_data_train2 = pd.get_dummies(con_data_train2)

# get dummies for val, aligned to the training columns
con_data_val = pd.get_dummies(con_data_val).reindex(columns=con_data_train2.columns, fill_value=0)

# get dummies for test, aligned to the training columns
con_data_test = pd.get_dummies(con_data_test).reindex(columns=con_data_train2.columns, fill_value=0)

In [None]:
# (rows, columns) of the training split after one-hot encoding.
con_data_train2.shape

In [None]:
# (rows, columns) of the validation split after one-hot encoding.
con_data_val.shape

In [None]:
# (rows, columns) of the test split after one-hot encoding.
con_data_test.shape

In [None]:
# Column names of the training split after one-hot encoding.
con_data_train2.columns

In [None]:
# Column names of the validation split after one-hot encoding.
con_data_val.columns

In [None]:
# Column names of the test split after one-hot encoding.
con_data_test.columns

In [None]:
# Give the one-hot columns readable names. The same mapping is applied in
# place to every split; keys that are not present in a frame are ignored by
# DataFrame.rename.
# NOTE(review): the 'RatecodeID_1.0' / 'payment_type_1.0' keys assume the codes
# were stored as floats before encoding - confirm against the column listing.
dummy_names = {
    'store_and_fwd_flag_N': 'not_a_store_and_forward_trip',
    'store_and_fwd_flag_Y': 'store_and_forward_trip',
    'RatecodeID_1.0': 'standard_rate',
    'RatecodeID_2.0': 'JFK',
    'RatecodeID_3.0': 'newark',
    'RatecodeID_4.0': 'nassau_or_westchester',
    'RatecodeID_5.0': 'negotiated_fare',
    'payment_type_1.0': 'credit_card',
    'payment_type_2.0': 'cash',
    'payment_type_3.0': 'no_charge',
    'payment_type_4.0': 'dispute',
}
for split_frame in (con_data_train2, con_data_val, con_data_test):
    split_frame.rename(columns=dummy_names, inplace=True)

In [None]:
# Seven randomly chosen rows of the training split (different on every run).
con_data_train2.sample(7)

In [None]:
# Seven randomly chosen rows of the validation split (different on every run).
con_data_val.sample(7)

In [None]:
# Seven randomly chosen rows of the test split (different on every run).
con_data_test.sample(7)

---
## Sample data for modeling

In [None]:
# Columns used as model inputs; the identical selection is taken from every
# split so the three feature frames share one layout.
model_features = [
    'passenger_count', 'improvement_surcharge', 'congestion_surcharge',
    'week_day_pickup', 'hour_pickup', 'work_day', 'trip_distance',
    'store_and_forward_trip',
    'standard_rate', 'JFK', 'newark', 'nassau_or_westchester', 'negotiated_fare',
    'credit_card', 'cash', 'no_charge', 'dispute',
]
train_sample = con_data_train2[model_features]
val_sample = con_data_val[model_features]
test_sample = con_data_test[model_features]

---
## Visualize data

In [None]:
# Density histogram (with a KDE overlay) of the training fares.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(con_data_train2['fare_amount'], bins=75, color='#533e98',
             stat='density', kde=True, ax=ax)
ax.set_title('Fare Distribution');
ax.set_xlabel('Fare Amount');
ax.grid(axis='y', lw=0.25);
# plt.savefig('plot1.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Add the natural log of fare_amount as a new column of the training split
# (used for the distribution plot and the correlation analysis).
con_data_train2["log_fare_amount"] = np.log(con_data_train2["fare_amount"])

In [None]:
# Distribution of log(fare_amount) with a dashed line at its mean.
plt.figure(figsize = (8,5))
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' draws the equivalent histogram plus density curve.
sns.histplot(con_data_train2["log_fare_amount"], color='#533e98', kde=True, stat='density')
plt.axvline(con_data_train2["log_fare_amount"].mean(),color = "k",
            linestyle = "dashed",label = "Avg fare amount")
plt.title("Distribution in log of fare amount")
plt.legend(loc = "best",prop = {"size" : 12});
# plt.savefig('plot2.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Fare against trip distance.
fig1, ax1 = plt.subplots(figsize=(8, 6))
ax1.scatter(con_data_train2['trip_distance'], con_data_train2['fare_amount'],
            color='#533e98', alpha=0.1)
ax1.set_title('The graph of payment depending on the trip distance')
ax1.set_xlabel('Distance')
ax1.set_ylabel('Payment');

# Fare against number of passengers.
fig2, ax2 = plt.subplots(figsize=(8, 6))
ax2.scatter(con_data_train2['passenger_count'], con_data_train2['fare_amount'],
            color='#c15a3a', alpha=0.1)
ax2.set_title('The graph of payment depending on the number of passengers');
ax2.set_xlabel('Number of passengers')
ax2.set_ylabel('Payment');
# plt.savefig('plot3.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Frequency of each passenger_count value in the cleaned training split.
con_data_train2['passenger_count'].value_counts()

In [None]:
# passenger count in trips distribution
plt.figure(figsize=(9, 19))
pass_count = con_data_train2['passenger_count'].value_counts()
c = ['#6f5e8f', '#c15a3a', '#ade5e1', '#11415f', '#dfb08b', '#92576e']
# Build the legend from the actual value_counts index so each wedge is always
# labelled with its own passenger count, whatever order the frequencies
# happen to come out in.
count_names = {1: 'One person', 2: 'Two persons', 3: 'Three persons',
               4: 'Four persons', 5: 'Five persons', 6: 'Six persons'}
legend_labels = [count_names.get(int(count), f'{int(count)} persons')
                 for count in pass_count.index]
plt.pie(pass_count,labels=None
        , autopct="%0.1f%%", pctdistance=1.15, colors=c);
plt.legend(title = 'Passenger count:',
           labels=legend_labels);
# plt.savefig('plot4.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# taxi trip repartition by hour of the day
# catplot is a figure-level function that creates its own figure (sized via
# height/aspect), so no plt.figure() call is made here - a preceding
# plt.figure() would only leave an extra empty figure behind.
sns.catplot(x='hour_pickup', kind='count', palette='icefire', data=con_data_train2, height=3, aspect=3);
plt.title('Hour of Day');
# plt.savefig('plot5.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Total amount by pickup hour, one line per pickup month.
plt.figure(figsize=(8,6));
sns.lineplot(data = con_data_train2, x='hour_pickup',y='total_amount',palette=['#6f5e8f', '#c15a3a'], hue='month_pickup')
plt.xticks(np.arange(0, 24, 1))
# NOTE(review): the legend labels are hard-coded and assume the first two
# legend handles are the January and February lines, in that order - confirm
# against the rendered plot.
plt.legend(title = 'Pickup Months:',
           labels=['January', 'February']);
plt.xlabel('Pick Up Hours')
plt.ylabel('Total Amount')
# Title typo fixed ("Efficting on" -> "Affecting").
plt.title('Rush Hour of Day Affecting Payment in January and February');
plt.grid(axis='both', lw = 0.25);
# plt.savefig('plot6.png', dpi = 300, bbox_inches = 'tight');

1. Does the number of passengers affect the fare?

In [None]:
# Histogram of passenger counts in the training split.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(con_data_train2['passenger_count'], bins=100, color='#533e98')
ax.set_xlabel('No. of Passengers')
ax.set_ylabel('Frequency');
# plt.savefig('plot7.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Fare against passenger count, one translucent dot per training trip.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x=con_data_train2['passenger_count'], y=con_data_train2['fare_amount'],
           s=10, color='#c15a3a', alpha=0.2)
ax.set_xlabel('No. of Passengers')
ax.set_ylabel('Fare');
# plt.savefig('plot8.png', dpi = 300, bbox_inches = 'tight');

2. Does the time of pickup affect the fare?

In [None]:
# Histogram of pickup hours in the training split.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(con_data_train2['hour_pickup'], bins=100, color='#533e98')
ax.set_xlabel('Pickup Hour')
ax.set_ylabel('Frequency');
# plt.savefig('plot9.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Fare against pickup hour, one translucent dot per training trip.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x=con_data_train2['hour_pickup'], y=con_data_train2['fare_amount'],
           s=10, color='#c15a3a', alpha=0.2)
ax.set_xlabel('Pickup Hour')
ax.set_ylabel('Fare');
# plt.savefig('plot10.png', dpi = 300, bbox_inches = 'tight');

3. Does the day of the week affect the fare?

In [None]:
# Histogram of pickup weekdays, with the numeric codes labelled Mon..Sun.
positions = (0, 1, 2, 3, 4, 5, 6)
labels = ('Mon','Tue','Wed','Thu','Fri','Sat','Sun')
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xticks(positions)
ax.set_xticklabels(labels);
ax.hist(con_data_train2['week_day_pickup'], bins=100, color='#533e98')
ax.set_xlabel('Day pickup of Week')
ax.set_ylabel('Frequency');
# plt.savefig('plot11.png', dpi = 300, bbox_inches = 'tight');

The day of the week doesn't seem to have much influence on the number of cab rides.

In [None]:
# Fare against pickup weekday, with the numeric codes labelled Mon..Sun.
positions = (0, 1, 2, 3, 4, 5, 6)
labels = ('Mon','Tue','Wed','Thu','Fri','Sat','Sun')
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xticks(positions)
ax.set_xticklabels(labels);
ax.scatter(x=con_data_train2['week_day_pickup'], y=con_data_train2['fare_amount'],
           s=10, color='#c15a3a', alpha=0.2)
ax.set_xlabel('Day pickup of Week')
ax.set_ylabel('Fare');
# plt.savefig('plot12.png', dpi = 300, bbox_inches = 'tight');

---
## Relation Between Features

In [None]:
plt.rcParams["figure.figsize"] = (20,18);

# Pairwise correlations between all columns of the training split.
con_data_corr = con_data_train2.corr()

# Boolean mask hiding the upper triangle (including the diagonal).
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones_like(con_data_corr, dtype=bool))

# Trim the first row and last column (both are fully masked) from the mask
# and from the data, keeping the two the same shape.
mask = mask[1:, :-1]
corr = con_data_corr.iloc[1:,:-1].copy()

sns.heatmap(corr, cmap = 'icefire', annot = True, vmin= -1, vmax= 1, linewidths=1.5, fmt='.2f', mask=mask);
plt.title('CORRELATION BETWEEN FEATURES\n', loc='left', fontsize=18);
# plt.savefig('plot13.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Correlation of every column with the target. The target itself and its log
# transform are dropped BY NAME: a positional `[:-1]` only removes whatever
# column happens to be last, and leaves fare_amount's trivial self-correlation
# of 1.0 in the "strongly correlated" list.
target_corr = con_data_train2.corr()['fare_amount']
con_data_corr = target_corr.drop(labels=['fare_amount', 'log_fare_amount'], errors='ignore')
corr_features = con_data_corr[abs(con_data_corr) > 0.5].sort_values(ascending=False)
print('Strongly correlated features with fare amount:\n{}'.format(corr_features))

---
## Scaling
Scaling the features makes the regression coefficients easier to interpret.

In [None]:
# (rows, columns) of the validation split.
con_data_val.shape

In [None]:
# (rows, columns) of the training split.
con_data_train2.shape

In [None]:
# (rows, columns) of the test split.
con_data_test.shape

In [None]:
# Standardize the model features: the scaler's statistics are learned from the
# training features only and then reused unchanged for validation and test.
scaler = StandardScaler()

feature_names = train_sample.columns
train_sample[feature_names] = scaler.fit_transform(train_sample[feature_names])
for held_out in (val_sample, test_sample):
    held_out[held_out.columns] = scaler.transform(held_out[held_out.columns])



In [None]:
# Model inputs are the *_sample feature frames; the target of each split is
# its fare_amount column.
X_train, y_train = train_sample, con_data_train2['fare_amount']
X_val, y_val = val_sample, con_data_val['fare_amount']
X_test, y_test = test_sample, con_data_test['fare_amount']

In [None]:
# Sanity check: each feature frame must have as many rows as its target.
print("Length of the X_train = ",len(X_train))
print("Length of the y_train = ",len(y_train))
print("Length of the X_test = ",len(X_test))
print("Length of the y_test = ",len(y_test))
# Report X_val as well; the y_val line used to be printed twice instead.
print("Length of the X_val = ",len(X_val))
print("Length of the y_val = ",len(y_val))

---
## Model Building

In [None]:
# Ordinary least squares baseline: fit on the training split, predict all
# three splits and report R^2 for each.
seed = 199  # not used by anything in this cell
lm = LinearRegression()
lm1 = lm.fit(X_train, y_train)  # fit() returns the estimator itself, so lm1 is lm

y_pred_val, y_pred_train, y_pred_test = (
    lm1.predict(features) for features in (X_val, X_train, X_test)
)

for caption, features, target in (
    ("R-sq of training set = ", X_train, y_train),
    ("R-sq of validation set = ", X_val, y_val),
    ("R-sq of Test set = ", X_test, y_test),
):
    print(caption, lm1.score(features, target))

In [None]:
# Joint plot (with a regression fit) of the linear model's training
# predictions against the true training fares.
pred = lm.predict(X_train) 
sns.jointplot(x= pred, y= y_train, kind='reg', color='#533e98');
# plt.savefig('plot14.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Test-set metrics for the linear regression model.
from sklearn import metrics
print('\nLinear Regression Performance Metrics')
# The value labelled R^2 must come from r2_score (coefficient of
# determination). explained_variance_score is a different metric: it ignores
# any systematic offset of the predictions, so it can overstate R^2.
print('R^2=',metrics.r2_score(y_test,y_pred_test))
print('MAE:',metrics.mean_absolute_error(y_test,y_pred_test))
print('MSE:',metrics.mean_squared_error(y_test,y_pred_test))
print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test,y_pred_test)))

In [None]:
# Yellowbrick prediction-error plot for the linear model.
# NOTE(review): presumably fits/uses `lm` on the training data and plots
# predicted vs. actual fares for the test data - confirm against the
# yellowbrick `prediction_error` documentation.
visualizer = prediction_error(lm, X_train, y_train, X_test, y_test, size=(700, 400),color='y')
# plt.savefig('plot15.png', dpi = 300, bbox_inches = 'tight');

---
## Ridge Regularization

In [None]:
# Ridge (alpha=0.01) fitted on the training split; R^2 on the training split.
lm_model_ridge = Ridge(alpha=0.01).fit(X_train, y_train)
lm_model_ridge.score(X_train, y_train)

In [None]:
# Ridge (alpha=0.01) fitted on the training split; R^2 on the validation split.
lm_model_ridge = Ridge(alpha=0.01).fit(X_train, y_train)
lm_model_ridge.score(X_val, y_val)

In [None]:
# Ridge (alpha=0.01) fitted on the training split; R^2 on the test split.
lm_model_ridge = Ridge(alpha=0.01).fit(X_train, y_train)
lm_model_ridge.score(X_test, y_test)

In [None]:
#Mean Absolute Error (MAE)
def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Both inputs are converted to NumPy arrays first, so elements are compared
    strictly by position. Without this, subtracting two pandas Series aligns
    them on their index labels: with differing labels the mean is taken over
    only the overlapping labels, or is NaN when none overlap.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as
        ``y_true``.

    Returns
    -------
    numpy floating scalar
        Mean of ``|y_pred - y_true|``.
    """
    errors = np.asarray(y_pred) - np.asarray(y_true)
    return np.mean(np.abs(errors))

In [None]:
# Sweep 200 Ridge penalties spaced evenly on a log scale from 1e-2 to 1e2 and
# record the validation MAE of a standardize+Ridge pipeline for each one.
alphalist = 10 ** np.linspace(-2, 2, 200)
err_vec_val = np.zeros(len(alphalist))
err_vec_train = np.zeros(len(alphalist))  # allocated but never filled in this cell

# The raw feature matrices do not change between iterations.
train_matrix = X_train.loc[:, :].values
val_matrix = X_val.loc[:, :].values

for i, curr_alpha in enumerate(alphalist):
    steps = [('standardize', StandardScaler()),
             ('Ridge', Ridge(alpha=curr_alpha))]
    pipe = Pipeline(steps)
    pipe.fit(train_matrix, y_train)

    val_set_pred = pipe.predict(val_matrix)
    err_vec_val[i] = mae(y_val, val_set_pred)

In [None]:
# Validation MAE of the Ridge sweep as a function of log10(alpha).
plt.figure(figsize=(8,6))
plt.plot(np.log10(alphalist), err_vec_val, color='#c15a3a');
# plt.savefig('plot16.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Lowest validation MAE found in the Ridge sweep.
np.min(err_vec_val)

In [None]:
# Ridge alpha with the lowest validation MAE.
alphalist[np.argmin(err_vec_val)]

---
## Lasso Model

In [None]:
# Lasso (alpha=0.01) fitted on the training split; R^2 on the training split.
lm_model_lasso = Lasso(alpha=0.01).fit(X_train, y_train)
lm_model_lasso.score(X_train, y_train)

In [None]:
# Lasso (alpha=0.01) fitted on the training split; R^2 on the validation split.
lm_model_lasso = Lasso(alpha=0.01).fit(X_train, y_train)
lm_model_lasso.score(X_val, y_val)

In [None]:
#  Test data
# This is the Lasso section, so the test score must come from a Lasso model;
# the cell used to fit and score Ridge (copied from the Ridge section).
lm_model_lasso = Lasso(alpha = 0.01)
lm_model_lasso.fit(X_train, y_train)
lm_model_lasso.score(X_test, y_test)

In [None]:
# Sweep 200 Lasso penalties spaced evenly on a log scale from 1e-2 to 1e2 and
# record the validation MAE of a standardize+Lasso pipeline for each one.
alphalist = 10 ** np.linspace(-2, 2, 200)
er_vec_val = np.zeros(len(alphalist))
er_vec_train = np.zeros(len(alphalist))  # allocated but never filled in this cell

# The raw feature matrices do not change between iterations.
train_matrix = X_train.loc[:, :].values
val_matrix = X_val.loc[:, :].values

for i, curr_alpha in enumerate(alphalist):
    steps = [('standardize', StandardScaler()),
             ('Lasso', Lasso(alpha=curr_alpha))]
    pipe = Pipeline(steps)
    pipe.fit(train_matrix, y_train)

    val_set_pred = pipe.predict(val_matrix)
    er_vec_val[i] = mae(y_val, val_set_pred)

In [None]:
# Validation MAE of the Lasso sweep as a function of log10(alpha).
plt.figure(figsize=(8,6))
plt.plot(np.log10(alphalist), er_vec_val, color='#c15a3a');
# plt.savefig('plot17.png', dpi = 300, bbox_inches = 'tight');

In [None]:
# Lowest validation MAE found in the Lasso sweep.
np.min(er_vec_val)

In [None]:
# Lasso alpha with the lowest validation MAE. The Lasso sweep stores its
# errors in `er_vec_val`; `err_vec_val` holds the Ridge sweep's errors and
# would return the best Ridge alpha here instead.
alphalist[np.argmin(er_vec_val)]