<a href="https://colab.research.google.com/github/Dhanush-adk/machine_learning/blob/main/project/ML_Model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
#Use the URL for the raw CSV data
url = 'https://raw.githubusercontent.com/Dhanush-adk/machine_learning/main/project/data_preprocessed.csv'
df = pd.read_csv(url)

# Display the first 5 rows of the DataFrame
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,hour,weekday,month,year,distance
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,7,19,3,5,2015,1.683323
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,17,20,4,7,2009,2.45759
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,24,21,0,8,2009,5.036377
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,26,8,4,6,2009,1.661683
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,28,17,3,8,2014,4.47545


In [3]:
X = df.drop('fare_amount',axis=1)
y = df['fare_amount']

In [41]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Linear Regression(without standardizing the inputs)

In [42]:
model = LinearRegression()
model.fit(X_train, y_train)

In [43]:
model.score(X_train, y_train),  model.score(X_test, y_test)

(0.6885242492290915, 0.6186443704275573)

In [44]:
y_pred = model.predict(X_test)

In [45]:
mse = mean_squared_error(y_test, y_pred)
mse

0.39761706868684177

In [46]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2

(0.2521296196952796, 0.6186443704275573)

In [47]:
rmse = np.sqrt(mse)
rmse

0.6305688453189245

In [48]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#Using MinMax Scaler to normalize the inputs

In [49]:
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

In [50]:
X = df.drop('fare_amount',axis=1)
y = df[['fare_amount']]

In [51]:
X = pd.DataFrame(scaler_x.fit_transform(X), columns=X.columns, index=X.index).values
y = pd.DataFrame(scaler_y.fit_transform(y), columns=y.columns, index=y.index).values

In [52]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
model = LinearRegression()
model.fit(X_train, y_train)

In [54]:
model.score(X_train, y_train),  model.score(X_test, y_test)

(0.6885242492290917, 0.6186443704275568)

In [55]:
y_pred = model.predict(X_test)

In [56]:
mse = mean_squared_error(y_test, y_pred)
mse

0.00015591984351807426

In [57]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2

(0.00499277568129621, 0.6186443704275568)

In [58]:
rmse = np.sqrt(mse)
rmse

0.012486786757131486

#Using Standard Scaler to normalize the inputs

In [59]:
scaler_x = StandardScaler()
scaler_y = StandardScaler()

In [60]:
X = df.drop('fare_amount',axis=1)
y = df[['fare_amount']]

In [61]:
X = pd.DataFrame(scaler_x.fit_transform(X), columns=X.columns, index=X.index).values
y = pd.DataFrame(scaler_y.fit_transform(y), columns=y.columns, index=y.index).values

In [62]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
model = LinearRegression()
model.fit(X_train, y_train)

In [64]:
model.score(X_train, y_train),  model.score(X_test, y_test)

(0.6885242492290915, 0.6186443704275573)

In [65]:
y_pred = model.predict(X_test)

In [66]:
mse = mean_squared_error(y_test, y_pred)
mse

0.39761706868684177

In [67]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae, r2

(0.2521296196952796, 0.6186443704275573)

In [68]:
rmse = np.sqrt(mse)
rmse

0.6305688453189245

#Regularization model(Ridge Regression)

In [69]:
from sklearn.linear_model import Ridge
rid = Ridge()
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.688208669393279, 0.6183417185078421)

In [70]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2

(0.3979326260296938, 0.25184038856211594, 0.6183417185078421)

In [71]:
rmse = np.sqrt(mse)
rmse

0.6308190121022779

Using Regularization term (lambda = 5)

In [72]:
from sklearn.linear_model import Ridge
rid = Ridge(alpha = 5.0)
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.6874688613976456, 0.6176425872553735)

In [73]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2

(0.3986615690363714, 0.2519509772511201, 0.6176425872553735)

In [74]:
rmse = np.sqrt(mse)
rmse

0.6313965228256895

Using Regularization term (lambda = 10)

In [75]:
from sklearn.linear_model import Ridge
rid = Ridge(alpha = 10.0)
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.6872108146688205, 0.6173999162977113)

In [76]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2

(0.39891458776051936, 0.252032396286686, 0.6173999162977113)

In [77]:
rmse = np.sqrt(mse)
rmse

0.6315968554074025

Using Regularization term (lambda = 0.1)

In [78]:
from sklearn.linear_model import Ridge
rid = Ridge(alpha = 0.1)
rid.fit(X_train, y_train)
rid.score(X_train, y_train), rid.score(X_test, y_test)

(0.6885157631567227, 0.6186353351321203)

In [79]:
y_pred = rid.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, mae, r2

(0.397626489257584, 0.25205207854299894, 0.6186353351321203)

In [80]:
rmse = np.sqrt(mse)
rmse

0.6305763151733372