<a href="https://colab.research.google.com/github/StephenSheng1101/DMAsgmnt/blob/main/Ridge_Regression_Model_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Ridge Regression**


**Data Preprocessing before begin with Ridge Regression**

In [63]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler

In [64]:
transport = pd.read_csv("dataset.csv")
transport.describe()
transport.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596558 entries, 0 to 2596557
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   travel_fee      float64
 2   departure_time  object 
 3   departure_long  float64
 4   departure_lat   float64
 5   arrival_long    float64
 6   arrival_lat     float64
 7   occupancy       int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 158.5+ MB


In [65]:
transport.isnull().sum()

ID                 0
travel_fee         0
departure_time     0
departure_long     0
departure_lat      0
arrival_long      44
arrival_lat       44
occupancy          0
dtype: int64

In [66]:
#handling missing value
transport["departure_long"].fillna(transport["departure_long"].mean(), inplace=True)
transport["departure_lat"].fillna(transport["departure_lat"].mean(), inplace=True)
transport["occupancy"].fillna(transport["occupancy"].mode()[0], inplace=True)
transport.drop(columns=['ID'], inplace=True)

# Drop rows with missing "departure_time","arrival_long" and "arrival_lat"
transport.dropna(subset=["departure_time", "arrival_long", "arrival_lat"],inplace = True)
transport.isnull().sum()

# Drop rows with a 0 value of occupancy
transport = transport.drop(transport[transport['occupancy'] <= 0].index)

# Handling Outliers
# Select columns for numeric attributes
num_attribs = ['departure_long', 'departure_lat', 'arrival_long', 'arrival_lat', 'occupancy', 'travel_fee']

# Remove outliers using Z-score method
z_scores = np.abs((transport[num_attribs] - transport[num_attribs].mean()) / transport[num_attribs].std())
transport = transport[(z_scores < 3).all(axis=1)]
transport.info()
transport

# Filter out rows with 0 values in departure_long, departure_lat, arrival_long, and arrival_lat
transport = transport[
    (transport["departure_long"] != 0) &
    (transport["departure_lat"] != 0) &
    (transport["arrival_long"] != 0) &
    (transport["arrival_lat"] != 0)
]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2378828 entries, 0 to 2596557
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   travel_fee      float64
 1   departure_time  object 
 2   departure_long  float64
 3   departure_lat   float64
 4   arrival_long    float64
 5   arrival_lat     float64
 6   occupancy       int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 145.2+ MB


In [67]:
from sklearn.metrics.pairwise import euclidean_distances
# Convert departure_time to datetime
transport['departure_time'] = pd.to_datetime(transport['departure_time'])

# Extract features from departure_time
transport['hour_of_day'] = transport['departure_time'].dt.hour
transport['day_of_week'] = transport['departure_time'].dt.dayofweek

# Calculate Euclidean distance between departure and arrival locations
transport['distance'] = transport.apply(lambda row: euclidean_distances([[row['departure_lat'], row['departure_long']]],
                                                                        [[row['arrival_lat'], row['arrival_long']]])[0][0], axis=1)

In [68]:
transport.info()
transport.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2378828 entries, 0 to 2596557
Data columns (total 10 columns):
 #   Column          Dtype              
---  ------          -----              
 0   travel_fee      float64            
 1   departure_time  datetime64[ns, UTC]
 2   departure_long  float64            
 3   departure_lat   float64            
 4   arrival_long    float64            
 5   arrival_lat     float64            
 6   occupancy       int64              
 7   hour_of_day     int64              
 8   day_of_week     int64              
 9   distance        float64            
dtypes: datetime64[ns, UTC](1), float64(6), int64(3)
memory usage: 199.6 MB


Unnamed: 0,travel_fee,departure_time,departure_long,departure_lat,arrival_long,arrival_lat,occupancy,hour_of_day,day_of_week,distance
0,7.0,2013-07-02 19:54:00+00:00,-74.00536,40.728867,-74.008913,40.710907,1,19,1,0.018308
1,5.5,2013-09-28 00:21:31+00:00,-74.014165,40.708941,-74.01631,40.716734,1,0,5,0.008083
2,21.5,2013-06-16 03:18:00+00:00,-73.991075,40.760352,-73.941382,40.713292,1,3,6,0.06844
3,9.5,2013-07-20 13:43:00+00:00,-74.002662,40.72363,-73.991722,40.748905,5,13,5,0.027541
4,15.5,2013-11-05 22:57:17+00:00,-73.962397,40.712705,-73.996834,40.680403,2,22,1,0.047216


In [69]:
from sklearn.model_selection import train_test_split
sample_size = 2000000
transport = transport.sample(n=sample_size)

#travelfee = transport[['hour_of_day', 'day_of_week','occupancy','distance']]
travelfee = transport[['occupancy','distance']]
travelfee_labels = transport["travel_fee"].copy()

travelfee_train, travelfee_test, travelfee_labels_train, travelfee_labels_test = train_test_split(travelfee, travelfee_labels, test_size=0.2, random_state=42)

print("Length of travelfee_train:", len(travelfee_train))
print("Length of travelfee_test:", len(travelfee_test))
print("Length of travelfee_labels_train:", len(travelfee_labels_train))
print("Length of travelfee_labels_test:", len(travelfee_labels_test))

Length of travelfee_train: 1600000
Length of travelfee_test: 400000
Length of travelfee_labels_train: 1600000
Length of travelfee_labels_test: 400000


In [70]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
#num_attribs = [ 'hour_of_day', 'day_of_week','occupancy','distance' ]
num_attribs = ['occupancy','distance' ]

num_pipeline = make_pipeline(
    StandardScaler()  # Scale features
)
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # Apply numeric pipeline to numeric attributes
])

In [71]:
transport_prepared = preprocessing.fit_transform(transport)
transport_prepared

array([[ 3.17759017, -0.06081682],
       [-0.48966524, -0.21816246],
       [-0.48966524,  0.04647979],
       ...,
       [ 3.17759017, -0.1219065 ],
       [ 0.42714862, -0.11966608],
       [ 3.17759017,  0.24800664]])

**Linear Regression**


In [72]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
X2 = transport_prepared  # Features
y2 = transport['travel_fee']  # Target

In [73]:
# Split data into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [74]:
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()
mlr.fit(X2, y2)

In [75]:
y_pred_linear = mlr.predict(X2)
y_pred_linear [0:5]

array([11.48843635, 10.76226008, 11.35392516, 10.76415578, 11.58298105])

In [76]:
mlr.intercept_

11.300000235000004

In [77]:
mlr.coef_

array([0.10209161, 2.23571662])

In [78]:
mlr_score = cross_val_score(mlr, X2_train, y2_train, scoring=None,verbose=2)
print(mlr_score)
print(mlr_score.mean(),mlr_score.std())

[CV] END .................................................... total time=   0.4s
[CV] END .................................................... total time=   0.5s
[CV] END .................................................... total time=   0.5s
[CV] END .................................................... total time=   0.4s
[CV] END .................................................... total time=   0.4s
[0.10136178 0.11571261 0.05498656 0.03898972 0.1129861 ]
0.08480735328194236 0.031659944665727094


**Select and Train a Model on the Prepared Training Set**

**Ridge Regression**

Ridge Regression is a regularization technique used to prevent overfitting in linear regression models by adding a penalty term to the cost function based on the L2 norm of the coefficients. Let's continue with implementing Ridge Regression the preprocessed data.

In [79]:
from sklearn.linear_model import Ridge
# Create and train Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X2_train, y2_train)

In [80]:
# Predict travel fees on test set
y_pred_ridge = ridge_model.predict(X2_test)
print("predicted: ",y_pred_ridge)

predicted:  [10.72197792 10.83703384 11.39372977 ... 11.44218806 11.94864451
 11.23437217]


In [81]:
ridge_score = cross_val_score(mlr, X2_train, y2_train, cv=5,verbose=2)
print("Ridge Score: ", ridge_score)
print(ridge_score.mean(),ridge_score.std())

[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.3s
[CV] END .................................................... total time=   0.3s
Ridge Score:  [0.10136178 0.11571261 0.05498656 0.03898972 0.1129861 ]
0.08480735328194236 0.031659944665727094


# Fine-tune Ridge Regression model (shortlisted promising models) using crossvalidation.


In [82]:
from sklearn.model_selection import GridSearchCV

parameters = {"fit_intercept":[True, False],'n_jobs':[None,1],'positive':[True,False]}

mlr = LinearRegression()
mlr_cv = GridSearchCV(estimator = mlr, param_grid = parameters ,cv = 10)
mlr_cv.fit(X2,y2)
print("Tuner hyperparameters: (best parameters)", mlr_cv.best_params_)
print("Accuracy", mlr_cv.best_score_)
best_mlr = mlr_cv.best_estimator_

Tuner hyperparameters: (best parameters) {'fit_intercept': True, 'n_jobs': None, 'positive': True}
Accuracy 0.08453209215490146


In [83]:
parameters = {"alpha":[0.001,0.01,0.02,0.03,0.04,0.05,0.06,0.08,1,2,3,5,8,10,20,50,100]}

ridge_cv = GridSearchCV(Ridge(),parameters,scoring='neg_mean_squared_error',cv = 10)
ridge_cv.fit(X2,y2)
print("Tuner hyperparameters: (best parameters)", ridge_cv.best_params_)
print("Accuracy", ridge_cv.best_score_)

best_ridge = ridge_cv.best_estimator_

Tuner hyperparameters: (best parameters) {'alpha': 100}
Accuracy -48.396021459563784


In [84]:
from sklearn.ensemble import VotingRegressor

voting_reg = VotingRegressor(
    estimators=[('mlr', best_mlr),('ridge', best_ridge)]
)
voting_reg.fit(X2,y2)

In [85]:
voting_reg.fit(X2,y2)

Evaluate on the Test Set for both Models

In [86]:
mlr = LinearRegression(fit_intercept=False, n_jobs= None, positive= False)
mlr.fit(X2, y2)
y_pred_linear = mlr_cv.predict(X2_test)

In [87]:
print(pd.DataFrame({'Actual Value':y2_test,'Predicted Values':y_pred_linear}))

         Actual Value  Predicted Values
726465            5.0         10.694937
505436            7.0         10.815868
114607           16.5         11.400990
1190165           9.5         10.945999
867302            9.0         10.867652
...               ...               ...
63392            28.5         13.447876
1411811           4.0         10.668664
485423           18.5         11.451922
505487           15.0         11.984239
1948224          12.5         11.233495

[400000 rows x 2 columns]


In [88]:
ridge = Ridge(alpha=100)
ridge.fit(X2,y2)
y_pred_ridge = ridge_cv.predict(X2_test)

In [89]:
print(pd.DataFrame({'Actual Value':y2_test,'Predicted Values':y_pred_ridge}))

         Actual Value  Predicted Values
726465            5.0         10.694967
505436            7.0         10.815892
114607           16.5         11.400984
1190165           9.5         10.946016
867302            9.0         10.867673
...               ...               ...
63392            28.5         13.447769
1411811           4.0         10.668696
485423           18.5         11.451914
505487           15.0         11.984204
1948224          12.5         11.233498

[400000 rows x 2 columns]
