<a href="https://colab.research.google.com/github/StephenSheng1101/DMAsgmnt/blob/main/Model_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler

In [15]:
transport = pd.read_csv("dataset.csv")

In [16]:
transport.describe()
transport.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596558 entries, 0 to 2596557
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   travel_fee      float64
 2   departure_time  object 
 3   departure_long  float64
 4   departure_lat   float64
 5   arrival_long    float64
 6   arrival_lat     float64
 7   occupancy       int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 158.5+ MB


In [17]:
transport.isnull().sum()

ID                 0
travel_fee         0
departure_time     0
departure_long     0
departure_lat      0
arrival_long      44
arrival_lat       44
occupancy          0
dtype: int64

In [18]:
#handling missing value
transport["departure_long"].fillna(transport["departure_long"].mean(), inplace=True)
transport["departure_lat"].fillna(transport["departure_lat"].mean(), inplace=True)
transport["occupancy"].fillna(transport["occupancy"].mode()[0], inplace=True)
transport.drop(columns=['ID'], inplace=True)

# Drop rows with missing "departure_time","arrival_long" and "arrival_lat"
transport.dropna(subset=["departure_time", "arrival_long", "arrival_lat"],inplace = True)
transport.isnull().sum()

# Drop rows with a 0 value of occupancy
transport = transport.drop(transport[transport['occupancy'] <= 0].index)

# Handling Outliers
# Select columns for numeric attributes
num_attribs = ['departure_long', 'departure_lat', 'arrival_long', 'arrival_lat', 'occupancy', 'travel_fee']

# Remove outliers using Z-score method
z_scores = np.abs((transport[num_attribs] - transport[num_attribs].mean()) / transport[num_attribs].std())
transport = transport[(z_scores < 3).all(axis=1)]
transport.info()
transport

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2378828 entries, 0 to 2596557
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   travel_fee      float64
 1   departure_time  object 
 2   departure_long  float64
 3   departure_lat   float64
 4   arrival_long    float64
 5   arrival_lat     float64
 6   occupancy       int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 145.2+ MB


Unnamed: 0,travel_fee,departure_time,departure_long,departure_lat,arrival_long,arrival_lat,occupancy
0,7.00,2013-07-02 19:54:00+00:00,-74.005360,40.728867,-74.008913,40.710907,1
1,5.50,2013-09-28 00:21:31+00:00,-74.014165,40.708941,-74.016310,40.716734,1
2,21.50,2013-06-16 03:18:00+00:00,-73.991075,40.760352,-73.941382,40.713292,1
3,9.50,2013-07-20 13:43:00+00:00,-74.002662,40.723630,-73.991722,40.748905,5
4,15.50,2013-11-05 22:57:17+00:00,-73.962397,40.712705,-73.996834,40.680403,2
...,...,...,...,...,...,...,...
2596552,11.00,2013-04-17 10:34:24+00:00,-74.002635,40.760825,-73.978540,40.751177,1
2596554,6.50,2013-12-13 19:04:00+00:00,-73.955435,40.765110,-73.960947,40.756727,1
2596555,41.33,2013-05-25 14:48:39+00:00,-73.862824,40.768935,-73.984470,40.739395,2
2596556,7.50,2013-12-22 20:45:19+00:00,-73.970539,40.761767,-73.982731,40.744562,1


In [19]:
transport.head()

Unnamed: 0,travel_fee,departure_time,departure_long,departure_lat,arrival_long,arrival_lat,occupancy
0,7.0,2013-07-02 19:54:00+00:00,-74.00536,40.728867,-74.008913,40.710907,1
1,5.5,2013-09-28 00:21:31+00:00,-74.014165,40.708941,-74.01631,40.716734,1
2,21.5,2013-06-16 03:18:00+00:00,-73.991075,40.760352,-73.941382,40.713292,1
3,9.5,2013-07-20 13:43:00+00:00,-74.002662,40.72363,-73.991722,40.748905,5
4,15.5,2013-11-05 22:57:17+00:00,-73.962397,40.712705,-73.996834,40.680403,2


In [20]:
# Convert departure_time to datetime
transport['departure_time'] = pd.to_datetime(transport['departure_time'])

# Extract features from departure_time
transport['hour_of_day'] = transport['departure_time'].dt.hour
transport['day_of_week'] = transport['departure_time'].dt.dayofweek

In [21]:
from sklearn.model_selection import train_test_split


train_set, test_set = train_test_split(transport.head(500000), test_size=0.2, random_state=42)

print("Length of train_set:", len(train_set))
print("Length of test_set:", len(test_set))

Length of train_set: 400000
Length of test_set: 100000


In [22]:
travelfee = train_set.drop("travel_fee", axis=1)
travelfee_labels = train_set["travel_fee"].copy()

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

num_attribs = [ 'departure_long', 'departure_lat', 'arrival_long', 'arrival_lat', 'hour_of_day', 'day_of_week','occupancy']

num_pipeline = make_pipeline(
    StandardScaler()  # Scale features
)
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # Apply numeric pipeline to numeric attributes
])


In [24]:
travelfee_prepared = preprocessing.fit_transform(travelfee)
travelfee_prepared

array([[-0.28420307, -1.21082781, -0.53654539, ...,  0.37792462,
         1.51306249,  3.18207706],
       [ 0.04477485,  0.21021648, -0.63257574, ...,  1.29241509,
         1.0017459 , -0.48964476],
       [-0.45958577, -0.58837285, -0.14115703, ...,  1.29241509,
         1.0017459 , -0.48964476],
       ...,
       [-0.07390843,  0.27601313, -0.10970653, ...,  1.14000001,
        -0.02088728, -0.48964476],
       [-0.29254122,  0.09154919, -0.07704352, ...,  0.98758493,
        -1.55483706, -0.48964476],
       [-0.30677022, -0.74234167, -0.01902985, ..., -0.2317357 ,
         1.51306249, -0.48964476]])

In [25]:
preprocessing.get_feature_names_out()

array(['num__departure_long', 'num__departure_lat', 'num__arrival_long',
       'num__arrival_lat', 'num__hour_of_day', 'num__day_of_week',
       'num__occupancy'], dtype=object)

In [27]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# Define features and target variables
features = ["departure_long", "departure_lat", "arrival_long", "arrival_lat", "occupancy", "hour_of_day", "day_of_week"]
X_train = travelfee[features]
y_train = travelfee_labels
X_test = test_set[features]
y_test = test_set["travel_fee"]

# Initialize the SVR model
svr_model = SVR(kernel='linear', C=10.0)

# Train the model with tqdm
with tqdm(total=len(X_train)) as pbar:
    for i in range(len(X_train)):
        svr_model.fit(X_train.iloc[[i]], [y_train.iloc[i]])
        pbar.update(1)

# Predict on the test set
y_pred = svr_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

100%|█████████████████████████████████████████████████████████████████████████| 400000/400000 [29:27<00:00, 226.31it/s]

Mean Squared Error: 57.327363356999996



