<a href="https://colab.research.google.com/github/StephenSheng1101/DMAsgmnt/blob/main/Model_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler

In [28]:
# Load dataset.csv (path is relative to the notebook's working directory)
# into the `transport` DataFrame that the rest of the notebook works on.
transport = pd.read_csv("dataset.csv")

In [29]:
# info() prints its report as a side effect, so it can run anywhere in the cell.
# describe() only returns a table; a notebook renders just the cell's final
# expression, so it has to come last or its result is silently discarded.
transport.info()
transport.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596558 entries, 0 to 2596557
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   travel_fee      float64
 2   departure_time  object 
 3   departure_long  float64
 4   departure_lat   float64
 5   arrival_long    float64
 6   arrival_lat     float64
 7   occupancy       int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 158.5+ MB


In [30]:
# Count missing values per column.
transport.isna().sum()

ID                 0
travel_fee         0
departure_time     0
departure_long     0
departure_lat      0
arrival_long      44
arrival_lat       44
occupancy          0
dtype: int64

In [31]:
# Handle missing values.
# Impute by plain column assignment: calling fillna(..., inplace=True) on a
# selected column is chained assignment and is not guaranteed to write back
# into `transport` when pandas copy-on-write is active.
transport["departure_long"] = transport["departure_long"].fillna(transport["departure_long"].mean())
transport["departure_lat"] = transport["departure_lat"].fillna(transport["departure_lat"].mean())
transport["occupancy"] = transport["occupancy"].fillna(transport["occupancy"].mode()[0])

# Remove the ID column. errors="ignore" avoids a KeyError if this cell is
# executed again after ID has already been removed.
transport = transport.drop(columns=["ID"], errors="ignore")

# Drop rows with missing "departure_time", "arrival_long" or "arrival_lat"
transport = transport.dropna(subset=["departure_time", "arrival_long", "arrival_lat"])

# Keep only rows with a positive occupancy (drops zero and negative values)
transport = transport[transport["occupancy"] > 0]

# Handling Outliers
# Select columns for numeric attributes
num_attribs = ['departure_long', 'departure_lat', 'arrival_long', 'arrival_lat', 'occupancy', 'travel_fee']

# Remove outliers using Z-score method: keep a row only if every numeric
# attribute lies within 3 standard deviations of its column mean.
z_scores = np.abs((transport[num_attribs] - transport[num_attribs].mean()) / transport[num_attribs].std())
# .copy() makes the result an independent frame, so later cells can add
# columns to it without triggering SettingWithCopyWarning.
transport = transport[(z_scores < 3).all(axis=1)].copy()
transport.info()
transport

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2378828 entries, 0 to 2596557
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   travel_fee      float64
 1   departure_time  object 
 2   departure_long  float64
 3   departure_lat   float64
 4   arrival_long    float64
 5   arrival_lat     float64
 6   occupancy       int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 145.2+ MB


Unnamed: 0,travel_fee,departure_time,departure_long,departure_lat,arrival_long,arrival_lat,occupancy
0,7.00,2013-07-02 19:54:00+00:00,-74.005360,40.728867,-74.008913,40.710907,1
1,5.50,2013-09-28 00:21:31+00:00,-74.014165,40.708941,-74.016310,40.716734,1
2,21.50,2013-06-16 03:18:00+00:00,-73.991075,40.760352,-73.941382,40.713292,1
3,9.50,2013-07-20 13:43:00+00:00,-74.002662,40.723630,-73.991722,40.748905,5
4,15.50,2013-11-05 22:57:17+00:00,-73.962397,40.712705,-73.996834,40.680403,2
...,...,...,...,...,...,...,...
2596552,11.00,2013-04-17 10:34:24+00:00,-74.002635,40.760825,-73.978540,40.751177,1
2596554,6.50,2013-12-13 19:04:00+00:00,-73.955435,40.765110,-73.960947,40.756727,1
2596555,41.33,2013-05-25 14:48:39+00:00,-73.862824,40.768935,-73.984470,40.739395,2
2596556,7.50,2013-12-22 20:45:19+00:00,-73.970539,40.761767,-73.982731,40.744562,1


In [32]:
# Preview the first five rows of the cleaned data.
transport.head(n=5)

Unnamed: 0,travel_fee,departure_time,departure_long,departure_lat,arrival_long,arrival_lat,occupancy
0,7.0,2013-07-02 19:54:00+00:00,-74.00536,40.728867,-74.008913,40.710907,1
1,5.5,2013-09-28 00:21:31+00:00,-74.014165,40.708941,-74.01631,40.716734,1
2,21.5,2013-06-16 03:18:00+00:00,-73.991075,40.760352,-73.941382,40.713292,1
3,9.5,2013-07-20 13:43:00+00:00,-74.002662,40.72363,-73.991722,40.748905,5
4,15.5,2013-11-05 22:57:17+00:00,-73.962397,40.712705,-73.996834,40.680403,2


In [33]:
# Parse the departure_time strings into pandas datetimes.
transport["departure_time"] = pd.to_datetime(transport["departure_time"])

# Derive calendar features from the parsed timestamps:
# the hour of the day, then the day of the week (Monday=0 ... Sunday=6).
transport["hour_of_day"] = transport["departure_time"].dt.hour
transport["day_of_week"] = transport["departure_time"].dt.weekday

Goal: determine whether a taxi ride is high demand or not, based on departure location, hour of day, and day of the week. **TODO:** the labelling approach is still being worked out.

In [14]:
# Define criteria for demand categorization based on hour_of_day (24-hour clock)
low_demand_criteria = range(0, 6)  # Early morning hours
medium_demand_criteria = range(6, 18)  # Daytime hours
high_demand_criteria = range(18, 24)  # Evening hours


def assign_demand_category(hour):
    """Map an hour of the day to its demand category.

    Parameters
    ----------
    hour : int
        Hour of the day, 0-23.

    Returns
    -------
    str
        "low" for hours 0-5, "medium" for 6-17, "high" for 18-23.

    Raises
    ------
    ValueError
        If `hour` is not in any of the three bands (e.g. 24, -1 or NaN).
        Such values were previously labelled "high" by a catch-all branch,
        which hid bad data.
    """
    if hour in low_demand_criteria:
        return "low"
    if hour in medium_demand_criteria:
        return "medium"
    if hour in high_demand_criteria:
        return "high"
    raise ValueError(f"hour must be a whole hour in 0-23, got {hour!r}")

# Label every ride by passing its hour_of_day through assign_demand_category.
transport["demand_category"] = transport["hour_of_day"].map(assign_demand_category)

In [15]:
# Preview the first five rows, now including the derived columns.
transport.head(n=5)

Unnamed: 0,travel_fee,departure_time,departure_long,departure_lat,arrival_long,arrival_lat,occupancy,hour_of_day,day_of_week,demand_category
0,7.0,2013-07-02 19:54:00+00:00,-74.00536,40.728867,-74.008913,40.710907,1,19,1,high
1,5.5,2013-09-28 00:21:31+00:00,-74.014165,40.708941,-74.01631,40.716734,1,0,5,low
2,21.5,2013-06-16 03:18:00+00:00,-73.991075,40.760352,-73.941382,40.713292,1,3,6,low
3,9.5,2013-07-20 13:43:00+00:00,-74.002662,40.72363,-73.991722,40.748905,5,13,5,medium
4,15.5,2013-11-05 22:57:17+00:00,-73.962397,40.712705,-73.996834,40.680403,2,22,1,high


In [16]:
from sklearn.model_selection import train_test_split

# Use only the first 500,000 rows of `transport`, holding out 20% of them as
# the test set. The fixed random_state makes the shuffle reproducible.
train_set, test_set = train_test_split(
    transport.head(n=500000),
    random_state=42,
    test_size=0.2,
)

# Report the size of each partition.
print("Length of train_set:", train_set.shape[0])
print("Length of test_set:", test_set.shape[0])

Length of train_set: 400000
Length of test_set: 100000


In [37]:
# Build the training feature frame by removing the label and every column that
# is not used as a model input. drop() returns a new frame and leaves train_set
# untouched, so all columns must be removed in one call; re-deriving `demand`
# from train_set once per column keeps only the effect of the last drop.
demand = train_set.drop(
    columns=[
        "demand_category",
        "travel_fee",
        "departure_time",
        "arrival_lat",
        "arrival_long",
        "occupancy",
    ]
)
# Training labels, copied so they are independent of train_set.
demand_labels = train_set["demand_category"].copy()

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Columns that are passed through the numeric pipeline.
num_attribs = ['departure_long', 'departure_lat', 'hour_of_day', 'day_of_week']

# Numeric pipeline: a single scaling step.
num_pipeline = make_pipeline(StandardScaler())

# Route the numeric columns through the numeric pipeline under the name "num".
preprocessing = ColumnTransformer(
    [("num", num_pipeline, num_attribs)]
)

In [39]:
# Fit the ColumnTransformer on the training feature frame and transform it in
# one step; the bare name on the last line displays the resulting array.
transport_prepared = preprocessing.fit_transform(demand)
transport_prepared

array([[-0.28420307, -1.21082781,  0.37792462,  1.51306249],
       [ 0.04477485,  0.21021648,  1.29241509,  1.0017459 ],
       [-0.45958577, -0.58837285,  1.29241509,  1.0017459 ],
       ...,
       [-0.07390843,  0.27601313,  1.14000001, -0.02088728],
       [-0.29254122,  0.09154919,  0.98758493, -1.55483706],
       [-0.30677022, -0.74234167, -0.2317357 ,  1.51306249]])

In [40]:
# Show the output column names of the fitted transformer.
preprocessing.get_feature_names_out()

array(['num__departure_long', 'num__departure_lat', 'num__hour_of_day',
       'num__day_of_week'], dtype=object)

In [41]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# NOTE(review): demand_category is computed from hour_of_day alone, and
# hour_of_day is one of the input features below. A perfect score from this
# model therefore reflects label leakage, not predictive skill.

# Select features and target variable for classification
features = ['departure_long', 'departure_lat', 'hour_of_day', 'day_of_week']
X_train_class = demand[features]
y_train_class = demand_labels

# Initialize SVC model
svc_model = SVC(kernel='linear', C=1.0)

# Train the classification model
svc_model.fit(X_train_class, y_train_class)

# Make predictions on the test set.
# Select the test columns with the same `features` list used for training, so
# names and order always match the fitted model; selecting by exclusion would
# silently pick up any column later added to the frame.
X_test_class = test_set[features]
y_test_class = test_set["demand_category"]
y_pred_class = svc_model.predict(X_test_class)

# Evaluate the classification model
accuracy = accuracy_score(y_test_class, y_pred_class)
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test_class, y_pred_class))

Accuracy: 1.0
              precision    recall  f1-score   support

        high       1.00      1.00      1.00     34671
         low       1.00      1.00      1.00     13207
      medium       1.00      1.00      1.00     52122

    accuracy                           1.00    100000
   macro avg       1.00      1.00      1.00    100000
weighted avg       1.00      1.00      1.00    100000

