<a href="https://colab.research.google.com/github/ShrieVarshini2004/Taxi_Tip_Prediction/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Import necessary libraries
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error
import time
import warnings
import gc, os
warnings.filterwarnings('ignore')

# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Expected location of the trip-record file on Drive
file_path = '/content/drive/My Drive/yellow_tripdata_2019-06.parquet'

# Fall back to a manual upload when the file is not found on Drive
if not os.path.exists(file_path):
    print(f"⚠️ File not found at {file_path}. Uploading manually...")
    from google.colab import files
    uploaded = files.upload()  # Prompt user to upload file
    if not uploaded:
        # Nothing was uploaded: stop here with a clear message instead of
        # letting the read below fail on a path that does not exist.
        raise FileNotFoundError("No file was uploaded; cannot load the trip data.")
    # Use the name the upload was actually stored under rather than assuming it.
    # The expected name is preferred when present; otherwise take the first
    # uploaded file (the keys of `uploaded` are the uploaded file names).
    expected_name = os.path.basename(file_path)
    file_path = expected_name if expected_name in uploaded else next(iter(uploaded))

# Load the trip records into a DataFrame
raw_data = pd.read_parquet(file_path)

# Print dataset info
print(f"There are {len(raw_data)} observations in the dataset.")
print(f"There are {len(raw_data.columns)} variables in the dataset.")

# Data cleaning: keep a trip only when its tip is positive, the tip does not
# exceed the fare, and the fare lies in the range [2, 200).
tip = raw_data['tip_amount']
fare = raw_data['fare_amount']
keep_row = (tip > 0) & (tip <= fare) & (fare >= 2) & (fare < 200)

# total_amount is removed from the retained rows
clean_data = raw_data[keep_row].drop(['total_amount'], axis=1)

# The unfiltered frame and the helper Series are no longer needed
del raw_data, tip, fare, keep_row
gc.collect()  # Free memory

# Reduce dataset size (optional).
# The truncation is done BEFORE the feature engineering: every feature below
# is computed row by row, so the first 200000 rows end up with exactly the
# same values, but no work is spent on rows that would be thrown away.
clean_data = clean_data.head(200000)

# Convert timestamps to datetime objects
pickup_time = pd.to_datetime(clean_data['tpep_pickup_datetime'])
dropoff_time = pd.to_datetime(clean_data['tpep_dropoff_datetime'])

# Drop datetime columns. drop() returns a new DataFrame, so the feature
# columns below are added to that frame rather than to the head() slice.
clean_data = clean_data.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)

# Extract time-based features (added in the same order as before, so the
# resulting column layout is unchanged)
clean_data['pickup_hour'] = pickup_time.dt.hour
clean_data['dropoff_hour'] = dropoff_time.dt.hour
clean_data['pickup_day'] = pickup_time.dt.weekday
clean_data['dropoff_day'] = dropoff_time.dt.weekday
# Trip duration in seconds (dropoff minus pickup)
clean_data['trip_time'] = (dropoff_time - pickup_time).dt.total_seconds()

# One-hot encode categorical features
categorical_columns = ["VendorID", "RatecodeID", "store_and_fwd_flag",
                        "PULocationID", "DOLocationID", "payment_type",
                        "pickup_hour", "dropoff_hour", "pickup_day", "dropoff_day"]
# dtype='float64' asks for numeric 0.0/1.0 indicator columns. In pandas
# versions where get_dummies defaults to boolean indicators, mixing them with
# the numeric columns would make `.values` an object array further down.
proc_data = pd.get_dummies(clean_data, columns=categorical_columns, dtype='float64')

# Free memory
del clean_data
gc.collect()

# Fill NaN values with 0 (rebinding the name instead of mutating in place)
proc_data = proc_data.fillna(0)

# Extract target variable; the double brackets keep it as a single-column 2-D array
y = proc_data[['tip_amount']].values.astype('float32')
proc_data = proc_data.drop(['tip_amount'], axis=1)

# Convert feature matrix to NumPy array and normalize.
# astype() hands normalize a fresh float64 array, so the in-place (copy=False)
# row-wise L1 normalization cannot touch the data held by proc_data.
X = normalize(proc_data.values.astype('float64'), axis=1, norm='l1', copy=False)

# Split dataset: 70% train / 30% test, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Both libraries export a class named DecisionTreeRegressor. Each one is
# imported under its own alias so the second import does not shadow the first.
from sklearn.tree import DecisionTreeRegressor as SklearnDecisionTreeRegressor
from snapml import DecisionTreeRegressor as SnapDecisionTreeRegressor

# Train Decision Tree using Scikit-Learn
sklearn_dt = SklearnDecisionTreeRegressor(max_depth=8, random_state=35)
# perf_counter() is the clock intended for measuring short elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments.
t0 = time.perf_counter()
sklearn_dt.fit(X_train, y_train)
sklearn_time = time.perf_counter() - t0
print("[Scikit-Learn] Training time (s):", round(sklearn_time, 5))

# Train Decision Tree using Snap ML
snapml_dt = SnapDecisionTreeRegressor(max_depth=8, random_state=45, n_jobs=4)
t0 = time.perf_counter()
snapml_dt.fit(X_train, y_train)
snapml_time = time.perf_counter() - t0
print("[Snap ML] Training time (s):", round(snapml_time, 5))

# Training speedup comparison (ratio of the two training times)
print('[Decision Tree Regressor] Snap ML vs. Scikit-Learn speedup:', round(sklearn_time / snapml_time, 2), "x")

# Evaluate models: mean squared error of each model on the held-out test set
sklearn_pred = sklearn_dt.predict(X_test)
snapml_pred = snapml_dt.predict(X_test)
print('[Scikit-Learn] MSE:', round(mean_squared_error(y_test, sklearn_pred), 3))
print('[Snap ML] MSE:', round(mean_squared_error(y_test, snapml_pred), 3))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
There are 6971560 observations in the dataset.
There are 19 variables in the dataset.
[Scikit-Learn] Training time (s): 9.32961
[Snap ML] Training time (s): 4.07858
[Decision Tree Regressor] Snap ML vs. Scikit-Learn speedup: 2.29 x
[Scikit-Learn] MSE: 1.635
[Snap ML] MSE: 1.611


In [5]:
# Mount Google Drive, then print the contents of the Drive root folder and of
# its "Colab Notebooks" subfolder.
import os
from google.colab import drive

drive.mount('/content/drive')
for drive_folder in ('/content/drive/My Drive/',
                     '/content/drive/My Drive/Colab Notebooks/'):
    print(os.listdir(drive_folder))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['Shrie_Varshini__-_ (2).pdf', 'Shrie_Varshini__-_ (1).pdf', 'Shrie_Varshini__-_.pdf', 'shrievarshini_cce.pdf', 'WhatsApp Image 2024-07-09 at 17.38.31_3f6d6046.jpg', 'Colab Notebooks', 'Future Tech AI Hackathon - Presentation Template.pptx', 'Advitiya (1).pdf', 'Advitiya.pdf', 'kidney_ct_scan_model', 'archive (1)', 'symbipredict_2022.csv', 'yellow_tripdata_2019-06.parquet', 'cell_samples.csv']
['project initial.ipynb', 'diffusion.ipynb', 'KNN', 'Linear Regression.ipynb', 'Untitled0.ipynb', 'GAN', 'kidney_disease_det.ipynb', 'Untitled1.ipynb', 'creditcard fraud.ipynb', 'Logistic regression.ipynb', 'SVM.ipynb', 'multiclass_prediction.ipynb', 'GridsearchCV.ipynb', 'k-means.ipynb', 'Untitled2.ipynb']


In [7]:
!pip install snapml


Collecting snapml
  Downloading snapml-1.16.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Downloading snapml-1.16.2-cp311-cp311-manylinux_2_28_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: snapml
Successfully installed snapml-1.16.2


In [9]:
# Install Snap ML (only needed in Google Colab)
!pip install snapml

# Import necessary libraries
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error
import time
import warnings
import gc, os
warnings.filterwarnings('ignore')

# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Expected location of the trip-record file on Drive
file_path = '/content/drive/My Drive/yellow_tripdata_2019-06.parquet'

# Fall back to a manual upload when the file is not found on Drive
if not os.path.exists(file_path):
    print(f"⚠️ File not found at {file_path}. Uploading manually...")
    from google.colab import files
    uploaded = files.upload()  # Prompt user to upload file
    if not uploaded:
        # Nothing was uploaded: stop here with a clear message instead of
        # letting the read below fail on a path that does not exist.
        raise FileNotFoundError("No file was uploaded; cannot load the trip data.")
    # Use the name the upload was actually stored under rather than assuming it.
    # The expected name is preferred when present; otherwise take the first
    # uploaded file (the keys of `uploaded` are the uploaded file names).
    expected_name = os.path.basename(file_path)
    file_path = expected_name if expected_name in uploaded else next(iter(uploaded))

# Load the trip records into a DataFrame
raw_data = pd.read_parquet(file_path)

# Print dataset info
print(f"There are {len(raw_data)} observations in the dataset.")
print(f"There are {len(raw_data.columns)} variables in the dataset.")

# Data cleaning: keep a trip only when its tip is positive, the tip does not
# exceed the fare, and the fare lies in the range [2, 200).
tip = raw_data['tip_amount']
fare = raw_data['fare_amount']
keep_row = (tip > 0) & (tip <= fare) & (fare >= 2) & (fare < 200)

# total_amount is removed from the retained rows
clean_data = raw_data[keep_row].drop(['total_amount'], axis=1)

# The unfiltered frame and the helper Series are no longer needed
del raw_data, tip, fare, keep_row
gc.collect()  # Free memory

# Reduce dataset size (optional).
# The truncation is done BEFORE the feature engineering: every feature below
# is computed row by row, so the first 200000 rows end up with exactly the
# same values, but no work is spent on rows that would be thrown away.
clean_data = clean_data.head(200000)

# Convert timestamps to datetime objects
pickup_time = pd.to_datetime(clean_data['tpep_pickup_datetime'])
dropoff_time = pd.to_datetime(clean_data['tpep_dropoff_datetime'])

# Drop datetime columns. drop() returns a new DataFrame, so the feature
# columns below are added to that frame rather than to the head() slice.
clean_data = clean_data.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)

# Extract time-based features (added in the same order as before, so the
# resulting column layout is unchanged)
clean_data['pickup_hour'] = pickup_time.dt.hour
clean_data['dropoff_hour'] = dropoff_time.dt.hour
clean_data['pickup_day'] = pickup_time.dt.weekday
clean_data['dropoff_day'] = dropoff_time.dt.weekday
# Trip duration in seconds (dropoff minus pickup)
clean_data['trip_time'] = (dropoff_time - pickup_time).dt.total_seconds()

# One-hot encode categorical features
categorical_columns = ["VendorID", "RatecodeID", "store_and_fwd_flag",
                        "PULocationID", "DOLocationID", "payment_type",
                        "pickup_hour", "dropoff_hour", "pickup_day", "dropoff_day"]
# dtype='float64' asks for numeric 0.0/1.0 indicator columns. In pandas
# versions where get_dummies defaults to boolean indicators, mixing them with
# the numeric columns would make `.values` an object array further down.
proc_data = pd.get_dummies(clean_data, columns=categorical_columns, dtype='float64')

# Free memory
del clean_data
gc.collect()

# Fill NaN values with 0 (rebinding the name instead of mutating in place)
proc_data = proc_data.fillna(0)

# Extract target variable; the double brackets keep it as a single-column 2-D array
y = proc_data[['tip_amount']].values.astype('float32')
proc_data = proc_data.drop(['tip_amount'], axis=1)

# Convert feature matrix to NumPy array and normalize.
# astype() hands normalize a fresh float64 array, so the in-place (copy=False)
# row-wise L1 normalization cannot touch the data held by proc_data.
X = normalize(proc_data.values.astype('float64'), axis=1, norm='l1', copy=False)

# Split dataset: 70% train / 30% test, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Both libraries export a class named DecisionTreeRegressor. Each one is
# imported under its own alias so the second import does not shadow the first.
from sklearn.tree import DecisionTreeRegressor as SklearnDecisionTreeRegressor
from snapml import DecisionTreeRegressor as SnapDecisionTreeRegressor

# Train Decision Tree using Scikit-Learn
sklearn_dt = SklearnDecisionTreeRegressor(max_depth=8, random_state=35)
# perf_counter() is the clock intended for measuring short elapsed intervals;
# unlike time.time() it is not affected by system clock adjustments.
t0 = time.perf_counter()
sklearn_dt.fit(X_train, y_train)
sklearn_time = time.perf_counter() - t0
print("[Scikit-Learn] Training time (s):", round(sklearn_time, 5))

# Train Decision Tree using Snap ML
snapml_dt = SnapDecisionTreeRegressor(max_depth=8, random_state=45, n_jobs=4)
t0 = time.perf_counter()
snapml_dt.fit(X_train, y_train)
snapml_time = time.perf_counter() - t0
print("[Snap ML] Training time (s):", round(snapml_time, 5))

# Training speedup comparison (ratio of the two training times)
print('[Decision Tree Regressor] Snap ML vs. Scikit-Learn speedup:', round(sklearn_time / snapml_time, 2), "x")

# Evaluate models: mean squared error of each model on the held-out test set
sklearn_pred = sklearn_dt.predict(X_test)
snapml_pred = snapml_dt.predict(X_test)
print('[Scikit-Learn] MSE:', round(mean_squared_error(y_test, sklearn_pred), 3))
print('[Snap ML] MSE:', round(mean_squared_error(y_test, snapml_pred), 3))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
There are 6971560 observations in the dataset.
There are 19 variables in the dataset.
[Scikit-Learn] Training time (s): 9.05572
[Snap ML] Training time (s): 5.19263
[Decision Tree Regressor] Snap ML vs. Scikit-Learn speedup: 1.74 x
[Scikit-Learn] MSE: 1.635
[Snap ML] MSE: 1.611
