<a href="https://colab.research.google.com/github/Soul2018/MLE-Mini-Project/blob/main/MLE_Mini_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Read the yellow_tripdata_2025-01 parquet file into a DataFrame.
df = pd.read_parquet('/content/yellow_tripdata_2025-01.parquet')

# Preview the first five raw rows.
print(df.head(5))

# Keep only complete rows. The explicit copy gives an independent frame, so
# the column assignment below cannot trigger SettingWithCopyWarning.
df = df.dropna().copy()

# Derive 'trip_duration' in minutes: (dropoff - pickup) in seconds, over 60.
df['trip_duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

print(df[['trip_duration']].head())

# feature_cols holds every column name, including the new feature.
feature_cols = df.columns.to_list()
print(feature_cols)

# Separate the regression target ('fare_amount') from the feature columns,
# then hold out 20% of the rows for testing with a fixed seed.
y = df['fare_amount']
X = df.drop(columns='fare_amount')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set shape", X_train.shape, y_train.shape)
print("Test set shape", X_test.shape, y_test.shape)

# Baseline reference value: the mean of the training target (fare_amount).
# A model that always predicts this constant gives the MAE to beat.

mean_total_fare = y_train.mean()
class PredictMean:
    """Constant baseline model that predicts one value for every row.

    Args:
        fill_value: The constant to predict. When omitted (``None``) the
            module-level ``mean_total_fare`` is looked up at prediction time,
            so existing ``PredictMean()`` call sites keep working.
    """

    def __init__(self, fill_value=None):
        self.fill_value = fill_value

    def predicator(self, X):
        """Return a 1-D array of ``len(X)`` copies of the constant.

        Args:
            X: Any object supporting ``len()``; only its length is used.

        Returns:
            numpy.ndarray of shape ``(len(X),)`` filled with the constant.
        """
        # Compare against None (not truthiness) so a legitimate 0.0 is honoured.
        value = mean_total_fare if self.fill_value is None else self.fill_value
        return np.full(shape=(len(X),), fill_value=value)
# Instantiate the constant-mean baseline.
mean_model = PredictMean()

# One prediction per test row.
y_predict = mean_model.predicator(X_test)
print("Predicted mean on test set", y_predict)

# Score the baseline against the true test targets.
mean_absolute_err = mean_absolute_error(y_true=y_test, y_pred=y_predict)
print("Mean absolute error: ", mean_absolute_err)

# Preprocess categorical and continuous features independently with a
# ColumnTransformer, then fit and score a linear regression on the result.

# Categorical input.
categorical_features = ["store_and_fwd_flag"]
# Continuous inputs. The raw 'tpep_dropoff_datetime' timestamp is not used
# here: scaling a datetime column relies on an implicit cast to a number,
# whereas the engineered 'trip_duration' (minutes) carries the timing
# information directly.
continuous_features = ["trip_distance", "trip_duration"]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps predict() from failing on categories
# that were not present during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Continuous branch: median imputation followed by standardisation.
continuous_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group through its own branch. Columns not listed are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('category', categorical_transformer, categorical_features),
        ("continuous", continuous_transformer, continuous_features),
    ]
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_predict = pipeline.predict(X_test)

print("Predicted value", y_predict[:5])

# Score the linear model so it can be compared with the other models' MAE.
mean_absolute_err_lr = mean_absolute_error(y_test, y_predict)
print("Linear Regression Mean Absolute Error:", mean_absolute_err_lr)

# Build random forest regressor model

# Random forest instance: 100 trees, fixed seed for reproducibility.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)


def _prepare_rf_features(frame):
    """Return a new frame with the feature engineering used for the forest.

    Adds 'pickup_hour', 'dropoff_hour' and 'trip_duration' (minutes) derived
    from the pickup/dropoff timestamps, drops the two raw datetime columns,
    and one-hot encodes 'store_and_fwd_flag' (first level dropped).

    ``assign`` returns a new DataFrame, so nothing is written into the frame
    handed back by ``train_test_split`` (which can raise
    SettingWithCopyWarning on direct column assignment).
    """
    pickup = frame['tpep_pickup_datetime']
    dropoff = frame['tpep_dropoff_datetime']
    engineered = frame.assign(
        pickup_hour=pickup.dt.hour,
        dropoff_hour=dropoff.dt.hour,
        trip_duration=(dropoff - pickup).dt.total_seconds() / 60,
    )
    engineered = engineered.drop(
        columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime']
    )
    return pd.get_dummies(
        engineered, columns=['store_and_fwd_flag'], drop_first=True
    )


# Apply identical feature engineering to both splits.
X_train = _prepare_rf_features(X_train)
X_test = _prepare_rf_features(X_test)

# get_dummies derives its columns from the values present in each frame, so
# the two splits can disagree. Force the test matrix onto the training
# layout: missing dummy columns become 0, extra ones are discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

rf_model.fit(X_train, y_train)

# predict on test data set
y_pred_rf = rf_model.predict(X_test)

# mean absolute error
mean_absolute_err_rf = mean_absolute_error(y_test, y_pred_rf)

print("Random Forest Regressor Mean Absolute Error:", mean_absolute_err_rf)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Read the yellow_tripdata_2025-01 parquet file into a DataFrame.
df = pd.read_parquet('/content/yellow_tripdata_2025-01.parquet')

# Preview the first five raw rows.
print(df.head(5))

# Keep only complete rows. The explicit copy gives an independent frame, so
# the column assignments below cannot trigger SettingWithCopyWarning.
df = df.dropna().copy()

# Time-derived features: hour of day for pickup and dropoff, plus
# 'trip_duration' in minutes ((dropoff - pickup) in seconds, over 60).
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['dropoff_hour'] = df['tpep_dropoff_datetime'].dt.hour
df['trip_duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

print(df[['trip_duration']].head())

# feature_cols holds every column name, including the new features.
feature_cols = df.columns.to_list()
print(feature_cols)

# Separate the regression target ('fare_amount') from the feature columns,
# then hold out 20% of the rows for testing with a fixed seed.
y = df['fare_amount']
X = df.drop(columns='fare_amount')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set shape", X_train.shape, y_train.shape)
print("Test set shape", X_test.shape, y_test.shape)

# Baseline reference value: the mean of the training target (fare_amount).
# A model that always predicts this constant gives the MAE to beat.

mean_total_fare = y_train.mean()
class PredictMean:
    """Constant baseline model that predicts one value for every row.

    Args:
        fill_value: The constant to predict. When omitted (``None``) the
            module-level ``mean_total_fare`` is looked up at prediction time,
            so existing ``PredictMean()`` call sites keep working.
    """

    def __init__(self, fill_value=None):
        self.fill_value = fill_value

    def predicator(self, X):
        """Return a 1-D array of ``len(X)`` copies of the constant.

        Args:
            X: Any object supporting ``len()``; only its length is used.

        Returns:
            numpy.ndarray of shape ``(len(X),)`` filled with the constant.
        """
        # Compare against None (not truthiness) so a legitimate 0.0 is honoured.
        value = mean_total_fare if self.fill_value is None else self.fill_value
        return np.full(shape=(len(X),), fill_value=value)
# Instantiate the constant-mean baseline.
mean_model = PredictMean()

# One prediction per test row.
y_predict = mean_model.predicator(X_test)
print("Predicted mean on test set", y_predict)

# Score the baseline against the true test targets.
mean_absolute_err = mean_absolute_error(y_true=y_test, y_pred=y_predict)
print("Mean absolute error: ", mean_absolute_err)

# Preprocess categorical and continuous features independently with a
# ColumnTransformer, then fit and score a linear regression on the result.

# Categorical input.
categorical_features = ["store_and_fwd_flag"]
# Continuous inputs. The raw 'tpep_dropoff_datetime' timestamp is not used
# here: scaling a datetime column relies on an implicit cast to a number,
# whereas the engineered 'trip_duration' (minutes) carries the timing
# information directly.
continuous_features = ["trip_distance", "trip_duration"]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps predict() from failing on categories
# that were not present during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Continuous branch: median imputation followed by standardisation.
continuous_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group through its own branch. Columns not listed are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('category', categorical_transformer, categorical_features),
        ("continuous", continuous_transformer, continuous_features),
    ]
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_predict = pipeline.predict(X_test)

print("Predicted value", y_predict[:5])

# Score the linear model so it can be compared with the other models' MAE.
mean_absolute_err_lr = mean_absolute_error(y_test, y_predict)
print("Linear Regression Mean Absolute Error:", mean_absolute_err_lr)

# Build random forest regressor model

# Small forest: 10 trees, each limited to depth 2, fixed seed.
rf_model = RandomForestRegressor(n_estimators=10, random_state=42, max_depth=2)

# Drop the raw datetime columns; the hour and duration features derived
# from them on df are kept.
X_train = X_train.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
X_test = X_test.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

# One-hot encode the 'store_and_fwd_flag' column
X_train = pd.get_dummies(X_train, columns=['store_and_fwd_flag'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['store_and_fwd_flag'], drop_first=True)

# get_dummies derives its columns from the values present in each frame, so
# the two splits can disagree. Force the test matrix onto the training
# layout: missing dummy columns become 0, extra ones are discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

rf_model.fit(X_train, y_train)

# predict on test data set
y_pred_rf = rf_model.predict(X_test)

# mean absolute error
mean_absolute_err_rf = mean_absolute_error(y_test, y_pred_rf)

print("Random Forest Regressor Mean Absolute Error:", mean_absolute_err_rf)

# Hyperparameter grid for the random forest. Every list currently holds a
# single candidate, so the search covers exactly one combination.
param_grid = {
    'n_estimators': [10],
    'max_depth': [2],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    # 'auto' is rejected by RandomForestRegressor in current scikit-learn
    # (deprecated in 1.1, removed in 1.3). For regressors it meant "use all
    # features", which is spelled 1.0.
    'max_features': [1.0],
    'bootstrap': [True],
}
# Configure (but do not yet run) a 3-fold grid search scored by negative MAE,
# using all available cores. The time-consuming part is the later .fit() call.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           scoring='neg_mean_absolute_error', cv=3, verbose=2, n_jobs=-1)

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2025-01-01 00:18:38   2025-01-01 00:26:59              1.0   
1         1  2025-01-01 00:32:40   2025-01-01 00:35:13              1.0   
2         1  2025-01-01 00:44:04   2025-01-01 00:46:01              1.0   
3         2  2025-01-01 00:14:27   2025-01-01 00:20:01              3.0   
4         2  2025-01-01 00:21:34   2025-01-01 00:25:06              3.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           1.60         1.0                  N           229           237   
1           0.50         1.0                  N           236           237   
2           0.60         1.0                  N           141           141   
3           0.52         1.0                  N           244           244   
4           0.66         1.0                  N           244           116   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \
