In [116]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from joblib import dump, load

In [117]:
def preprocess_data(df):

    df['travel_date'] = pd.to_datetime(df['travel_date'], format='%d-%m-%y')
    df['Year'] = df['travel_date'].dt.year
    df['Month'] = df['travel_date'].dt.month
    df['Day'] = df['travel_date'].dt.day
    df['Hour'] = pd.to_datetime(df['travel_time'], format='%H:%M').dt.hour
    df['Minutes'] = pd.to_datetime(df['travel_time'], format='%H:%M').dt.minute
    

    df['number_of_ticket'] = df.groupby(
        ['travel_from', 'car_type', 'max_capacity', 'Year', 'Month', 'Day', 'Hour', 'Minutes']
    )['travel_from'].transform('count')
    

    label_encoder = preprocessing.LabelEncoder()
    df['travel_from'] = label_encoder.fit_transform(df['travel_from'])
    df['car_type'] = label_encoder.fit_transform(df['car_type'])
    
    return df

In [None]:

def standardize_data(X):

    mean = np.mean(X, axis=0)
    std_dev = np.std(X, axis=0)
    X = (X - mean) / std_dev
    return X

In [119]:
def train_model(X, Y):

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
    
    model = RandomForestRegressor()
    model.fit(x_train, y_train)
    
    importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    

    selector = SelectFromModel(model, threshold=0.03, prefit=True) 
    X_selected = selector.transform(X)
    

    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    model_score = model.score(x_train, y_train)
    
    return model, importance_df, X_selected, mse, model_score


In [120]:
# Main script
if __name__ == "__main__":
    df = pd.read_csv('train_revised.csv')
    
    df = preprocess_data(df)
    
    X = df[['travel_from', 'car_type', 'max_capacity', 'Year', 'Month', 'Day', 'Hour', 'Minutes']]
    Y = df['number_of_ticket']

    X = standardize_data(X)

    model, importance_df, X_selected, mse, model_score = train_model(X, Y)


    dump(model, 'random_forest_model.joblib')


    # print("Feature Importance:")
    # print(importance_df)
    print(f"Mean Squared Error: {mse}")
    print(f"Model Accuracy: {model_score}")



Mean Squared Error: 0.8483973375931843
Model Accuracy: 0.9990749537844216
