In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from joblib import dump


In [86]:
# Function to preprocess the data
def preprocess_data(df):
    # Parse dates and extract date/time features
    df['travel_date'] = pd.to_datetime(df['travel_date'], format='%d-%m-%y')
    df['Year'] = df['travel_date'].dt.year
    df['Month'] = df['travel_date'].dt.month
    df['Day'] = df['travel_date'].dt.day
    df['Hour'] = pd.to_datetime(df['travel_time'], format='%H:%M').dt.hour
    df['Minutes'] = pd.to_datetime(df['travel_time'], format='%H:%M').dt.minute
    
    # Encode categorical features
    label_encoder = LabelEncoder()
    df['travel_from'] = label_encoder.fit_transform(df['travel_from'])
    df['car_type'] = label_encoder.fit_transform(df['car_type'])
    
    return df

In [87]:
# Function to standardize the features
def standardize_features(X):
    mean = np.mean(X, axis=0)
    std_dev = np.std(X, axis=0)
    X_standardized = (X - mean) / std_dev
    return X_standardized

In [88]:
# Function to train the model
def train_model(X, Y, test_size=0.20, random_state=42):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    model = RandomForestRegressor(random_state=random_state)
    model.fit(x_train, y_train)
    return model, x_train, x_test, y_train, y_test

In [89]:
# Function to evaluate the model
def evaluate_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse, y_pred

In [90]:
# Function to get feature importance and select features
def select_features(model, X, threshold=0.03):
    importances = model.feature_importances_
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)
    print(importance_df)
    selector = SelectFromModel(model, threshold=threshold, prefit=True)
    X_selected = selector.transform(X)
    selected_features = X.columns[selector.get_support()]


In [91]:
# Main workflow
def main(df):
    # Preprocess the data
    df = preprocess_data(df)

    df['number_of_ticket'] = df.groupby(['travel_from', 'car_type', 'max_capacity', 'Year', 'Month', 'Day', 'Hour', 'Minutes'])['travel_from'].transform('count')
    
    # Select features and target
    X = df[['travel_from', 'car_type', 'max_capacity', 'Year', 'Month', 'Day', 'Hour', 'Minutes']]
    Y = df['number_of_ticket']
    
    # Standardize the features
    X = standardize_features(X)
    
    # Train the model
    model, x_train, x_test, y_train, y_test = train_model(X, Y)
    
    # Evaluate the model
    mse, y_pred = evaluate_model(model, x_test, y_test)
    print(f"Mean Squared Error: {mse}")
    
    # Feature importance and selection
    importance_df, selected_features, X_selected = select_features(model, X)
    
    # Save the model
    dump(model, 'random_forest_model.joblib')
    
    return mse

In [92]:
if __name__ == "__main__":
    # Assuming you have a CSV or DataFrame to load
    # df = pd.read_csv('your_data.csv')  # Un-comment this if reading from a file
    df = pd.read_csv('train_revised.csv')  # Example DataFrame, replace with your actual data

    mse = main(df)

    print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.8359956142898635
        Feature  Importance
0   travel_from    0.263379
5           Day    0.190441
7       Minutes    0.181480
2  max_capacity    0.145026
1      car_type    0.128270
4         Month    0.068994
6          Hour    0.018462
3          Year    0.003947




TypeError: cannot unpack non-iterable NoneType object