In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from joblib import dump, load
import numpy as np

# Step 1: Load and clean the dataset
def load_and_clean_data(filename):
    df = pd.read_csv(filename)
    
    # Create new features
    df['travel_date'] = pd.to_datetime(df['travel_date'], format='%d-%m-%y')
    df['Year'] = df['travel_date'].dt.year
    df['Month'] = df['travel_date'].dt.month
    df['Day'] = df['travel_date'].dt.day
    
    # Drop unnecessary columns
    df = df.drop(columns=['seat_number', 'payment_receipt', 'travel_date', 'travel_time', 'travel_to', 'payment_method'])
    
    # Add a new column: number_of_ticket
    df['number_of_ticket'] = df.groupby(['travel_from', 'car_type', 'max_capacity', 'Year', 'Month', 'Day', 'Hour', 'Minutes'])['travel_from'].transform('count')

    return df

# Step 2: Encode categorical features
def encode_categorical_features(df):
    label_encoder = LabelEncoder()
    df['travel_from'] = label_encoder.fit_transform(df['travel_from'])
    df['car_type'] = label_encoder.fit_transform(df['car_type'])
    return df

# Step 3: Feature Scaling
def scale_features(X):
    mean = np.mean(X, axis=0)
    std_dev = np.std(X, axis=0)
    return (X - mean) / std_dev

# Step 4: Split the data into training and testing sets
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train the model
def train_model(X_train, y_train):
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    return model

# Step 6: Evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    return y_pred

# Step 7: Save the trained model
def save_model(model, filename):
    dump(model, filename)
    
# Step 8: Load the saved model
def load_saved_model(filename):
    return load(filename)


In [7]:
# Load and clean the data
df = load_and_clean_data('train_revised.csv')

# Encode categorical features
df = encode_categorical_features(df)

# Select features and target variable
X = df[['travel_from', 'car_type', 'max_capacity', 'Year', 'Month', 'Day', 'Hour', 'Minutes']]
y = df['number_of_ticket']

# Scale the features
X_scaled = scale_features(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = split_data(X_scaled, y)

# Train the model
model = train_model(X_train, y_train)

# Evaluate the model
evaluate_model(model, X_test, y_test)

# Save the model
save_model(model, 'random_forest_model.joblib')


  df['Year'] = pd.to_datetime(df['travel_date'], errors='coerce').dt.year
  df['Month'] = pd.to_datetime(df['travel_date'], errors='coerce').dt.month
  df['Day'] = pd.to_datetime(df['travel_date'], errors='coerce').dt.day


Mean Squared Error: 1.0415183464033304


In [8]:
# Load and clean the test data
test_df = load_and_clean_data('test_questions.csv')

# Encode categorical features in the test data
test_df = encode_categorical_features(test_df)

# Select features from the test data
X_test_data = test_df[['travel_from', 'car_type', 'max_capacity', 'Year', 'Month', 'Day', 'Hour', 'Minutes']]

# Scale the features for the test data
X_test_data_scaled = scale_features(X_test_data)

# Load the saved model
model_loaded = load_saved_model('random_forest_model.joblib')

# Make predictions using the loaded model
test_predictions = model_loaded.predict(X_test_data_scaled)

# Print predictions
print("Test Predictions:", test_predictions)


KeyError: "['seat_number', 'payment_receipt', 'payment_method'] not found in axis"