In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import datetime
import joblib
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# --- Load and preprocess the training data -------------------------------
# NOTE: the path is absolute and machine-specific.
data = pd.read_csv('C:/Users/User/Desktop/Week-4/notebooks/cleaned_data.csv')

# Treat the literal string 'None' as a missing value so that the fills below
# handle it together with real NaNs.
data.replace('None', np.nan, inplace=True)

# Value used to fill missing entries, per column. The median is computed
# here, i.e. after 'None' has been turned into NaN and before any fill.
fill_values = {
    'CompetitionOpenSinceMonth': 0,
    'CompetitionOpenSinceYear': 0,
    'Promo2SinceWeek': 0,
    'Promo2SinceYear': 0,
    'CompetitionDistance': data['CompetitionDistance'].median(),
    'PromoInterval': 'None',
}

# Assign the filled column back instead of calling
# `data[col].fillna(..., inplace=True)`: the in-place call on a column
# selection is chained assignment, which emits a FutureWarning on pandas 2.x
# and leaves `data` unchanged once Copy-on-Write is enabled.
for col, fill_value in fill_values.items():
    data[col] = data[col].fillna(fill_value)

# Split the date into numeric calendar parts, then drop the raw column
# so that only the derived parts remain as features.
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')

data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

data = data.drop(columns=['Date'])

# Integer-encode the categorical columns. A fresh encoder is fitted for each
# column; the fitted encoders are not kept after the loop.
label_cols = ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']

for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Sanity check: report the number of remaining missing values per column.
print(data.isnull().sum())

# --- Train/test split and hyper-parameter search --------------------------
# Every column except 'Sales' is a feature; 'Sales' is the target.
X = data.drop(columns=['Sales'])
y = data['Sales']

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)

# Exhaustive grid: 3 * 4 * 3 * 3 = 108 candidates, each fitted on 5 folds.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# refit=True (the default, spelled out here) makes GridSearchCV fit the best
# candidate once more on the whole of X_train after the search finishes.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, refit=True)
grid_search.fit(X_train, y_train)

# best_estimator_ is therefore already fitted on (X_train, y_train); calling
# .fit() on it again would only repeat the same fit with the same seed.
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# --- Persist the tuned model and evaluate it ------------------------------
# The file name carries the local wall-clock time of the dump, so repeated
# runs do not overwrite each other's artifacts.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
model_filename = f'random_forest_model_{timestamp}.pkl'
joblib.dump(value=best_model, filename=model_filename)

# Score the model on the held-out split.
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f'Model Performance:\nMSE: {mse}\nR^2: {r2}')

# 5-fold cross-validation over the full feature matrix and target (this
# includes the rows that were held out above).
cv_scores = cross_val_score(estimator=best_model, X=X, y=y, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation score: {cv_scores.mean()}')

# --- Feature-importance chart ---------------------------------------------
# Pair every feature column with the importance reported by the fitted
# forest, ordered from most to least important.
importances = best_model.feature_importances_
feature_names = X.columns

feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart: one bar per feature, bar length = importance.
plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance_df['Feature'],
    feature_importance_df['Importance'],
)
plt.xlabel('Importance')
plt.title('Feature Importances')
plt.show()


In [None]:
from flask import Flask, request, jsonify


import glob

# --- Load the trained model and create the Flask app ----------------------
# Trained models are saved as 'random_forest_model_<timestamp>.pkl' with a
# zero-padded '%Y-%m-%d-%H-%M-%S' timestamp, so the lexicographically largest
# matching name is the most recent dump. Resolve it at start-up instead of
# relying on a placeholder name that has to be edited by hand.
saved_models = glob.glob('random_forest_model_*.pkl')
if not saved_models:
    raise FileNotFoundError(
        "No 'random_forest_model_*.pkl' file found in the working directory; "
        "train and save a model before starting the server."
    )
model_filename = max(saved_models)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = joblib.load(model_filename)

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Return model predictions for the records posted as JSON.

    The body may be anything ``pd.DataFrame`` accepts (for example a list of
    record objects or an object of column -> list of values) or a single
    record object whose values are all scalars.

    Returns:
        A JSON array with one prediction per input row, or a JSON object
        ``{"error": ...}`` with status 400 when the body is not usable.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # raising, so the client gets an explicit 400 with a message.
    payload = request.get_json(silent=True)
    if payload is None:
        return jsonify({'error': 'Request body must be valid JSON.'}), 400

    # pd.DataFrame rejects a dict of plain scalars ("must pass an index");
    # treat such a payload as a single record, i.e. a one-row table.
    if isinstance(payload, dict) and all(
        pd.api.types.is_scalar(value) for value in payload.values()
    ):
        payload = [payload]

    try:
        input_data = pd.DataFrame(payload)
    except (ValueError, TypeError) as exc:
        return jsonify({'error': f'Cannot build a table from the payload: {exc}'}), 400

    label_cols = ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']

    # Feature names recorded by scikit-learn when the model was fitted on a
    # DataFrame; absent on estimators fitted on plain arrays.
    expected = getattr(model, 'feature_names_in_', None)
    required = list(label_cols)
    if expected is not None:
        required += [name for name in expected if name not in required]

    missing = [name for name in required if name not in input_data.columns]
    if missing:
        return jsonify({'error': f'Missing required fields: {missing}'}), 400

    # NOTE(review): a LabelEncoder fitted on the request assigns codes from
    # the categories present in this payload only, so the codes need not
    # match those the model was trained with. The encoders fitted at training
    # time should be persisted and reused here.
    for col in label_cols:
        le = LabelEncoder()
        input_data[col] = le.fit_transform(input_data[col])

    # JSON objects carry no guaranteed key order; put the columns in the order
    # the model was fitted with and drop fields it does not know.
    if expected is not None:
        input_data = input_data[list(expected)]

    predictions = model.predict(input_data)

    return jsonify(predictions.tolist())

if __name__ == '__main__':
    import os

    # Debug mode enables the interactive debugger, which lets anyone who can
    # reach the server run arbitrary Python. Keep it off unless it is
    # explicitly requested through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(debug=debug_enabled)
