In [6]:
import os

import pandas as pd

# Location of the training CSV. Set the TRAIN_CSV environment variable to point
# at your own copy; when it is unset the notebook falls back to the path it was
# originally written with.
file = os.environ.get('TRAIN_CSV', r'C:\Users\girip\Downloads\TRAIN.csv')
dataset = pd.read_csv(file)

# Preview the first rows to confirm the file was parsed as expected.
dataset.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


## Feature Engineering

In [7]:
# Parse Date and order the rows chronologically within each store, so the
# row-based rolling window below walks through each store's history in order.
dataset['Date'] = pd.to_datetime(dataset['Date'])
dataset = dataset.sort_values(['Store_id', 'Date']).reset_index(drop=True)

# Calendar features derived from the parsed date.
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['DayOfWeek'] = dataset['Date'].dt.dayofweek  # Monday=0 ... Sunday=6

# Per-store mean of the PREVIOUS (up to 7) rows of Sales. shift(1) moves the
# window back by one row so a row's own Sales -- the value the model is trained
# to predict -- never contributes to its own feature (target leakage).
prior_sales_mean = dataset.groupby('Store_id')['Sales'] \
    .transform(lambda store_sales: store_sales.shift(1).rolling(window=7, min_periods=1).mean())

# The first row of every store has no earlier rows and is NaN after the shift;
# fill it with the overall mean of the lagged feature so the column stays
# NaN-free for the scaler and the regression fitted in later cells.
dataset['Sales_Rolling_Mean_7D'] = prior_sales_mean.fillna(prior_sales_mean.mean())

## Data Transformation

In [8]:
from sklearn.preprocessing import StandardScaler

# Feature columns to standardize. The target column 'Sales' is intentionally
# left out: standardizing it would make the model learn and emit z-scores
# instead of sales amounts, and nothing downstream converts them back.
numerical_features = ['#Order', 'Sales_Rolling_Mean_7D']

# Fit the scaler on the feature columns only; the same fitted instance is
# persisted by a later cell for reuse at prediction time.
scaler = StandardScaler()

# Replace the raw feature values with their standardized counterparts.
dataset[numerical_features] = scaler.fit_transform(dataset[numerical_features])

print("Numerical features scaled successfully.")

Numerical features scaled successfully.


In [9]:
import pandas as pd

# Nominal columns that are expanded into indicator (dummy) columns.
categorical_features = [
    'Store_Type',
    'Location_Type',
    'Region_Code',
    'Discount',
]

# drop_first=True is passed so that one level of every feature is left out of
# the generated indicator columns.
dataset = pd.get_dummies(
    data=dataset,
    columns=categorical_features,
    drop_first=True,
)

print("Categorical features one-hot encoded successfully.")

Categorical features one-hot encoded successfully.


In [10]:
# Target vector: the Sales column.
y_train = dataset['Sales']

# Feature matrix: every remaining column except the target, the row identifier
# and the raw date (its year / month / weekday parts are separate columns).
# NOTE(review): '#Order' stays in the feature set, yet the example request
# printed by the Flask cell does not supply it -- confirm it is known at
# prediction time.
X_train = dataset.drop(columns=['Sales', 'ID', 'Date'])

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares regressor on all of X_train / y_train.
# fit() returns the estimator itself, so construction and fitting are chained.
final_model_lr = LinearRegression().fit(X_train, y_train)

print("Linear Regression model re-trained on the entire X_train and y_train datasets.")

Linear Regression model re-trained on the entire X_train and y_train datasets.


In [13]:
import joblib

# Persist the fitted regression model so the serving code can load it later.
joblib.dump(
    value=final_model_lr,
    filename='linear_regression_model.joblib',
)

print("Linear Regression model saved to 'linear_regression_model.joblib'.")

Linear Regression model saved to 'linear_regression_model.joblib'.


In [15]:
import joblib

# Persist the fitted StandardScaler alongside the model.
joblib.dump(
    value=scaler,
    filename='scaler.joblib',
)
print("StandardScaler saved to 'scaler.joblib'.")

StandardScaler saved to 'scaler.joblib'.


In [16]:
import joblib

# Columns of X_train that are excluded from the saved list.
non_encoded_cols = ['Store_id', 'Year', 'Month', 'DayOfWeek', 'Holiday']

# Keep every other X_train column, preserving X_train's column order. Besides
# the one-hot indicator columns this also retains the numeric '#Order' and
# 'Sales_Rolling_Mean_7D' columns, since they are not in the exclusion list.
encoded_columns = []
for column_name in X_train.columns:
    if column_name in non_encoded_cols:
        continue
    encoded_columns.append(column_name)

# Persist the list so the serving code can rebuild the same columns.
joblib.dump(value=encoded_columns, filename='encoded_columns.joblib')

print("Identified and saved list of one-hot encoded columns to 'encoded_columns.joblib'.")
print(f"Number of encoded columns: {len(encoded_columns)}")
print("First 5 encoded columns:", encoded_columns[:5])

Identified and saved list of one-hot encoded columns to 'encoded_columns.joblib'.
Number of encoded columns: 13
First 5 encoded columns: ['#Order', 'Sales_Rolling_Mean_7D', 'Store_Type_S2', 'Store_Type_S3', 'Store_Type_S4']


In [None]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd
import numpy as np

# 1. Restore the artifacts written by the training cells, in this order: the
#    fitted regression model, the fitted StandardScaler, and the saved list of
#    feature column names.
final_model_lr, scaler, encoded_columns = (
    joblib.load(artifact_path)
    for artifact_path in (
        'linear_regression_model.joblib',
        'scaler.joblib',
        'encoded_columns.joblib',
    )
)

# Numeric columns taken or derived from the request (store id and date parts).
# NOTE(review): the scaler saved by the training cells was not fitted on any of
# these columns -- confirm whether they are really meant to be scaled.
numerical_features_for_scaling = [
    'Store_id',
    'Year',
    'Month',
    'DayOfWeek',
]

# Categorical request fields that were one-hot encoded during training.
categorical_features_for_encoding = [
    'Store_Type',
    'Location_Type',
    'Region_Code',
    'Discount',
]

# 2. Define the preprocessing function
def preprocess_input(data):
    """Turn one prediction request into a single-row frame laid out like the training data.

    Parameters
    ----------
    data : dict
        Request payload. Must contain 'Date' and every field listed in
        ``categorical_features_for_encoding``. Other fields (for example
        'Store_id', 'Holiday', '#Order') are used when their name matches a
        model column; anything else is ignored.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are the model's training columns, in the same
        order. Model columns that the request does not provide are set to 0.

    Raises
    ------
    KeyError
        If 'Date' or one of the categorical fields is missing from ``data``.
    """
    # Prefer the column layout recorded by the fitted model: it is exactly what
    # predict() will validate against, order included. Fall back to the layout
    # assembled from the saved artifacts when the model did not record names.
    fitted_columns = getattr(final_model_lr, 'feature_names_in_', None)
    if fitted_columns is not None:
        expected_columns = list(fitted_columns)
    else:
        expected_columns = numerical_features_for_scaling + ['Holiday'] + encoded_columns

    missing_fields = [
        field for field in ['Date'] + categorical_features_for_encoding
        if field not in data
    ]
    if missing_fields:
        raise KeyError(f"Missing required field(s): {', '.join(missing_fields)}")

    # Convert the input dictionary to a one-row DataFrame
    input_df = pd.DataFrame([data])

    # Derive the calendar features, then drop the raw date
    input_df['Date'] = pd.to_datetime(input_df['Date'])
    input_df['Year'] = input_df['Date'].dt.year
    input_df['Month'] = input_df['Date'].dt.month
    input_df['DayOfWeek'] = input_df['Date'].dt.dayofweek
    input_df = input_df.drop('Date', axis=1)

    # One-hot encode by hand. pd.get_dummies(..., drop_first=True) cannot be used
    # on a single row: each column then has exactly one level, which is the one
    # that gets dropped, so every request would be encoded as the baseline.
    # A value with no matching indicator column (the baseline level, or a level
    # unseen in training) leaves all indicators of that feature at 0.
    for feature in categorical_features_for_encoding:
        indicator = f"{feature}_{input_df.at[0, feature]}"
        if indicator in expected_columns:
            input_df[indicator] = 1
    input_df = input_df.drop(columns=categorical_features_for_encoding)

    # Standardize only the columns the scaler was actually fitted on, and only
    # when the request supplies them. The store id and date parts were not
    # scaled during training, so they must reach the model unscaled.
    scaler_columns = list(getattr(scaler, 'feature_names_in_', []))
    for position, column in enumerate(scaler_columns):
        if column in expected_columns and column in input_df.columns:
            raw_value = float(input_df.at[0, column])
            input_df[column] = (raw_value - scaler.mean_[position]) / scaler.scale_[position]

    # Align to the training layout: extra request fields are dropped and absent
    # model columns become 0 (for a standardized column, 0 is the training mean).
    input_df_final = input_df.reindex(columns=expected_columns, fill_value=0)

    return input_df_final

# 3. Initialize the Flask application
app = Flask(__name__)


def _to_sales_units(model_output):
    """Convert a raw model output into sales units.

    When the loaded scaler was fitted with a 'Sales' column, the model's target
    was standardized with it, so the output is mapped back using that column's
    mean and scale. Otherwise the output is returned unchanged.
    """
    scaler_columns = list(getattr(scaler, 'feature_names_in_', []))
    if 'Sales' not in scaler_columns:
        return model_output
    position = scaler_columns.index('Sales')
    return model_output * scaler.scale_[position] + scaler.mean_[position]


# 4. Create a prediction endpoint
@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict: return the predicted sales for one JSON request.

    Responds with ``{'predicted_sales': <float>}`` on success, or with
    ``{'error': <message>}`` and HTTP status 400 when anything fails.
    """
    try:
        # force=True parses the body as JSON regardless of the Content-Type header
        data = request.get_json(force=True)
        if not isinstance(data, dict):
            raise ValueError("Request body must be a JSON object.")
        preprocessed_data = preprocess_input(data)
        prediction = final_model_lr.predict(preprocessed_data)
        # float() yields a plain Python number that jsonify can serialize
        return jsonify({'predicted_sales': float(_to_sales_units(prediction[0]))})
    except Exception as e:
        return jsonify({'error': str(e)}), 400

# Typical usage: save this code as app.py and start it with `flask run`.
# The guard below only prints usage hints when the file is executed directly;
# the development-server call is left commented out.
if __name__ == '__main__':
    for banner_line in (
        "Flask app loaded. You can run this file to start the development server.",
        "Example usage (if running locally): curl -X POST -H \"Content-Type: application/json\" -d '{\"Store_id\": 1, \"Store_Type\": \"S1\", \"Location_Type\": \"L1\", \"Region_Code\": \"R1\", \"Date\": \"2019-06-01\", \"Holiday\": 0, \"Discount\": \"No\"}' http://127.0.0.1:5000/predict",
    ):
        print(banner_line)
    # app.run(debug=True) # Uncomment to run the app, debug=True for development