## Data Exploration and Preprocessing:
1: Import Libraries and Set Up the Environment

In [49]:
# Import necessary libraries
import pandas as pd
import os

# Print the current working directory to ensure we are in the correct folder
print("Current working directory:", os.getcwd())


Current working directory: c:\myproject\env\transactions.csv


In [60]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

## 2: Load the Datasets

In [62]:
# Load the datasets
train_df = pd.read_csv(r'C:\myproject\env\train.csv\train.csv')
transactions_df = pd.read_csv(r'C:\myproject\env\transactions.csv\transactions.csv')
test_df = pd.read_csv(r'C:\myproject\env\test.csv')  
us_retail_sales_df = pd.read_csv(r'C:\myproject\env\us-retail-sales.csv')



# Preview the first few rows of each dataset to understand their structure
print("\nTrain Dataset:")
print(train_df.head(10))

print("\nTransactions Dataset:")
print(transactions_df.head(10))

print("\nTest Dataset:")
print(test_df.head(10))

print("\nUS Retail Sales Dataset:")
print(us_retail_sales_df.head(10))



Train Dataset:
   id        date  store_nbr        family  sales  onpromotion
0   0  2013-01-01          1    AUTOMOTIVE    0.0            0
1   1  2013-01-01          1     BABY CARE    0.0            0
2   2  2013-01-01          1        BEAUTY    0.0            0
3   3  2013-01-01          1     BEVERAGES    0.0            0
4   4  2013-01-01          1         BOOKS    0.0            0
5   5  2013-01-01          1  BREAD/BAKERY    0.0            0
6   6  2013-01-01          1   CELEBRATION    0.0            0
7   7  2013-01-01          1      CLEANING    0.0            0
8   8  2013-01-01          1         DAIRY    0.0            0
9   9  2013-01-01          1          DELI    0.0            0

Transactions Dataset:
         date  store_nbr  transactions
0  2013-01-01         25           770
1  2013-01-02          1          2111
2  2013-01-02          2          2358
3  2013-01-02          3          3487
4  2013-01-02          4          1922
5  2013-01-02          5          

In [86]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load your dataset
df = pd.read_csv('C:\myproject\env\transactions.csv\transactions.csv')  # Replace with your dataset path

# Check the first few rows
print(df.head())

# Sales Distribution
plt.figure(figsize=(10,6))
sns.histplot(df['sales'], kde=True, color='blue')
plt.title('Sales Distribution')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.show()

# Sales vs. OnPromotion
plt.figure(figsize=(10,6))
sns.boxplot(x='onpromotion', y='sales', data=df, palette='Set2')
plt.title('Sales vs. On Promotion')
plt.xlabel('On Promotion')
plt.ylabel('Sales')
plt.show()

# Sales by Store Number (barplot)
plt.figure(figsize=(12,6))
sns.barplot(x='store_nbr', y='sales', data=df, palette='viridis')
plt.title('Average Sales per Store')
plt.xlabel('Store Number')
plt.ylabel('Average Sales')
plt.show()

# Sales vs. Family Type
plt.figure(figsize=(12,6))
sns.boxplot(x='family', y='sales', data=df, palette='Spectral')
plt.title('Sales Distribution by Family Type')
plt.xlabel('Family')
plt.ylabel('Sales')
plt.xticks(rotation=90)  # Rotate for better visibility
plt.show()


  df = pd.read_csv('C:\myproject\env\transactions.csv\transactions.csv')  # Replace with your dataset path
  df = pd.read_csv('C:\myproject\env\transactions.csv\transactions.csv')  # Replace with your dataset path


OSError: [Errno 22] Invalid argument: 'C:\\myproject\\env\transactions.csv\transactions.csv'

## 3: Check for Missing Values in the Datasets

In [63]:
# 3. Data Preprocessing
# Handle missing values
train_df.fillna(0, inplace=True)
transactions_df.fillna(0, inplace=True)
us_retail_sales_df.fillna(0, inplace=True)

## 4: Convert Date Columns to Datetime Format
Since date columns are often in string format, converting them to datetime ensures proper handling of time-based operations.

In [64]:
# Convert date columns to datetime format
train_df['date'] = pd.to_datetime(train_df['date'])
transactions_df['date'] = pd.to_datetime(transactions_df['date'])
us_retail_sales_df['Month'] = pd.to_datetime(us_retail_sales_df['Month'], format='%Y-%m')


In [65]:
# 4. Merge Datasets
merged_df = pd.merge(train_df, transactions_df, on=['date', 'store_nbr'], how='left')
merged_df = pd.merge(merged_df, us_retail_sales_df, left_on='date', right_on='Month', how='left')
merged_df.drop(columns=['Month'], inplace=True)


In [66]:

# Fill any remaining missing values
merged_df['transactions'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['transactions'].fillna(0, inplace=True)


In [67]:
# 5. Feature Engineering
# Extract date-related features
merged_df['year'] = merged_df['date'].dt.year
merged_df['month'] = merged_df['date'].dt.month
merged_df['day'] = merged_df['date'].dt.day
merged_df['day_of_week'] = merged_df['date'].dt.weekday
merged_df['quarter'] = merged_df['date'].dt.quarter

In [68]:
# Select numerical columns for scaling
numerical_cols = ['sales', 'transactions']
scaler = StandardScaler()
merged_df[numerical_cols] = scaler.fit_transform(merged_df[numerical_cols])


In [69]:
# 6. Define Features and Target
X = merged_df[['store_nbr', 'transactions', 'year', 'month', 'day', 'day_of_week', 'quarter']]
y = merged_df['sales']

In [70]:
# 7. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [71]:
# 8. Model Training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [72]:
# 9. Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-Squared: {r2}")

Mean Squared Error: 1.0133439395649706
R-Squared: 0.009823217647214055


In [87]:
from sklearn.ensemble import GradientBoostingRegressor
model_gb = GradientBoostingRegressor(random_state=42)
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)

print(f"Mean Squared Error (GB): {mean_squared_error(y_test, y_pred_gb)}")
print(f"R-Squared (GB): {r2_score(y_test, y_pred_gb)}")


Mean Squared Error (GB): 0.953064332196099
R-Squared (GB): 0.06872470739374814


In [73]:
# Save the trained model and scaler for later use in the API.
joblib.dump(model, "sales_forecast_model.joblib")
joblib.dump(scaler, "scaler.joblib")


['scaler.joblib']

In [74]:
# 10. FastAPI Setup
app = FastAPI()

# Define input data structure using Pydantic for validation
class SalesPredictionInput(BaseModel):
    store_nbr: int
    transactions: float
    year: int
    month: int
    day: int
    day_of_week: int
    quarter: int

In [75]:
# Load the saved model and scaler for use in the prediction function
loaded_model = joblib.load("sales_forecast_model.joblib")
loaded_scaler = joblib.load("scaler.joblib")


In [76]:
# Define a prediction endpoint
@app.post("/predict_sales")
def predict_sales(input_data: SalesPredictionInput):
    # Prepare input data and scale the transactions feature
    data = [[
        input_data.store_nbr,
        input_data.transactions,
        input_data.year,
        input_data.month,
        input_data.day,
        input_data.day_of_week,
        input_data.quarter
    ]]
    
    # Apply scaling to the transactions field
    data[0][1] = loaded_scaler.transform([[input_data.transactions]])[0][0]
    
    # Generate a prediction using the model
    prediction = loaded_model.predict(data)
    return {"predicted_sales": prediction[0]}


In [78]:
import joblib
from sklearn.ensemble import RandomForestRegressor

# Assuming 'model' is your trained RandomForestRegressor or other model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)  # X_train, y_train should be your training data

# Save the trained model
joblib.dump(model, 'sales_model.pkl')


['sales_model.pkl']

In [80]:
import os
print(os.listdir())  # This will show the files in the current directory


['# 1.py', 'sales_forecast_model.joblib', 'sales_model.pkl', 'scaler.joblib', 'transactions.csv', 'Untitled-1.ipynb']


In [81]:
# Load the trained model
model = joblib.load('sales_model.pkl')

# Make predictions with the model
# Assuming X_test is your test data
predictions = model.predict(X_test)
print(predictions)


[ 0.45381799 -0.32466111  0.06309749 ... -0.25014475  0.1609063
  0.41452831]


In [82]:
# Import necessary libraries
from fastapi import FastAPI
from pydantic import BaseModel
import joblib

# Load your trained model
model = joblib.load('sales_model.pkl')

# Define a Pydantic model for input data
class ModelInput(BaseModel):
    feature1: float
    feature2: float

# Initialize FastAPI app
app = FastAPI()

# Define an endpoint for predictions
@app.post("/predict")
def predict(input_data: ModelInput):
    # Create a DataFrame or array for model input
    data = [[input_data.feature1, input_data.feature2]]
    prediction = model.predict(data)
    return {"prediction": prediction[0]}



In [None]:
import joblib
from sklearn.ensemble import RandomForestRegressor

# Assuming 'model' is your trained RandomForestRegressor or other model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)  # X_train, y_train should be your training data

# Save the trained model
joblib.dump(model, 'sales_model.pkl')


['sales_model.pkl']

In [84]:
from fastapi import FastAPI

# Create FastAPI app
app = FastAPI()

# Define a root endpoint
@app.get("/")
def read_root():
    return {"message": "Hello, FastAPI!"}


In [85]:
from fastapi import FastAPI

# Initialize FastAPI app
app = FastAPI()

# Define a root endpoint
@app.get("/")
def read_root():
    return {"message": "Hello, FastAPI!"}
