In [None]:
# Step 1: Data Acquisition
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the Walmart CSV from the working directory and preview its first rows
data = pd.read_csv('Walmart.csv')
print(data.head(n=5))

In [None]:
# Display basic information
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
print("Dataset Info:")
data.info()


In [None]:
# Show the leading rows once more under a "Sample Data" heading
sample_rows = data.head()
print("\nSample Data:")
print(sample_rows)

In [None]:
# Step 2: Data Cleaning/Preprocessing
# Parse the day-month-year strings in 'Date' into datetime values
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

# Extract useful time-based features
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
# isocalendar().week is returned as the nullable UInt32 extension dtype. Cast it
# to a plain int64 so that dtype-based column selection further down (which
# looks for 'int64'/'float64') does not silently skip this feature.
# NOTE(review): the cast assumes no missing dates — confirm against the data.
data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype('int64')



In [None]:
# Count the null entries in every column and print them under a heading
null_counts = data.isnull().sum()
print("\nMissing Values:\n", null_counts)


In [None]:
# Per-column null counts (same figures as the previous cell, without heading)
print(data.isna().sum())

In [None]:
# Discard fully duplicated rows; the first occurrence of each row is kept
data = data.drop_duplicates()


In [None]:
# Handle missing values
# Forward-fill: every missing entry takes the last valid value above it in the
# same column. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` argument is deprecated in current pandas.
data = data.ffill()



In [None]:
# Encoding categorical variables
import pandas as pd

# Read a second copy of the CSV; this frame is not modified by the cleaning
# steps applied to `data`
walmart_data = pd.read_csv("Walmart.csv")  # Make sure this file exists

# Partition the column names by dtype: text/categorical versus numeric
categorical_cols = list(walmart_data.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(walmart_data.select_dtypes(include=['number']).columns)

# Report what was found
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)






In [None]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

# Load the dataset
df = pd.read_csv("Walmart.csv")

# Convert Date column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Define features (X) and target (y)
if 'Weekly_Sales' in df.columns:
    # TimeSeriesSplit cuts the rows by position, so "train comes before test"
    # only holds when the rows are in chronological order. A stable sort keeps
    # the file's original row order within each date.
    df_sorted = df.sort_values('Date', kind='stable')

    X = df_sorted.drop(columns=['Weekly_Sales', 'Date'])  # Remove target & date
    y = df_sorted['Weekly_Sales']

    # Apply Time Series Split
    tscv = TimeSeriesSplit(n_splits=5)

    # Only the last fold (the one with the largest training window) is kept,
    # which is what the original loop left behind after its final iteration.
    train_index, test_index = list(tscv.split(X))[-1]
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Print dataset split details
    print("\nDataset Splits:")
    print("Training set shape:", X_train.shape)
    print("Test set shape:", X_test.shape)
else:
    print("Error: 'Weekly_Sales' column not found in dataset!")




In [None]:
# Feature Engineering: Extract date-related features
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
# isocalendar().week is a nullable UInt32 column; cast to int64 so that the
# dtype-based feature selection used for modelling does not leave it out.
# NOTE(review): the cast assumes no missing dates — confirm against the data.
data['WeekOfYear'] = data['Date'].dt.isocalendar().week.astype('int64')
data['DayOfWeek'] = data['Date'].dt.dayofweek
# Plain copy of the existing holiday indicator under a second name
data['IsHoliday'] = data['Holiday_Flag']




In [None]:
# Forward-fill Weekly_Sales as it's a time-series data
# Assign the result back to the column: an in-place fillna on the selected
# column is a chained assignment that is not guaranteed to reach `data`, and
# fillna(method='ffill') is deprecated in favour of ffill().
data['Weekly_Sales'] = data['Weekly_Sales'].ffill()



In [None]:
# Add Lag Features
# The frame mixes rows from different stores (see the 'Store' column), so the
# lags are taken within each store. A plain shift over the whole column would
# hand the last sales of one store to the first rows of the next one.
# Sorting by store and date first makes "previous row" mean "previous week".
data = data.sort_values(['Store', 'Date'])
for lag in [1, 2, 3, 4]:
    data[f'Lag_{lag}'] = data.groupby('Store')['Weekly_Sales'].shift(lag)

# Drop rows that contain any missing value; this removes the leading rows of
# every store, for which no lagged sales exist
data = data.dropna()


In [None]:
# Separate the target column from the predictors
y = data['Weekly_Sales']
# Every remaining column except the raw 'Date' is kept as a feature
X = data.drop(['Weekly_Sales', 'Date'], axis=1)



In [None]:
# Standardize Specific Columns
def standardize_columns(df, columns):
    """Return a copy of ``df`` with the given columns z-scored.

    Each listed column is replaced by ``(x - mean) / std`` using the column's
    own mean and sample standard deviation (pandas default, ddof=1).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    columns : iterable of str
        Names of the columns to standardize.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns standardized. A column whose
        standard deviation is zero or undefined (constant column, single row)
        is only centred, so it becomes 0.0 instead of NaN.
    """
    standardized = df.copy()  # leave the caller's frame untouched
    for col in columns:
        centered = standardized[col] - standardized[col].mean()
        spread = standardized[col].std()
        if pd.isna(spread) or spread == 0:
            # Dividing by a zero/NaN spread would turn the column into NaN
            standardized[col] = centered * 0.0
        else:
            standardized[col] = centered / spread
    return standardized

# Columns to z-score; this rebinds `numerical_cols`, replacing the list that
# was derived from dtypes earlier in the notebook
numerical_cols = [
    'Weekly_Sales',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
]
data = standardize_columns(data, columns=numerical_cols)


In [None]:
# Outlier detection using Z-score
def remove_outliers_zscore(df, columns, threshold=3):
    """Drop rows whose absolute z-score reaches ``threshold`` in any column.

    Z-scores are computed once per column on the full input frame (mean and
    sample standard deviation of that column), so the outcome does not depend
    on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Columns to screen for outliers.
    threshold : float, default 3
        Rows with ``|z| >= threshold`` in any screened column are removed.
        Rows with a missing value in a screened column are removed as well,
        because the comparison is False for NaN.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed the check for every column. Columns
        with a zero or undefined standard deviation are skipped, since no
        z-score can be formed for them.
    """
    keep = np.ones(len(df), dtype=bool)
    for col in columns:
        spread = df[col].std()
        if pd.isna(spread) or spread == 0:
            # Constant/degenerate column: every z-score would be NaN and the
            # comparison below would throw away all rows
            continue
        z_scores = np.abs((df[col] - df[col].mean()) / spread)
        keep &= (z_scores < threshold).to_numpy()
    return df[keep]


In [None]:
# Screen the same five columns for outliers, then summarise what is left
numerical_cols = [
    'Weekly_Sales',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
]
data = remove_outliers_zscore(data, columns=numerical_cols)
summary_stats = data.describe()
print("\nSummary Statistics:\n", summary_stats)



In [None]:
# Column-wise mean of the numeric columns
column_means = data.mean(numeric_only=True)
print("\nMean:\n", column_means)


In [None]:
# Column-wise median of the numeric columns
column_medians = data.median(numeric_only=True)
print("\nMedian:\n", column_medians)


In [None]:
# Column-wise sample standard deviation of the numeric columns
column_stds = data.std(numeric_only=True)
print("\nStandard Deviation:\n", column_stds)


In [None]:
# Visualizations
# One 20-bin histogram per column of `data`, drawn on a shared 12x8 figure
hist_axes = data.hist(figsize=(12, 8), bins=20)
plt.suptitle("Feature Distributions")
plt.show()


In [None]:

# Correlation Matrix
# Restrict the correlation to numeric columns: `data` also carries the
# datetime 'Date' column, which has no place in a Pearson correlation. This
# matches the numeric_only=True used for the mean/median/std summaries.
correlations = data.corr(numeric_only=True)
plt.figure(figsize=(12, 10))
sns.heatmap(correlations, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()



In [None]:
# Feature Distributions
# Keep only those candidate columns that actually exist in the raw frame
features_to_plot = [
    col
    for col in ('Temperature', 'Fuel_Price', 'Unemployment')
    if col in walmart_data.columns
]
if len(features_to_plot) > 0:
    n_panels = len(features_to_plot)
    plt.figure(figsize=(15, 5))
    # One histogram (with KDE overlay) per feature, laid out side by side
    for i, feature in enumerate(features_to_plot, start=1):
        plt.subplot(1, n_panels, i)
        sns.histplot(walmart_data[feature], bins=30, kde=True)
        plt.title(f'{feature} Distribution')
    plt.show()
    

In [None]:

# Box plot of the numerical features.
# Only numeric columns are plotted (the datetime 'Date' column is excluded),
# and 'Store' is left out as in the original cell.
numeric_features = data.select_dtypes(include='number').drop(columns='Store')
plt.figure(figsize=(12, 6))
sns.boxplot(data=numeric_features)
plt.title("Box Plot of Numerical Features")
plt.show()



In [None]:
from sklearn.linear_model import LinearRegression  # Import model
import matplotlib.pyplot as plt
import seaborn as sns

# Fit a linear regression on whatever train split is currently in scope
# (X_train / y_train must already exist when this cell runs)
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out rows
predictions = model.predict(X_test)

# Collapse y_test to a Series if it arrived as a one-column DataFrame
y_test = y_test.squeeze() if isinstance(y_test, pd.DataFrame) else y_test

# Residual = actual minus predicted
residuals = y_test - predictions

# Residuals against predictions, with a dashed reference line at zero
plt.figure(figsize=(8, 5))
sns.scatterplot(x=predictions, y=residuals, alpha=0.5)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted Sales')
plt.ylabel('Residuals')
plt.title('Residual Plot - Walmart Sales Prediction')
plt.show()


In [None]:
# Time Series Trend Analysis
# `data` holds many rows per date (one per store), so drawing the raw column
# as a single line joins unrelated rows. Average per date first to obtain one
# value per week, ordered by date.
mean_sales_by_date = data.groupby('Date')['Weekly_Sales'].mean()
plt.figure(figsize=(12, 6))
plt.plot(mean_sales_by_date.index, mean_sales_by_date.values, label='Mean Weekly Sales (all stores)')
plt.title('Weekly Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Weekly Sales')
plt.legend()
plt.show()



In [None]:
# Histogram (30 bins, KDE overlay) of the raw weekly sales figures
raw_sales = walmart_data['Weekly_Sales']
plt.figure(figsize=(8, 5))
sns.histplot(raw_sales, kde=True, bins=30)
plt.title('Distribution of Weekly Sales')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.show()




In [None]:
# One box per store showing the spread of its raw weekly sales
plt.figure(figsize=(12, 5))
sns.boxplot(data=walmart_data, x='Store', y='Weekly_Sales')
plt.title('Sales Distribution by Store')
plt.xlabel('Store')
plt.ylabel('Weekly Sales')
plt.xticks(rotation=45)  # tilt the store labels
plt.show()


In [None]:
# Seasonal Decomposition
# seasonal_decompose expects one observation per time step. `data` has many
# rows per date (one per store), so the sales are averaged per date first and
# the resulting date-ordered series is decomposed with a 52-step period.
# NOTE(review): an additive decomposition with period=52 needs at least 104
# dates — confirm the data spans two full years.
sales_by_date = data.groupby('Date')['Weekly_Sales'].mean().sort_index()
result = sm.tsa.seasonal_decompose(sales_by_date, model='additive', period=52)
result.plot()
plt.show()



In [None]:
# Residual Analysis
# "Residual" here is each row's Weekly_Sales minus the overall column mean
deviation_from_mean = data['Weekly_Sales'] - data['Weekly_Sales'].mean()
plt.figure(figsize=(10, 6))
plt.scatter(data.index, deviation_from_mean)
plt.title("Residual Analysis")
plt.xlabel("Index")
plt.ylabel("Residuals")
plt.show()



In [None]:
# Step 4: Predictive Analysis
# Predictors: store id, holiday flag, economic indicators, date parts and the
# first two sales lags. Target: Weekly_Sales.
feature_columns = [
    'Store', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'Year', 'Month', 'WeekOfYear', 'Lag_1', 'Lag_2',
]
X = data[feature_columns]
y = data['Weekly_Sales']


In [None]:
# Only OneHotEncoder is new in this cell; every other name used below is
# already imported in the notebook's first cell.
from sklearn.preprocessing import OneHotEncoder


# Define features and target variable
X = data[['Store', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'WeekOfYear']]
y = data['Weekly_Sales']

# Split data: 80% train / 20% test, shuffled with a fixed seed.
# NOTE(review): this is a random split of dated rows — confirm that a shuffled
# hold-out (rather than a chronological one) is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical features.
# 'number' is used instead of an explicit ['int64', 'float64'] list so that
# integer columns of other widths (such as date parts derived via .dt) are
# not missed; ColumnTransformer drops every column it is not told about.
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Define preprocessing: scale numeric columns, one-hot encode categorical ones
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_features),
        ('cat', cat_transformer, categorical_features)
    ]
)

# Dictionary of models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# Training & Evaluation (models are fitted on the raw, unscaled features)
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # RMSE as the square root of MSE; the `squared=False` shortcut of
    # mean_squared_error is no longer available in recent scikit-learn.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    print(f"{name}:")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R² Score: {r2:.2f}\n")



In [None]:
# Scale the data
# Learn mean/variance on the training rows only, then apply the same
# transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# 1. Training data: shapes and total count of missing values
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
missing_in_features = X_train.isnull().sum().sum()
missing_in_target = y_train.isnull().sum()
print("Missing values in X_train:", missing_in_features)
print("Missing values in y_train:", missing_in_target)

# 2. Show the configured preprocessor
print("Preprocessor:", preprocessor)

# 3. Smoke test: fit a plain random forest and report its score on the test split
test_model = RandomForestRegressor(n_estimators=100, random_state=42)
test_model.fit(X_train, y_train)
simple_score = test_model.score(X_test, y_test)
print("Simple model test score:", simple_score)



In [None]:
from sklearn.pipeline import Pipeline

# Pipeline: column preprocessing followed by a random forest
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

# Hyperparameter grid; the 'model__' prefix routes each entry to the forest step
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, 20, None],
    model__min_samples_split=[2, 5, 10],
)

# Exhaustive search with 5-fold cross-validation, scored by R^2, on all cores
grid_search = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best R^2 Score:", grid_search.best_score_)


In [None]:

# Function to Evaluate Model Performance
def evaluate_model(y_true, y_pred, model_name):
    """Print RMSE, MAE, MAPE and R^2 for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label printed in the heading line.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    print(f"\n{model_name} Performance:")
    report = (
        ("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred))),
        ("MAE:", mean_absolute_error(y_true, y_pred)),
        ("MAPE:", mean_absolute_percentage_error(y_true, y_pred)),
        ("R^2 Score:", r2_score(y_true, y_pred)),
    )
    for label, value in report:
        print(label, value)
    

In [None]:
# Model Evaluation
# Score the refitted best pipeline from the grid search on the test split
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
mae, mse, r2 = (
    metric(y_test, predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\nBest Model Parameters:")
print(grid_search.best_params_)
print("\nModel Performance:")
for line in (
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R-Squared Score: {r2}",
):
    print(line)


In [None]:
# Base Models
# Display name -> unfitted estimator; insertion order is the evaluation order
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
models['XGBoost'] = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
models['Decision Tree'] = DecisionTreeRegressor(max_depth=20, random_state=42)
models['Support Vector Regression'] = SVR()




In [None]:
# Fit every base model on the scaled training data and print its test metrics
for name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    evaluate_model(y_test, y_pred, name)
    
    

In [None]:
# Step 4: Hyperparameter Tuning (Random Forest)
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30]
}

# Seed the forest like every other estimator in this notebook so that the
# selected parameters and reported metrics are reproducible between runs.
grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid_search_rf.fit(X_train_scaled, y_train)
print("\nBest Parameters for Random Forest:", grid_search_rf.best_params_)

# Evaluate the refitted best forest on the scaled test split
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test_scaled)
evaluate_model(y_test, y_pred_rf, "Tuned Random Forest")

In [None]:
# Step 6: Residual Analysis
# Histogram of actual minus predicted values, using whatever `predictions`
# currently holds, with a dashed marker at zero error
residuals = y_test - predictions
plt.figure(figsize=(8, 5))
sns.histplot(residuals, kde=True, bins=30)
plt.axvline(0, color='red', linestyle='--')
plt.title("Residual Distribution (Prediction Errors)")
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Pull the fitted estimator out of the best grid-search pipeline
best_model = grid_search.best_estimator_
model = best_model.named_steps['model']

if not hasattr(model, 'feature_importances_'):
    # Estimators without importances cannot be ranked this way
    print("Error: Model does not have 'feature_importances_' attribute.")
else:
    # Output column names as produced by the preprocessing step
    feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
    feature_importances = model.feature_importances_

    # Pair names with importances, most important first
    feature_imp_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    feature_imp_df = feature_imp_df.sort_values(by='Importance', ascending=False)

    # Horizontal bar chart of the ranking
    sns.barplot(data=feature_imp_df, x='Importance', y='Feature')
    plt.title('Feature Importance')
    plt.show()

    # Print the five highest-ranked features
    print("\nKey Drivers of Sales:")
    top_five = feature_imp_df.head(5)
    print(top_five.to_string(index=False))


In [None]:
# Sales Trend Visualization
# walmart_data was read straight from the CSV, so its 'Date' column still
# holds day-month-year strings. Parse them so the x axis is a real time axis
# instead of one category per distinct string.
sales_dates = pd.to_datetime(walmart_data['Date'], format='%d-%m-%Y')
plt.figure(figsize=(12, 6))
sns.lineplot(x=sales_dates, y=walmart_data['Weekly_Sales'])
plt.title('Weekly Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.show()


In [None]:
# Spread of raw weekly sales for each store, store labels drawn vertically
plt.figure(figsize=(10, 5))
sns.boxplot(data=walmart_data, x='Store', y='Weekly_Sales')
plt.xticks(rotation=90)
plt.title('Sales Distribution Per Store')
plt.show()


In [None]:

# Step 7: Model Comparison Table
# Collect one record per fitted model and build the frame once. Growing a
# DataFrame with pd.concat inside the loop copies it on every iteration and
# starts from an empty frame whose columns have no numeric dtype.
metric_rows = []
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    metric_rows.append({
        'Model': name,
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'MAPE': mean_absolute_percentage_error(y_test, y_pred),
        'R2 Score': r2_score(y_test, y_pred),
    })

results_df = pd.DataFrame(metric_rows, columns=['Model', 'RMSE', 'MAE', 'MAPE', 'R2 Score'])

# Best R^2 first
print("\nModel Performance Comparison:\n", results_df.sort_values(by="R2 Score", ascending=False))


In [None]:
# Step 8: Stacking Regressor
# Three tree-based base learners whose predictions feed a linear meta-model
base_learners = [
    ('rf', RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('xgb', XGBRegressor(n_estimators=100, random_state=42)),
]
stacked_model = StackingRegressor(estimators=base_learners, final_estimator=LinearRegression())

# Fit on the scaled training data, then score on the scaled test split
stacked_model.fit(X_train_scaled, y_train)
y_pred_stacked = stacked_model.predict(X_test_scaled)
evaluate_model(y_test, y_pred_stacked, "Stacking Regressor")


In [None]:
# Stacking regressor errors (actual minus predicted) plotted against the
# actual values, with a dashed zero line for reference
stacked_residuals = y_test - y_pred_stacked
plt.figure(figsize=(8,5))
plt.scatter(y_test, stacked_residuals, alpha=0.5)
plt.axhline(0, color='r', linestyle='--')
plt.xlabel("Actual Weekly Sales")
plt.ylabel("Residuals (Error)")
plt.title("Residual Plot for Stacking Regressor")
plt.show()


In [None]:
# Step 9: Enhanced Conclusions
# NOTE(review): these statements are fixed text, not derived from the metrics
# computed above — confirm they still match the latest run.
print("\nEnhanced Conclusions:")
for conclusion in (
    "1. Stacking Regressor outperformed other models with the best R² score.",
    "2. 'Holiday_Flag' and 'Lag Features' were the most impactful predictors.",
    "3. Future work: Explore deep learning methods like LSTM for better forecasting.",
):
    print(conclusion)

In [None]:
# Conclusion / discussion: print the closing remarks one per line
print("\nConclusions:")
for remark in (
    "1. Various models provide a comparative perspective on sales prediction accuracy.",
    "2. Feature engineering enhances model performance by capturing temporal patterns.",
    "3. Residual and seasonal analysis uncovers patterns and anomalies.",
    "4. Future work: Explore advanced forecasting techniques and model interpretability improvements.",
):
    print(remark)


In [None]:
!pip install mlxtend
