In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the three Walmart input tables from the working directory.
# 'Date' is parsed into datetimes for the two files that are read with it.
train = pd.read_csv(
    'train.csv',
    parse_dates=['Date'],
)
features = pd.read_csv(
    'features.csv',
    parse_dates=['Date'],
)
stores = pd.read_csv('stores.csv')

In [None]:
# Preview the first rows of the table loaded from train.csv.
train.head()

In [None]:
# Preview the first rows of the table loaded from features.csv.
features.head()

In [None]:
# Preview the first rows of the table loaded from stores.csv.
stores.head()

In [None]:
# Merge the train, features and stores tables into a single frame.
# validate='many_to_one' makes pandas raise a MergeError when the right-hand
# table contains duplicate join keys, instead of silently duplicating
# training rows in the left join.
train_data = (
    train
    .merge(features, on=['Store', 'Date', 'IsHoliday'], how='left', validate='many_to_one')
    .merge(stores, on='Store', how='left', validate='many_to_one')
)

# A left join against unique right-hand keys keeps exactly one row per training row.
assert len(train_data) == len(train), "merge changed the number of training rows"

# Bare last expression so the notebook renders the preview as a rich table.
train_data.head()

In [None]:
# Convert every column to a numeric representation so the model can use it.
# Keeping only the already-numeric columns would silently discard the rest
# (for example the 'Date' column, which is parsed as a datetime on load).
numeric_columns = {}
for column_name, column in train_data.items():
    if pd.api.types.is_bool_dtype(column):
        # Checked before the numeric test because bool also counts as numeric.
        numeric_columns[column_name] = column.astype(int)
    elif pd.api.types.is_numeric_dtype(column):
        numeric_columns[column_name] = column
    elif pd.api.types.is_datetime64_any_dtype(column):
        # Expose the calendar position of each timestamp as separate features.
        numeric_columns[f'{column_name}_Year'] = column.dt.year
        numeric_columns[f'{column_name}_Month'] = column.dt.month
        # isocalendar() yields a nullable integer; float64 tolerates missing dates.
        numeric_columns[f'{column_name}_Week'] = column.dt.isocalendar().week.astype('float64')
    else:
        # Remaining (text / categorical) columns become integer category codes;
        # missing values are encoded as -1 by pandas.
        numeric_columns[column_name] = column.astype('category').cat.codes
train_data = pd.DataFrame(numeric_columns, index=train_data.index)

# Fill remaining gaps with the per-column mean (no inplace mutation, so the
# cell can be re-run safely).
# NOTE(review): the means are computed on the full table, before the
# train/test split further down — consider imputing from the training rows only.
train_data = train_data.fillna(train_data.mean())

In [None]:
# Data analysis: histogram (with KDE overlay) of the Weekly_Sales target.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    train_data['Weekly_Sales'],
    bins=50,
    kde=True,
    ax=ax,
)
ax.set_title('Distribution of Weekly Sales')
plt.show()

In [None]:
# Box plot of Weekly_Sales grouped by Dept; tick labels are rotated so the
# department values on the x axis do not overlap.
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(
    data=train_data,
    x='Dept',
    y='Weekly_Sales',
    ax=ax,
)
plt.xticks(rotation=90)
ax.set_title('Weekly Sales by Department')
plt.show()

In [None]:
# Target: the Weekly_Sales column. Features: every other column of train_data.
y = train_data['Weekly_Sales']
X = train_data.drop('Weekly_Sales', axis=1)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a random forest regressor on the training split and predict the test split.
# n_jobs=-1 lets scikit-learn build and query the trees on all available
# processors instead of a single one; random_state keeps the forest
# reproducible between runs.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Evaluate the predictions on the held-out split.
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE computed above, so reuse it rather than
# recomputing the squared errors by hand.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2}')
print(f'RMSE: {rmse}')

In [None]:
# Display the predictions produced for the test split.
y_pred

In [None]:
# Display the held-out target values that the predictions are compared against.
y_test

In [None]:
# Residual = actual minus predicted, for every row of the test split.
residuals = y_test - y_pred

# Figure 1: histogram (with KDE overlay) of the residuals.
fig_hist, ax_hist = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, bins=50, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Residuals')
plt.show()

# Figure 2: residuals against predictions, with a dashed zero-error reference line.
fig_scatter, ax_scatter = plt.subplots(figsize=(10, 6))
ax_scatter.scatter(y_pred, residuals)
ax_scatter.axhline(0, color='red', linestyle='--')
ax_scatter.set_title('Residuals vs Predictions')
ax_scatter.set_xlabel('Predicted Values')
ax_scatter.set_ylabel('Residuals')
plt.show()

In [None]:
# Collect actual values, predictions and their difference in one frame.
errors = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).assign(
    Error=lambda frame: frame['Actual'] - frame['Predicted']
)

# Overlay actual and predicted values, sample by sample in test-set order.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(errors['Actual'].to_numpy(), label='Actual')
ax.plot(errors['Predicted'].to_numpy(), label='Predicted', alpha=0.7)
ax.set_title('Actual vs Predicted Sales')
ax.set_xlabel('Samples')
ax.set_ylabel('Weekly Sales')
ax.legend()
plt.show()

In [None]:
# Scatter of actual against predicted values. The dashed diagonal spans the
# range of the actual values and marks where prediction equals actual.
actual_range = [errors['Actual'].min(), errors['Actual'].max()]

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(errors['Actual'], errors['Predicted'])
ax.plot(actual_range, actual_range, 'k--', lw=3)
ax.set_title('Actual vs Predicted Sales')
ax.set_xlabel('Actual Sales')
ax.set_ylabel('Predicted Sales')
plt.show()