# 🛒 Supermart Grocery Sales - Retail Analytics Project
This project explores sales data from a fictional grocery delivery app. The focus is on data cleaning, exploratory analysis, and predictive modeling using linear regression.


In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Step 2: Load Dataset
# Read the sales CSV (path relative to the working directory) into `df`
# and preview its first rows.
data_file = 'Supermart Grocery Sales - Retail Analytics Dataset.csv'
df = pd.read_csv(data_file)
df.head()

In [None]:
# Step 3: Data Cleaning
# Remove rows with any missing value, then remove exact duplicate rows.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Parse 'Order Date' to datetime and derive calendar parts from it.
# Columns are created in a fixed order because later steps use the frame's
# column order as-is.
df['Order Date'] = pd.to_datetime(df['Order Date'])
order_dates = df['Order Date']
df['Order Day'] = order_dates.dt.day
df['Order Month'] = order_dates.dt.month
df['Order Year'] = order_dates.dt.year
# NOTE: 'month_no' and 'year' repeat 'Order Month' and 'Order Year';
# both spellings are kept because later cells reference them.
df['month_no'] = order_dates.dt.month
df['Month'] = order_dates.dt.strftime('%B')  # full month name, e.g. "January"
df['year'] = order_dates.dt.year

In [None]:
# Step 4: Label Encoding
# Replace each categorical column with integer codes.
#
# One encoder is fitted per column and kept in `label_encoders`, so the codes
# can be turned back into the original labels later, e.g.
#     label_encoders['Category'].inverse_transform(codes)
# Refitting a single shared encoder would discard the mapping of every column
# except the last one.
#
# NOTE: LabelEncoder numbers classes in sorted order, so the 'Month' codes are
# alphabetical, not calendar order; 'month_no' holds the calendar month number.
label_encoders = {}
for col in ['Category', 'Sub Category', 'City', 'Region', 'State', 'Month']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on 'Month', as before.
df.head()

In [None]:
# Step 5: Exploratory Data Analysis (EDA)
# Bar chart of total Sales per Category value.
# 'Category' was label-encoded in Step 4, so the x-axis shows integer codes.
plt.figure(figsize=(10, 6))
sales_by_category = df.groupby("Category")["Sales"].sum()
sales_by_category.plot.bar()
plt.ylabel('Sales')
plt.title('Sales by Category')
plt.show()

# Line chart of total Sales per month number; grouping is by 'month_no' only,
# so the same month from different years is summed together.
plt.figure(figsize=(10, 6))
monthly_sales = df.groupby('month_no')['Sales'].sum()
monthly_sales.plot(marker='o')
plt.ylabel('Sales')
plt.title('Monthly Sales Trend')
plt.show()

# Heatmap of pairwise correlations between the numeric columns.
# `df` still contains string columns ('Order ID', 'Customer Name') and the
# datetime 'Order Date'. Since pandas 2.0, DataFrame.corr() no longer drops
# non-numeric columns by default and raises on them, so the frame is narrowed
# to numeric dtypes explicitly before correlating.
plt.figure(figsize=(12,6))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Step 6: Model Building
# Every remaining column is a model input except the ones listed here;
# 'Sales' is the prediction target.
excluded_cols = ['Order ID', 'Customer Name', 'Order Date', 'Sales']
target = df['Sales']
features = df.drop(columns=excluded_cols)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same scaling to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit ordinary least squares on the scaled training data and predict the test rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Step 7: Evaluation
# Compare the test-set predictions with the actual Sales values and report
# both metrics rounded to two decimals, one per line.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}', f'R-squared: {r2:.2f}', sep='\n')

In [None]:
# Step 8: Visualization of Predictions
# Scatter of actual vs predicted Sales for the test rows.
plt.scatter(y_test, y_pred)

# Dashed red y = x reference line spanning the actual-value range; points on
# this line are exact predictions.
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, 'r--')

plt.title('Actual vs Predicted Sales')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.show()