# Walmart Sales Top 10 Stores Analysis and Prediction

### 2.1)Importing Libraries

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder , FunctionTransformer , LabelEncoder , PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score ,mean_absolute_percentage_error , accuracy_score
from sklearn.tree import DecisionTreeRegressor ,plot_tree ,DecisionTreeClassifier
from sklearn.linear_model import LinearRegression , LogisticRegression
from sklearn.neighbors import KNeighborsRegressor , KNeighborsClassifier
from statsmodels.tsa.seasonal import seasonal_decompose


### 2.2) Loading the Dataset

In [50]:
df= pd.read_csv("Walmart.csv")
df

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


In [9]:
# df["Date"]=pd.to_datetime(df["Date"], format='%d-%m-%Y')

### 3.4.1) Removing Outliers of Weekly Sales

In [51]:
print(df["Weekly_Sales"].min())
print(df["Weekly_Sales"].max())
print(df["Weekly_Sales"].std())
print(df["Weekly_Sales"].mean())

percentile25=df["Weekly_Sales"].quantile(0.25)
percentile75=df["Weekly_Sales"].quantile(0.75)

iqr = percentile75 - percentile25

upper_limit_weekly_sales=percentile75 +1.5 * iqr
lower_limit_weekly_sales=percentile75 -1.5 * iqr
print("Upper Limit:",upper_limit_weekly_sales)
print("Lower Limit:",lower_limit_weekly_sales)

print("Weekly Sales Above Upper Limit Data Points",df[df["Weekly_Sales"]>upper_limit_weekly_sales].shape)

print("Weekly Sales Below Lower Limit Data Points",df[df["Weekly_Sales"]<lower_limit_weekly_sales].shape)

df = df[df["Weekly_Sales"] < upper_limit_weekly_sales]



209986.25
3818686.45
564366.6220536975
1046964.8775617715
Upper Limit: 2720371.4924999997
Lower Limit: 119945.8274999999
Weekly Sales Above Upper Limit Data Points (34, 8)
Weekly Sales Below Lower Limit Data Points (0, 8)


In [52]:
dff = df.copy()
dff["Date"]=pd.to_datetime(dff["Date"], format='%d-%m-%Y')

# 1) Classification

In [53]:
mean_sales = df["Weekly_Sales"].mean()
min_sales = df["Weekly_Sales"].min()

# Create a new column for sales categories
conditions = [
    (df['Weekly_Sales'] > mean_sales),
    (df['Weekly_Sales'] <= mean_sales) & (df['Weekly_Sales'] > min_sales),
    (df['Weekly_Sales'] <= min_sales)
]
labels = ['High', 'Medium', 'Low']
df['Sales_Category'] = np.select(conditions, labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sales_Category'] = np.select(conditions, labels)


In [54]:
df["Sales_Category"]

0         High
1         High
2         High
3         High
4         High
         ...  
6430    Medium
6431    Medium
6432    Medium
6433    Medium
6434    Medium
Name: Sales_Category, Length: 6401, dtype: object

In [55]:
label_encoder = LabelEncoder()
df['Sales_Category']= label_encoder.fit_transform(df['Sales_Category'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sales_Category']= label_encoder.fit_transform(df['Sales_Category'])


In [56]:
df["Sales_Category"]

0       0
1       0
2       0
3       0
4       0
       ..
6430    2
6431    2
6432    2
6433    2
6434    2
Name: Sales_Category, Length: 6401, dtype: int32

In [57]:
df.set_index('Date', inplace=True)

In [58]:
X = df[['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']]
y = df['Sales_Category']

In [59]:
X_train_class,X_test_class , y_train_class , y_test_class = train_test_split(X,y,test_size=0.2)

In [60]:
X_train_class.shape

(5120, 4)

In [61]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_class)
X_test_scaled = scaler.transform(X_test_class)

In [62]:
X_train_scaled.shape

(5120, 4)

In [63]:
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_scaled, y_train_class)
lr_pred = lr_model.predict(X_test_scaled)
lr_score = accuracy_score(y_test_class, lr_pred)
print(f'Logistic Regression Accuracy Score: {lr_score:.2f}')


Logistic Regression Accuracy Score: 0.56


In [65]:
knn = KNeighborsClassifier(n_neighbors=5)  # Example: using k=5

# Fit the classifier on the scaled training data
knn.fit(X_train_scaled, y_train_class)

# Predict on the scaled test data
knn_pred = knn.predict(X_test_scaled)

# Calculate accuracy score
knn_score = accuracy_score(y_test_class, knn_pred)
print(f'KNN Accuracy Score: {knn_score:.2f}')


KNN Accuracy Score: 0.60


In [66]:
dt_model = DecisionTreeClassifier(max_depth=3, min_samples_split=6, min_samples_leaf=2)
dt_model.fit(X_train_scaled, y_train_class)
dt_pred = dt_model.predict(X_test_scaled)
dt_score = accuracy_score(y_test_class, dt_pred)
print(f'Decision Tree Accuracy Score: {dt_score:.2f}')


Decision Tree Accuracy Score: 0.61


In [47]:
# from sklearn import tree

# plt.figure(figsize=(20, 15))  # Adjust width and height as needed
# tree.plot_tree(dt_model, filled=True, fontsize=10)  # Lower fontsize for larger trees
# plt.show()

# Top 10 Store

In [71]:
# Load the dataset
df = pd.read_csv("Walmart.csv")

# Identify top 10 stores with highest weekly sales
top_stores = df.groupby('Store')['Weekly_Sales'].sum().nlargest(10).index
print(top_stores)

# Prepare FunctionTransformer for log transformation
trf = FunctionTransformer(func=np.log1p, validate=True)

# Prepare a list to hold the predictions for all top stores
all_predictions = []

for store in top_stores:
    store_data = df[df['Store'] == store].copy()
    
    # Sort by date to maintain temporal order
    store_data.sort_values('Date', inplace=True)
    
    # Log transform the target variable
    y = trf.fit_transform(store_data[['Weekly_Sales']].values.reshape(-1, 1)).flatten()
    
    # Feature selection
    X = store_data[['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']]
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Initialize and train the models
    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train_scaled, y_train)
    y_pred_lr = lr_model.predict(X_test_scaled)
    
    # Decision Tree
    dt_model = DecisionTreeRegressor(max_depth=3, min_samples_split=6, min_samples_leaf=2)
    dt_model.fit(X_train_scaled, y_train)
    y_pred_dt = dt_model.predict(X_test_scaled)
    
    # Simple K-Nearest Neighbors
    knn = KNeighborsRegressor(n_neighbors=5)  # Fixed number of neighbors
    knn.fit(X_train_scaled, y_train)
    y_pred_knn = knn.predict(X_test_scaled)
    
    # Inverse log transformation
    y_test_inv = np.expm1(trf.inverse_transform(y_test.reshape(-1, 1)).flatten())
    y_pred_lr = np.expm1(y_pred_lr)
    y_pred_dt = np.expm1(y_pred_dt)
    y_pred_knn = np.expm1(y_pred_knn)
    
    # Store predictions and actuals in the list
    all_predictions.append(pd.DataFrame({
        'Date': store_data['Date'].iloc[len(X_train):],
        'Store': store,
        'Actual': y_test_inv,
        'LR_Prediction': y_pred_lr,
        'DT_Prediction': y_pred_dt,
        'KNN_Prediction': y_pred_knn
    }))

# Concatenate all predictions into a single DataFrame
final_predictions = pd.concat(all_predictions, ignore_index=True)

# Print or save the final predictions
final_predictions

Index([20, 4, 14, 13, 2, 10, 27, 6, 1, 39], dtype='int64', name='Store')


Unnamed: 0,Date,Store,Actual,LR_Prediction,DT_Prediction,KNN_Prediction
0,25-03-2011,20,1824711.21,2.132125e+06,2.053591e+06,2.002690e+06
1,25-05-2012,20,2114989.00,2.097992e+06,2.132873e+06,2.104274e+06
2,25-06-2010,20,1973135.87,2.006259e+06,2.053591e+06,2.107234e+06
3,25-11-2011,20,2906233.25,2.194307e+06,2.132873e+06,2.158175e+06
4,26-02-2010,20,1898193.95,2.189714e+06,2.053591e+06,2.123495e+06
...,...,...,...,...,...,...
285,30-07-2010,39,1308222.24,1.352549e+06,1.342706e+06,1.323767e+06
286,30-09-2011,39,1311775.83,1.527164e+06,1.372548e+06,1.386787e+06
287,30-12-2011,39,1537139.56,1.541620e+06,1.913352e+06,1.326276e+06
288,31-08-2012,39,1710923.94,1.559263e+06,1.639095e+06,1.625281e+06
