In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import datetime as dt

# Load data
data = pd.read_csv('Walmart_Store_sales.csv')

# Convert 'Date' column to datetime with the correct format
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')

# total sales per store
total_sales_per_store = data.groupby('Store')['Weekly_Sales'].sum()

#store with maximum sales
max_sales_store = total_sales_per_store.idxmax()
max_sales_value = total_sales_per_store.max()

print(f"Store with maximum sales: Store {max_sales_store} with sales ${max_sales_value}")

Store with maximum sales: Store 20 with sales $301397792.46


In [18]:
# standard deviation and mean sales per store
std_sales_per_store = data.groupby('Store')['Weekly_Sales'].std()
mean_sales_per_store = data.groupby('Store')['Weekly_Sales'].mean()

#store with maximum standard deviation
max_std_store = std_sales_per_store.idxmax()
max_std_value = std_sales_per_store.max()

# coefficient of variation (CV)
cv_sales_per_store = std_sales_per_store / mean_sales_per_store

print(f"Store with maximum standard deviation: Store {max_std_store} with standard deviation ${max_std_value}")
print(f"Coefficient of variation for Store {max_std_store}: {cv_sales_per_store[max_std_store]}")

Store with maximum standard deviation: Store 14 with standard deviation $317569.9494755081
Coefficient of variation for Store 14: 0.15713673600948333


In [20]:
# Filter data for Q3 2012
q3_2012 = data[(data['Date'] >= '2012-07-01') & (data['Date'] <= '2012-09-30')]

#total sales per store for Q3 2012 and previous quarter (Q2 2012)
q2_2012 = data[(data['Date'] >= '2012-04-01') & (data['Date'] <= '2012-06-30')]

total_sales_q3_2012 = q3_2012.groupby('Store')['Weekly_Sales'].sum()
total_sales_q2_2012 = q2_2012.groupby('Store')['Weekly_Sales'].sum()

# quarterly growth rate
quarterly_growth_rate = (total_sales_q3_2012 - total_sales_q2_2012) / total_sales_q2_2012

#stores with positive growth rate
good_growth_stores = quarterly_growth_rate[quarterly_growth_rate > 0].sort_values(ascending=False)

print("Stores with good quarterly growth rate in Q3 2012:")
print(good_growth_stores)



Stores with good quarterly growth rate in Q3 2012:
Store
7     0.133308
16    0.084884
35    0.044666
26    0.039555
39    0.024784
41    0.024570
44    0.024346
24    0.016521
40    0.011428
23    0.008254
Name: Weekly_Sales, dtype: float64


In [21]:
# mean sales for non-holiday weeks
non_holiday_sales_mean = data[data['Holiday_Flag'] == 0]['Weekly_Sales'].mean()

# Filter data for holiday weeks
holiday_sales = data[data['Holiday_Flag'] == 1]

# mean sales for each holiday
holiday_mean_sales = holiday_sales.groupby('Date')['Weekly_Sales'].mean()

# holidays with higher sales than non-holiday mean sales
higher_holiday_sales = holiday_mean_sales[holiday_mean_sales > non_holiday_sales_mean]

print("Holidays with higher sales than the mean sales in non-holiday season:")
print(higher_holiday_sales)

Holidays with higher sales than the mean sales in non-holiday season:
Date
2010-02-12    1.074148e+06
2010-11-26    1.462689e+06
2011-02-11    1.051915e+06
2011-11-25    1.479858e+06
2012-02-10    1.111320e+06
2012-09-07    1.074001e+06
Name: Weekly_Sales, dtype: float64


In [22]:
# Extract month and semester from Date
data['Month'] = data['Date'].dt.month
data['Semester'] = data['Date'].dt.to_period('6M')

# monthly sales
monthly_sales = data.groupby('Month')['Weekly_Sales'].sum()

#semester sales
semester_sales = data.groupby('Semester')['Weekly_Sales'].sum()

print("Monthly sales:")
print(monthly_sales)
print("\nSemester sales:")
print(semester_sales)


Monthly sales:
Month
1     3.325984e+08
2     5.687279e+08
3     5.927859e+08
4     6.468598e+08
5     5.571256e+08
6     6.226299e+08
7     6.500010e+08
8     6.130902e+08
9     5.787612e+08
10    5.847848e+08
11    4.130157e+08
12    5.768386e+08
Name: Weekly_Sales, dtype: float64

Semester sales:
Semester
2010-02    1.903330e+08
2010-03    1.819198e+08
2010-04    2.314124e+08
2010-05    1.867109e+08
2010-06    1.922462e+08
2010-07    2.325801e+08
2010-08    1.876401e+08
2010-09    1.772679e+08
2010-10    2.171618e+08
2010-11    2.028534e+08
2010-12    2.887605e+08
2011-01    1.637040e+08
2011-02    1.863313e+08
2011-03    1.793564e+08
2011-04    2.265265e+08
2011-05    1.816482e+08
2011-06    1.897734e+08
2011-07    2.299114e+08
2011-08    1.885993e+08
2011-09    2.208477e+08
2011-10    1.832613e+08
2011-11    2.101624e+08
2011-12    2.880781e+08
2012-01    1.688945e+08
2012-02    1.920636e+08
2012-03    2.315097e+08
2012-04    1.889209e+08
2012-05    1.887665e+08
2012-06    2.40610

In [25]:
# Filter data for Store 1
store1_data = data[data['Store'] == 1].copy()

# Create a new variable for day numbers starting from the earliest date
store1_data.loc[:, 'Day_Number'] = (store1_data['Date'] - store1_data['Date'].min()).dt.days

# Features and target variable
X = store1_data[['Day_Number', 'CPI', 'Unemployment', 'Fuel_Price']]
y = store1_data['Weekly_Sales']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 23950342320.389275
R-squared: 0.009792471764679345
