In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
file_path = "/home/reeyadav/DS_Project/Walmart_Sales/Walmart_Store_sales.csv"
walmart_data = pd.read_csv(file_path)

In [5]:
walmart_data.head(), walmart_data.info(), walmart_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         20 non-null     int64  
 1   Date          20 non-null     object 
 2   Weekly_Sales  20 non-null     float64
 3   Holiday_Flag  20 non-null     int64  
 4   Temperature   20 non-null     float64
 5   Fuel_Price    20 non-null     float64
 6   CPI           20 non-null     float64
 7   Unemployment  20 non-null     float64
dtypes: float64(5), int64(2), object(1)
memory usage: 1.4+ KB


(   Store        Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  \
 0      1      5/2/10    1643690.90             0        42.31       2.572   
 1      1     12/2/10    1641957.44             1        38.51       2.548   
 2      1  19-02-2010    1611968.17             0        39.93       2.514   
 3      1  26-02-2010    1409727.59             0        46.63       2.561   
 4      1      5/3/10    1554806.68             0        46.50       2.625   
 
           CPI  Unemployment  
 0  211.096358         8.106  
 1  211.242170         8.106  
 2  211.289143         8.106  
 3  211.319643         8.106  
 4  211.350143         8.106  ,
 None,
        Store  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price         CPI  \
 count   20.0  2.000000e+01     20.000000    20.000000   20.000000   20.000000   
 mean     1.0  1.509637e+06      0.050000    62.692000    2.704750  210.947500   
 std      0.0  8.781993e+04      0.223607    14.778244    0.102536    0.404775   
 min    

In [11]:

walmart_data['Date'] = pd.to_datetime(walmart_data['Date'], errors='coerce')
failed_dates = walmart_data[walmart_data['Date'].isna()]
if failed_dates.empty:
    walmart_data = walmart_data.sort_values('Date')
    cleaned_data = walmart_data
else:
    cleaned_data = failed_dates

cleaned_data


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Semester
8,1,2010-02-04,1594968.28,0,62.27,2.719,210.82045,7.808,2010,2,First
2,1,2010-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,2010,2,First
3,1,2010-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,2010,2,First
6,1,2010-03-19,1472515.79,0,54.58,2.72,211.215635,8.106,2010,3,First
7,1,2010-03-26,1404429.92,0,51.45,2.732,211.018042,8.106,2010,3,First
17,1,2010-04-06,1615524.71,0,80.69,2.705,211.176428,7.808,2010,4,First
10,1,2010-04-16,1466058.28,0,66.32,2.808,210.4887,7.808,2010,4,First
11,1,2010-04-23,1391256.12,0,64.84,2.795,210.439123,7.808,2010,4,First
12,1,2010-04-30,1425100.71,0,67.41,2.78,210.389546,7.808,2010,4,First
0,1,2010-05-02,1643690.9,0,42.31,2.572,211.096358,8.106,2010,5,First


In [13]:
walmart_data['Days'] = (walmart_data['Date'] - walmart_data['Date'].min()).dt.days
X = walmart_data[['Days', 'CPI', 'Unemployment', 'Fuel_Price']]
y = walmart_data['Weekly_Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Build a linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the testing set
y_pred = model.predict(X_test)

# Calculate the performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

coefficients = model.coef_, model.intercept_
mse, r2, coefficients


(4329254396.830123,
 0.5494973142157158,
 (array([ 2.27896106e+02, -9.26032175e+04, -1.68064549e+05, -7.62237551e+05]),
  24417775.269557353))

In [14]:
# Average sales on non-holiday weeks
average_non_holiday_sales = walmart_data[walmart_data['Holiday_Flag'] == 0]['Weekly_Sales'].mean()

# Identify holidays with sales higher than the average non-holiday sales
holiday_sales_comparison = walmart_data[walmart_data['Holiday_Flag'] == 1]
holiday_sales_comparison['Higher_Than_Non_Holiday_Avg'] = holiday_sales_comparison['Weekly_Sales'] > average_non_holiday_sales

# Display holidays with higher sales than non-holiday average
holiday_higher_sales = holiday_sales_comparison[holiday_sales_comparison['Higher_Than_Non_Holiday_Avg']]
holiday_higher_sales, average_non_holiday_sales


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holiday_sales_comparison['Higher_Than_Non_Holiday_Avg'] = holiday_sales_comparison['Weekly_Sales'] > average_non_holiday_sales


(   Store       Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  \
 1      1 2010-12-02    1641957.44             1        38.51       2.548   
 
          CPI  Unemployment  Year  Month Semester  Days  \
 1  211.24217         8.106  2010     12   Second   301   
 
    Higher_Than_Non_Holiday_Avg  
 1                         True  ,
 1502673.2136842106)

In [15]:
# Calculate mean sales for non-holiday weeks
walmart_data['Month'] = walmart_data['Date'].dt.month
walmart_data['Year'] = walmart_data['Date'].dt.year

monthly_sales = walmart_data.groupby(['Year', 'Month'])['Weekly_Sales'].sum().reset_index()

walmart_data['Semester'] = walmart_data['Month'].apply(lambda x: 1 if x <= 6 else 2)
semester_sales = walmart_data.groupby(['Year', 'Semester'])['Weekly_Sales'].sum().reset_index()

monthly_sales, semester_sales

(   Year  Month  Weekly_Sales
 0  2010      2    4616664.04
 1  2010      3    2876945.71
 2  2010      4    5897939.82
 3  2010      5    7524481.10
 4  2010      6    1503284.06
 5  2010      7    1603955.12
 6  2010      9    1545418.53
 7  2010     11    1542561.09
 8  2010     12    3081499.03,
    Year  Semester  Weekly_Sales
 0  2010         1   22419314.73
 1  2010         2    7773433.77)

(4022303649.8111906,
 0.5814386425046635,
 (array([   2418.06455055,  -83340.70879199, -181520.77374456,
         -749019.84204054]),
  22532947.484549213))