# Sales forecasting

This project predicts future store sales using time-series forecasting techniques.

# Import Dependencies 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import scipy.stats as stats
from scipy.stats import pearsonr
import itertools
from statsmodels.tsa.stattools import kpss
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import adfuller

# Datasets 

In [None]:
# Directory that holds the competition CSV files; change it here to run the
# notebook outside of Kaggle.
DATA_DIR = '/kaggle/input/store-sales-time-series-forecasting'

# Load every table of the competition into its own DataFrame.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
oil_df = pd.read_csv(f'{DATA_DIR}/oil.csv')
transaction_df = pd.read_csv(f'{DATA_DIR}/transactions.csv')
stores_df = pd.read_csv(f'{DATA_DIR}/stores.csv')
holiday_event_df = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')

# Understanding the Data

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Preview the first rows of the oil table.
oil_df.head()

In [None]:
# Preview the first rows of the transactions table.
transaction_df.head()

In [None]:
# Preview the first rows of the stores table.
stores_df.head()

In [None]:
# Preview the first rows of the holidays/events table.
holiday_event_df.head()

In [None]:
# The `sales` column is the target variable to be forecast.


##  Merging datasets holiday_event_df, stores_df, oil_df & train_df

In [None]:
# Attach store attributes to every training row (default inner join on
# `store_nbr`).
train_df = train_df.merge(stores_df, on ='store_nbr')
# Attach the oil table by date; a left join keeps training rows whose date
# has no oil record (their oil columns become NaN).
train_df = train_df.merge(oil_df, on ='date', how='left')
# Rename the holiday table's `type` column before merging so it does not
# collide with the `type` column already present in train_df.
holiday_event_df = holiday_event_df.rename(columns={'type': 'holiday_type'})
# NOTE(review): if holiday_event_df has more than one row for the same date,
# this left join repeats the matching training rows once per holiday row —
# confirm, since any duplicated rows would inflate later sums.
train_df = train_df.merge(holiday_event_df, on='date', how='left')

In [None]:
# Show the first three rows of the merged frame.
train_df.head(3)

In [None]:
# Column names, dtypes and non-null counts after the merges.
train_df.info()

# Missing Values Detection

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

In [None]:
# Share of missing values per column, expressed in percent: the mean of the
# boolean null mask is the fraction of rows that are null.
null_mask = train_df.isnull()
missing_percentages = null_mask.mean() * 100

print(missing_percentages)

In [None]:
# Columns in which more than 30% of the values are missing are discarded.
too_sparse = missing_percentages > 30
columns_to_delete = missing_percentages[too_sparse].index

train_df = train_df.drop(labels=columns_to_delete, axis='columns')

In [None]:
# Re-inspect the columns that remain after dropping the sparse ones.
train_df.info()

# Duplicates


In [None]:
# True if at least one row is an exact duplicate of an earlier row.
train_df.duplicated().any()

In [None]:
# Boolean mask marking rows that exactly repeat an earlier row.
dupes = train_df.duplicated()

# Count the duplicates with the vectorised Series.sum(); the builtin sum()
# would iterate over every element in Python, which is slow on a large frame.
dupes.sum()

In [None]:
# Drop rows that exactly duplicate an earlier row, then display the result.

train_df = train_df.drop_duplicates()
train_df

In [None]:
# Confirm that no duplicated rows remain in the training data.
train_df.duplicated().any()

In [None]:
# Check the test data for duplicated rows as well.
test_df.duplicated().any()

# Check if there still any missing values present in the train_df


In [None]:
# Missing values per column in the cleaned training data.
train_df.isnull().sum()

# Calculate count, mean, std, min, 25%, 50%, 75%, max values for each column. Prepare an analysis of the difference between mean and median for each column and possible reasons for the same.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns.
train_df.describe()

# EDA 

In [None]:
# Columns and dtypes available for the exploratory analysis.
train_df.info()

## Questions

1. Does the type of store affect store sales?

2. Which product family has the highest sales?

3. Do promotions improve sales?

4. Which city has the most customers?

5. Which state has the most customers?

6. Which store has the highest sales?

7. Which months have the highest and the lowest sales?



### 1. Does the type of stores affect the store sales?

To answer the first question, "Does the type of store affect store sales?", I will use an ANOVA test.
ANOVA (Analysis of Variance) is a statistical test used to determine whether there are significant differences between the means of two or more groups. It compares the variation between the groups (due to the different categories or factors) to the variation within the groups.


H0 (not rejected if p > 0.05): The type of store does not affect store sales. There is no significant difference in mean sales between the store types.

H1 (accepted if p < 0.05): The type of store does affect store sales. There is a significant difference in mean sales between the store types.



In [None]:
# Sales grouped by the `type` column, one group per distinct value.
grouped_data = train_df.groupby('type')['sales']

# One-way ANOVA across the groups: each group's sales become one sample.
sales_by_type = [
    grouped_data.get_group(store_type) for store_type in grouped_data.groups
]
f_statistic, p_value = stats.f_oneway(*sales_by_type)

# Report the test result.
print("F-Statistic:", f_statistic)
print("p-value:", p_value)


Based on the F-statistic and p-value above, we reject the null hypothesis in favour of the alternative hypothesis. Hence, the type of store does affect store sales: there is a significant difference in mean sales between the store types.

In [None]:
# Scatter plot of sales against the `type` column.
fig, ax = plt.subplots()
ax.scatter(train_df['type'], train_df['sales'])

ax.set_xlabel('type')
ax.set_ylabel('sales')

plt.show()
     

### 2. Which family is having the highest sales?

In [None]:
# Pie chart of the five product families with the largest total sales.

# Total sales per family, then ordered from largest to smallest.
family_sales = train_df.groupby('family')['sales'].sum()
family_sales_sorted = family_sales.sort_values(ascending=False)

# Keep only the five best-selling families; the percentages drawn below are
# therefore shares among these five, not of all sales.
top_families = family_sales_sorted.iloc[:5]

fig, ax = plt.subplots()
ax.pie(
    top_families,
    labels=top_families.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Distribution of Sales by Family')

# Equal aspect ratio so the pie is drawn as a circle.
ax.axis('equal')
plt.show()

Based on the pie chart above, GROCERY I has the highest sales and BEVERAGES has the second highest. Note that the percentages shown are shares among the top five families only, not shares of total sales.

### 3. Do promotions improve sales?

To answer the third question, "Do promotions improve sales?", I will use the Pearson correlation test to measure the relationship between the two variables, since both are numerical. The Pearson correlation coefficient measures the strength of the linear relationship between two continuous variables and ranges from -1 to +1.

H0 (not rejected if p > 0.05): There is no linear relationship between promotion and store sales.

H1 (accepted if p < 0.05): There is a linear relationship between promotion and store sales.


In [None]:
# Pearson correlation between the number of promoted items and sales.
promotion = train_df['onpromotion']
sales = train_df['sales']
correlation, p_value = pearsonr(promotion, sales)

print("Pearson correlation coefficient:", correlation)
print("p-value:", p_value)

Based on the Pearson correlation coefficient of 0.4279 and a p-value that is numerically 0.0, we reject the null hypothesis (H0) and conclude that there is a significant, moderately positive linear relationship between promotion and store sales: sales tend to be higher when more items are on promotion. Correlation alone does not prove that the promotions cause the higher sales.

In [None]:
# Scatter plot of sales against the number of promoted items.
fig, ax = plt.subplots()
ax.scatter(train_df['onpromotion'], train_df['sales'])

ax.set_title('Promotion vs Sales')
ax.set_xlabel('Promotion')
ax.set_ylabel('Sales')

plt.show()


### 4. Which city has the most customers?

In [None]:
# Bar plot of total sales per city.
#
# A count plot would only count how many rows each city has in train_df,
# which says nothing about how much was sold there. Summing `sales` per city
# makes the chart match its title.
city_sales = (
    train_df.groupby('city')['sales']
    .sum()
    .sort_values(ascending=False)  # largest city first on the x-axis
    .reset_index()
)

plt.figure(figsize=(10, 6))  # Set the figure size
sns.barplot(data=city_sales, x='city', y='sales')

plt.xlabel('City')
plt.ylabel('Total Sales')
plt.title('Sales Distribution by City')

plt.xticks(rotation=45)

plt.show()


Based on the plot above, Quito has the highest sales of all cities.

### 5. Which state is having the most number of customers?


In [None]:
# Bar plot of total sales per state.
#
# A count plot would only count how many rows each state has in train_df,
# not how much was sold there, so the sales are summed per state instead.
state_sales = (
    train_df.groupby('state')['sales']
    .sum()
    .sort_values(ascending=False)  # largest state first on the x-axis
    .reset_index()
)

plt.figure(figsize=(10, 6))  # Set the figure size
sns.barplot(data=state_sales, x='state', y='sales')

plt.xlabel('State')
plt.ylabel('Total Sales')
# The title previously said "by City", copied from the city chart.
plt.title('Sales Distribution by State')

plt.xticks(rotation=45)

plt.show()

Based on the plot above, Pichincha has the highest sales of all states.

### 6. Which of the stores has the highest sales. 

In [None]:
# Calculate the total sales for each store
store_sales = train_df.groupby('store_nbr')['sales'].sum().reset_index()

# Sort the stores based on sales in descending order
store_sales = store_sales.sort_values('sales', ascending=False)

# Create a bar plot. `store_nbr` is numeric, and seaborn orders a numeric
# categorical axis by value, which would discard the sort above; passing
# `order` keeps the bars ranked from highest to lowest sales.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=store_sales,
    x='store_nbr',
    y='sales',
    order=store_sales['store_nbr'].tolist(),
)

plt.xlabel('Store Number')
plt.ylabel('Total Sales')
plt.title('Total Sales by Store')

plt.xticks(rotation=45)

plt.show()



### 7. Which month is having the most sales, and least sales. 

In [None]:
# Parse the `date` column into datetime values so its parts can be extracted.
train_df['date'] = pd.to_datetime(train_df['date'])

# Derive calendar month and year columns from the parsed dates.
date_parts = train_df['date'].dt
train_df['month'] = date_parts.month
train_df['year'] = date_parts.year

In [None]:
# Show the first seven rows, including the new `month` and `year` columns.
train_df.head(7)

In [None]:
# Total sales for every (month, year) combination.
monthly_sales = train_df.groupby(['month', 'year'])['sales'].sum().reset_index()

plt.figure(figsize=(10, 6))  # Set the figure size

# One line per year; colours are taken in turn from a repeating palette.
years = monthly_sales['year'].unique()
colors = itertools.cycle(['red', 'green', 'blue', 'orange', 'purple'])

for year, line_color in zip(years, colors):
    is_this_year = monthly_sales['year'] == year
    year_data = monthly_sales[is_this_year]
    plt.plot(
        year_data['month'],
        year_data['sales'],
        marker='o',
        color=line_color,
        label=str(year),
    )

plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales')

# Label the x-axis with month abbreviations instead of the numbers 1-12.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.xticks(range(1, 13), month_names)

plt.legend()

plt.show()


Overall, the orange line (2016) shows consistently high sales from January to December, with December being the strongest month of that year. On the other hand, 2013 had the lowest sales overall compared with the other years, especially in February.

In [None]:
# Collapse the data to one row per date with the summed sales and promotions.
# The columns are selected with a list: selecting several columns with a bare
# tuple (`['sales', 'onpromotion']` without the inner brackets) is no longer
# accepted by current pandas versions.
train_df = train_df.groupby('date')[['sales', 'onpromotion']].sum().reset_index()
print(train_df)

# Autocorrelation 

Autocorrelation measures the correlation between a time series and its lagged values. Autocorrelation plots (ACF) and partial autocorrelation plots (PACF) help identify significant lag values and potential autoregressive or moving average components.

- If the autocorrelation value is close to 1 or -1, it indicates a strong positive or negative autocorrelation, respectively.

- If the autocorrelation value is close to 0, it indicates a weak or no autocorrelation.

In [None]:
# Lag-1 autocorrelation of daily sales (lag=1 is the default of autocorr()).
sales_series = train_df['sales']
autocorr_values = sales_series.autocorr(lag=1)
print("Autocorrelation:", autocorr_values)

Based on the result above, the lag-1 autocorrelation of 0.766 is strongly positive. A positive autocorrelation indicates that the current day's sales are related to the previous day's sales: high-sales days tend to follow high-sales days.

In [None]:
# Autocorrelation function of daily sales. plot_acf returns the Figure it
# draws on; ending the cell with plt.show() instead of that return value
# avoids the notebook rendering the same figure twice.
plot_acf(train_df['sales'])
plt.show()

In [None]:
# Partial autocorrelation function of daily sales on a dedicated axes.
fig, ax = plt.subplots(figsize=(10, 6))
plot_pacf(train_df['sales'], ax=ax)

# Labels and title are set after plotting so they replace plot_pacf's own.
ax.set_xlabel('Lag')
ax.set_ylabel('Partial Autocorrelation')
ax.set_title('Partial Autocorrelation Function (PACF)')

plt.show()

# Differencing technique 

Differencing transforms the time series into a stationary one. This is needed because the autoregressive and moving-average parts of an ARIMA model assume a stationary series.

In [None]:
# First difference of daily sales: each value minus the previous row's value.
train_df['diff_sales'] = train_df['sales'].diff()

In [None]:
# Remove rows containing NaN (the first row has no predecessor to difference
# against, so its diff_sales is NaN).
train_df = train_df.dropna()

In [None]:
# Inspect the differenced series.
print(train_df['diff_sales'])

In [None]:
# `sales - sales.shift(1)` is the same first difference as `sales.diff()`.
# Recomputing it on a frame whose NaN row has already been dropped would
# blank out the first remaining row and cost one more observation at the next
# dropna(), so the column is only filled in when it does not exist yet.
if 'diff_sales' not in train_df.columns:
    train_df['diff_sales'] = train_df['sales'] - train_df['sales'].shift(1)

In [None]:
# Remove any rows that contain NaN (a first difference is undefined for a
# row without a predecessor).
train_df = train_df.dropna()

In [None]:
# Inspect the differenced series once more.
print(train_df['diff_sales'])

In [None]:
# Preview the daily frame including the `diff_sales` column.
train_df.head()

In [None]:
# Autocorrelation of the differenced series for lags 0 through 20.
autocorrelation = sm.tsa.acf(train_df['diff_sales'], nlags=20)

# Stem plot of the autocorrelation values against their lag. The former
# `use_line_collection` argument is not passed: it has been removed from
# newer Matplotlib releases and raises a TypeError there.
plt.figure(figsize=(10, 6))
plt.stem(range(len(autocorrelation)), autocorrelation)
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.title('Autocorrelation Chart')
plt.show()

# Stationarity Test 


There are various statistical tests to check stationarity, including the Augmented Dickey-Fuller (ADF) test and the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test.

### Augmented Dickey-Fuller (ADF)  test 

The Augmented Dickey-Fuller (ADF) test is a statistical test used to determine whether a time series is stationary or non-stationary. Stationarity is an important assumption in many time series analysis models.

The ADF test evaluates the null hypothesis that the time series has a unit root, indicating non-stationarity. The alternative hypothesis is that the time series is stationary.

When performing the ADF test, we obtain the ADF statistic and the p-value. The ADF statistic is a negative number and the more negative it is, the stronger the evidence against the null hypothesis. The p-value represents the probability of observing the ADF statistic or a more extreme value if the null hypothesis were true. A low p-value (below a chosen significance level, typically 0.05) indicates strong evidence against the null hypothesis and suggests that the time series is stationary.

In [None]:
# Series under test: the first-differenced daily sales.
ts = train_df['diff_sales']

In [None]:
# Run the Augmented Dickey-Fuller test on the differenced series.
result = adfuller(ts)

# The first two entries of the returned tuple are the test statistic and its
# p-value.
adf_statistic, p_value = result[:2]
print("ADF Statistic:", adf_statistic)
print("p-value:", p_value)

The ADF statistic is -11.494679187188824. This value is more negative than the critical values at the common significance levels, which is strong evidence against the null hypothesis of a unit root and indicates that the time series is stationary.

The p-value is 4.645171054101398e-21, which is extremely close to zero. If the p-value is below the chosen significance level (e.g., 0.05), the null hypothesis is rejected. In this case, the extremely small p-value is strong evidence against the presence of a unit root and supports the stationarity of the differenced series.

### Kwiatkowski-Phillips-Schmidt-Shin (KPSS)

The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test is another statistical test used to assess the stationarity of a time series. It is complementary to the Augmented Dickey-Fuller (ADF) test.

The KPSS test evaluates the null hypothesis that the time series is stationary against the alternative hypothesis of non-stationarity. Unlike the ADF test, which assumes the presence of a unit root, the KPSS test assumes the absence of a unit root.

The test calculates the KPSS statistic, which measures the cumulative sum of squared deviations from the mean in the series. It also provides a p-value that indicates the probability of observing the KPSS statistic or a more extreme value under the null hypothesis.

Interpreting the results of the KPSS test involves considering the KPSS statistic and the associated p-value. If the KPSS statistic is greater than the critical value at a chosen significance level (e.g., 0.05), it provides evidence against the null hypothesis of stationarity. Conversely, if the KPSS statistic is smaller than the critical value, it suggests that the time series is stationary.

In [None]:
# Run the KPSS test on the differenced series.
result = kpss(ts)

# The first two entries of the returned tuple are the test statistic and its
# p-value.
kpss_statistic, p_value = result[:2]
print("KPSS Statistic:", kpss_statistic)
print("p-value:", p_value)

The KPSS statistic is 0.02685487746003539. Small values of the statistic are consistent with stationarity, and this value is far below the usual critical values (about 0.463 at the 5% level for the level-stationarity test).

The reported p-value is 0.1. The statsmodels implementation looks the p-value up in a table that only covers the range 0.01 to 0.1, so a reported value of 0.1 means that the true p-value is at least 0.1. Because the p-value is greater than the 0.05 significance level, we fail to reject the null hypothesis of stationarity. The KPSS test therefore agrees with the ADF test: the differenced series can be treated as stationary.

# Final ACF & PACF 

In [None]:
# ACF (left) and PACF (right) of the differenced series, side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

differenced = train_df['diff_sales']
plot_acf(differenced, ax=ax1)
plot_pacf(differenced, ax=ax2)

plt.tight_layout()
plt.show()

# Autoregressive Integrated Moving Average Model (ARIMA) model

In [None]:
# ARIMA order (p, d, q).

# Number of autoregressive lags.
p = 2

# Degree of differencing. The model is fitted on `diff_sales`, which has
# already been differenced once, so no further differencing is requested;
# d = 1 here would difference the series a second time.
d = 0

# Number of moving-average lags.
q = 1

In [None]:
# Differenced sales as a plain float64 array, the input the model is fitted on.
train_np = np.array(train_df['diff_sales'], dtype='float64')

# Build the ARIMA model with the order chosen above and estimate it.
model = sm.tsa.ARIMA(train_np, order=(p, d, q))
result = model.fit()

In [None]:
# Print the fitted model's summary table.
print(result.summary())

In [None]:
# Forecast beyond the training data: `start_idx` is the first index after the
# last training observation and the horizon is `len(test_df)` steps.
# NOTE(review): the model is fitted on one value per date (train_df was
# aggregated by date), whereas `len(test_df)` counts rows of the test file —
# confirm the intended horizon; the number of distinct test dates may be what
# is wanted.
start_idx = len(train_np)
end_idx = len(train_np) + len(test_df) - 1
predictions = result.predict(start=start_idx, end=end_idx)

# Print the predictions
print(predictions)

In [None]:
# Observed values of the differenced training series, used for evaluation.
actual_values = train_df['diff_sales']

In [None]:
# Evaluate the in-sample fit: compare the model's fitted values with the
# observations it was trained on. The `predictions` array holds forecasts for
# time steps after the training window, so it has no matching actuals and
# must not be scored against `actual_values`.
observed = np.asarray(actual_values, dtype='float64')
fitted = np.asarray(result.fittedvalues, dtype='float64')

# Calculate evaluation metrics
errors = fitted - observed
mae = np.mean(np.abs(errors))
mse = np.mean(errors ** 2)
rmse = np.sqrt(mse)

# Print the evaluation metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)

# Submission 



In [None]:
# Build the submission table: one row per test id.
# NOTE(review): `sales` is filled with zeros here; the ARIMA forecasts are not
# written to the file — confirm whether this placeholder is intended.
submission = pd.DataFrame(
    {
        'id': test_df['id'],
        'sales': np.zeros(len(test_df)),
    }
)

# Write the table to CSV without the index column.
submission.to_csv('mysubmission.csv', index=False)