# Machine Learning Techniques for Sales Forecasting

## Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## Importing Datasets & Reading All CSV Files

files available at: https://www.kaggle.com/datasets/ndarshan2797/english-converted-datasets

01. item_categories.csv - 
    item_category_name, 
    item_category_id

02. items.csv - 
    item_name, 
    item_id, 
    category_id

03. sales_train.csv - 
    date, 
    date_block_num, 
    shop_id, 
    item_id, 
    item_price, 
    item_cnt_day

04. shops.csv - 
    shop_name, 
    shop_id

05. test.csv - 
    ID, 
    shop_id, 
    item_id

In [None]:
# Load the five CSV tables from the local ./data-set folder, one DataFrame each.
item_categories, items, sales_train, shops, test = (
    pd.read_csv(f'./data-set/{table}.csv')
    for table in ('item_categories', 'items', 'sales_train', 'shops', 'test')
)

In [None]:
# Report the (rows, columns) shape of every loaded table.
for label, frame in (
    ("item_categories", item_categories),
    ("items", items),
    ("sales_train", sales_train),
    ("shops", shops),
    ("test", test),
):
    print(f"Shape of {label}:", frame.shape)

In [None]:
# Print the column/dtype summary of every table, with a ruler between tables.
# DataFrame.info() prints its own report and returns None, so each
# print(frame.info()) also emits the text "None" after the report.
tables = (
    ("item_categories", item_categories),
    ("items", items),
    ("sales_train", sales_train),
    ("shops", shops),
    ("test", test),
)
for position, (label, frame) in enumerate(tables):
    if position:
        print("-----------------------------------------------------")
    print(f"\n\nColumns of {label}:\n")
    print(frame.info())


In [None]:
# Show the first five rows of every table, with a ruler between tables.
tables = (
    ("item_categories", item_categories),
    ("items", items),
    ("sales_train", sales_train),
    ("shops", shops),
    ("test", test),
)
for position, (label, frame) in enumerate(tables):
    if position:
        print("-----------------------------------------------------")
    print(f"\n\nHead of {label}:\n")
    print(frame.head())

## Data Preprocessing & Feature Engineering

In [None]:
#merging the data

In [None]:
# Left-join the item table onto every sales row, matching on item_id,
# then preview the first 20 rows and the resulting shape.
sales_with_items = pd.merge(sales_train, items, how='left', on='item_id')
print("\n\nHead of sales_with_items:\n", sales_with_items.head(20), sales_with_items.shape, sep="\n")

In [None]:
# Left-join the category table: sales rows carry category_id, the category
# table calls the same key item_category_id.
sales_with_items_and_categories = pd.merge(
    sales_with_items,
    item_categories,
    how='left',
    left_on='category_id',
    right_on='item_category_id',
)
print(
    "\n\nHead of sales_with_items_and_categories:\n",
    sales_with_items_and_categories.head(20),
    sales_with_items_and_categories.shape,
    sep="\n",
)

In [None]:
# The merge keeps both key columns. When item_category_id is an exact
# duplicate of category_id, keep only category_id.
duplicate_key = sales_with_items_and_categories['item_category_id']
if duplicate_key.equals(sales_with_items_and_categories['category_id']):
    sales_with_items_and_categories = sales_with_items_and_categories.drop('item_category_id', axis=1)


In [None]:
# Preview the merged frame: first 20 rows, then its (rows, columns) shape.
print(
    "\n\nHead of sales_with_items_and_categories:\n",
    sales_with_items_and_categories.head(20),
    sales_with_items_and_categories.shape,
    sep="\n",
)

In [None]:
# Repeats the sales/items merge from the earlier cell: left-join the item
# table onto every sales row on item_id, then preview the result.
sales_with_items = pd.merge(sales_train, items, how='left', on='item_id')
print("\n\nHead of sales_with_items:\n", sales_with_items.head(20), sales_with_items.shape, sep="\n")

In [None]:
# Repeats the category merge from the earlier cell: sales rows carry
# category_id, the category table calls the same key item_category_id.
sales_with_items_and_categories = pd.merge(
    sales_with_items,
    item_categories,
    how='left',
    left_on='category_id',
    right_on='item_category_id',
)
print(
    "\n\nHead of sales_with_items_and_categories:\n",
    sales_with_items_and_categories.head(20),
    sales_with_items_and_categories.shape,
    sep="\n",
)

In [None]:
# The merge keeps both key columns. When item_category_id is an exact
# duplicate of category_id, keep only category_id.
duplicate_key = sales_with_items_and_categories['item_category_id']
if duplicate_key.equals(sales_with_items_and_categories['category_id']):
    sales_with_items_and_categories = sales_with_items_and_categories.drop('item_category_id', axis=1)


In [None]:
# Preview the merged frame: first 20 rows, then its (rows, columns) shape.
print(
    "\n\nHead of sales_with_items_and_categories:\n",
    sales_with_items_and_categories.head(20),
    sales_with_items_and_categories.shape,
    sep="\n",
)

In [None]:
# Final join: add the shop table to every row via shop_id (left join),
# then preview the first 20 rows and the resulting shape.
final_dataset = pd.merge(sales_with_items_and_categories, shops, how='left', on='shop_id')
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Column names, non-null counts and dtypes of the merged dataset.
print("\n\nColumns of final_dataset:\n")
# DataFrame.info() prints its report itself and returns None, so this
# print() additionally emits the text "None" after the report.
print(final_dataset.info())


In [None]:
# Show the calendar date next to date_block_num to see how the two relate.
columns_to_print = ['date', 'date_block_num']
print(final_dataset.loc[:, columns_to_print])

In [None]:
#feature split

In [None]:
# date_block_num is referred to as month_num from here on.
final_dataset = final_dataset.rename(columns={'date_block_num': 'month_num'})

In [None]:
# item_cnt_day is referred to as item_cnt_month from here on.
# NOTE(review): this only relabels the column — the rows are not aggregated
# here, so each value is still the count recorded on that sales row.
final_dataset = final_dataset.rename(columns={'item_cnt_day': 'item_cnt_month'})

In [None]:
# Preview after the renames: first 20 rows, then the (rows, columns) shape.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Column names, non-null counts and dtypes after the column renames.
print("\n\nColumns of final_dataset:\n")
# DataFrame.info() prints its report itself and returns None, so this
# print() additionally emits the text "None" after the report.
print(final_dataset.info())

In [None]:
# Data cleaning, step 1: count the missing entries in every column.
missing_per_column = final_dataset.isna().sum()
print("\n\nMissing values in final_dataset:\n", missing_per_column, sep="\n")

In [None]:
# Same per-column count as the "missing values" check, printed under the
# "Null values" heading (pandas treats null and missing identically).
null_per_column = final_dataset.isna().sum()
print("\n\nNull values in final_dataset:\n", null_per_column, sep="\n")

In [None]:
# (rows, columns) before the missing names are filled in.
print(final_dataset.shape)

In [None]:
# Replace missing item and category names with the placeholder 'Unknown'.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# is deprecated by pandas and no longer updates the frame under Copy-on-Write.
for name_column in ('item_name', 'item_category_name'):
    final_dataset[name_column] = final_dataset[name_column].fillna('Unknown')

In [None]:
# (rows, columns) after filling missing names; filling does not drop rows.
print(final_dataset.shape)

In [None]:
# Keep only the first occurrence of each fully identical row.
final_dataset = final_dataset.drop_duplicates()


In [None]:
# (rows, columns) after removing duplicate rows.
print(final_dataset.shape)

In [None]:
# List the dtype of every column to spot columns that need a type fix.
print("\n\nData types of final_dataset:\n", final_dataset.dtypes, sep="\n")


In [None]:
# Sold quantities are whole numbers, so store item_cnt_month as int64.
final_dataset = final_dataset.astype({'item_cnt_month': 'int64'})

In [None]:
# Column dtypes after casting item_cnt_month to int64.
print(final_dataset.dtypes)

In [None]:
# Eyeball the first 30 quantities to confirm the int64 conversion.
print(final_dataset['item_cnt_month'].iloc[:30])

In [None]:
# (rows, columns) before the quantity and price range filters.
print(final_dataset.shape)

In [None]:
# Keep only rows whose quantity is strictly positive and strictly below 307980.
# Non-positive quantities (e.g. -1) and the 307980 extreme are treated as
# incorrect data by the original author and are discarded.
# NOTE(review): confirm that 307980 is really a quantity and not a value taken
# from another column; verify the bound against the data.
quantity = final_dataset['item_cnt_month']
has_plausible_quantity = quantity.gt(0) & quantity.lt(307980)
final_dataset = final_dataset[has_plausible_quantity]

print(final_dataset.shape)

In [None]:
# Keep only rows with a plausible unit price: strictly positive (no zero or
# negative prices) and strictly below 100000.
price = final_dataset['item_price']
has_plausible_price = (price > 0) & (price < 100000)

# .copy() makes the filtered frame independent. The following cells assign
# new columns into final_dataset, and writing into a boolean-mask selection
# raises SettingWithCopyWarning.
final_dataset = final_dataset[has_plausible_price].copy()

In [None]:
# (rows, columns) after the price range filter.
print(final_dataset.shape)

In [None]:
# Replace every run of characters that are not Latin letters, digits or
# Cyrillic letters (А-Я, а-я) with a single space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, so without it nothing is replaced.
final_dataset['item_name'] = final_dataset['item_name'].str.replace(
    r'[^A-Za-z0-9А-Яа-я]+', ' ', regex=True
)

In [None]:
# (rows, columns) after cleaning item_name; the row count should be unchanged.
print(final_dataset.shape)

In [None]:
# Noise handling: collapse every run of two or more spaces in item_name into
# one space. A literal replace of two spaces with one is a single pass and
# still leaves a double space wherever three or more occurred.
final_dataset['item_name'] = final_dataset['item_name'].str.replace(
    r' {2,}', ' ', regex=True
)

In [None]:
# First five rows after the item_name clean-up.
print(final_dataset.head())

In [None]:
# Data integration: revenue of a row = quantity sold * unit price.
final_dataset['revenue'] = final_dataset['item_cnt_month'].mul(final_dataset['item_price'])


In [None]:
# Preview with the new revenue column: first 20 rows, then the shape.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Feature engineering: revenue earned per unit sold on each row.
final_dataset['revenue_per_item'] = final_dataset['revenue'].div(final_dataset['item_cnt_month'])

print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# revenue_per_item is revenue / item_cnt_month, and revenue itself is
# item_cnt_month * item_price, so the column to compare against is item_price
# (not revenue). np.allclose is used because the value has been through a
# floating-point multiply/divide round trip.
if np.allclose(final_dataset['revenue_per_item'], final_dataset['item_price']):
    # The column only repeats the unit price, so drop it.
    final_dataset = final_dataset.drop(columns=['revenue_per_item'])

print("\n\nHead of final_dataset:\n")
print(final_dataset.head(20))
print(final_dataset.shape)

In [None]:
# Day of month: the first two characters of the date string (kept as text).
final_dataset['date_num'] = final_dataset['date'].str.slice(stop=2)

In [None]:
# Preview with the new date_num column: first 20 rows, then the shape.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Year: everything from the seventh character of the date string onwards
# (kept as text).
final_dataset['year_num'] = final_dataset['date'].str.slice(start=6)

In [None]:
# Preview with the new year_num column: first 20 rows, then the shape.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Shape and column summary before the columns are reordered.
print(final_dataset.shape)
# DataFrame.info() prints its report itself and returns None, so this
# print() additionally emits the text "None" after the report.
print(final_dataset.info())

In [None]:
# Put the columns into reading order: date parts, shop, item, category, then
# the numeric measures. Columns not listed here are left out of the result.
column_order = [
    'date', 'date_num', 'year_num', 'month_num',
    'shop_id', 'shop_name',
    'item_id', 'item_name',
    'category_id', 'item_category_name',
    'item_price', 'item_cnt_month', 'revenue',
]
final_dataset = final_dataset[column_order]

print(final_dataset.shape)
print(final_dataset.info())

In [None]:
#since we already handled the missing and null values in the data set there is no need for imputation

In [None]:
# Data profiling: summary statistics of the numeric columns.
print("\n\nDescriptive statistics of final_dataset:\n", final_dataset.describe(), sep="\n")

In [None]:
# Data enrichment: derive the calendar month name from month_num.
# month_num counts months consecutively with 0 = January, so the calendar
# month is month_num modulo 12. Computing it this way reproduces the former
# 34-entry lookup table for 0..33 and keeps working for later month indices,
# which the table left as raw numbers.
month_names = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)
final_dataset['month_name'] = (final_dataset['month_num'] % 12).map(
    lambda month_index: month_names[month_index]
)

print("\n\nHead of final_dataset:\n")
print(final_dataset.head(20))
print(final_dataset.shape)

In [None]:
# month_num is superseded by month_name, so remove it.
final_dataset = final_dataset.drop('month_num', axis=1)

print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Rearrange the columns so that month_name sits between the day and the year.
column_order = [
    'date', 'date_num', 'month_name', 'year_num',
    'shop_id', 'shop_name',
    'item_id', 'item_name',
    'category_id', 'item_category_name',
    'item_price', 'item_cnt_month', 'revenue',
]
final_dataset = final_dataset[column_order]

print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
#imputation is already performed under data preprocessing
#outliers are handled already under data preprocessing

In [None]:
# Data binning.

# Largest and smallest price, printed to sanity-check the bin edges below.
price = final_dataset['item_price']
print(price.max())
print(price.min())

# Ten right-closed price bands: 100-wide up to 900, then one catch-all band
# up to 100000. The first edge is -1, but its label starts at 0.
price_edges = [-1, 100, 200, 300, 400, 500, 600, 700, 800, 900, 100000]
price_labels = [
    f'{max(lower, 0)}-{upper}'
    for lower, upper in zip(price_edges, price_edges[1:])
]
final_dataset['price_range'] = pd.cut(price, bins=price_edges, labels=price_labels)

In [None]:
# Preview with the new price_range column: first 20 rows, then the shape.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Log transformation: the natural log of revenue, stored as log_revenue.
# The original author adds it to help model performance.
revenue = final_dataset['revenue']
final_dataset['log_revenue'] = np.log(revenue)

In [None]:
# Preview with the new log_revenue column: first 20 rows, then the shape.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

In [None]:
# Encoding: turn the year strings into ordinal codes 2013 -> 0, 2014 -> 1,
# 2015 -> 2.
# Series.map with a function is used instead of replace(): replacing strings
# with ints relies on a silent object -> int downcast that pandas deprecated,
# which would leave year_num as an object column. Values without a code
# (including codes that are already ints when the cell is re-run) pass
# through unchanged, exactly as replace() left them.
year_codes = {'2013': 0, '2014': 1, '2015': 2}
final_dataset['year_num'] = final_dataset['year_num'].map(
    lambda year: year_codes.get(year, year)
)


print("\n\nHead of final_dataset:\n")
print(final_dataset.head(20))
print(final_dataset.shape)


In [None]:
# Grouping and aggregation: total item_cnt_month per (shop_id, year_num) pair.
grouped_by_shop_id_and_year_num = (
    final_dataset
    .groupby(['shop_id', 'year_num'])[['item_cnt_month']]
    .sum()
)

print(
    "\n\nHead of grouped_by_shop_id_and_year_num:\n",
    grouped_by_shop_id_and_year_num.head(60),
    grouped_by_shop_id_and_year_num.shape,
    sep="\n",
)

In [None]:
#feature split is already performed under data preprocessing


In [None]:
# Min-max scaling: map revenue linearly onto [0, 1] as scaled_revenue.
revenue = final_dataset['revenue']
lowest_revenue = revenue.min()
revenue_span = revenue.max() - revenue.min()
final_dataset['scaled_revenue'] = (revenue - lowest_revenue) / revenue_span

print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")



In [None]:
# Convert month_name from English month names to month numbers 1..12.
# Series.map with a function is used instead of replace(): replacing strings
# with ints relies on a silent object -> int downcast that pandas deprecated,
# which would leave month_name as an object column. Values that are not month
# names (for example numbers from an earlier run of this cell) pass through
# unchanged, exactly as replace() left them.
month_numbers = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}
final_dataset['month_name'] = final_dataset['month_name'].map(
    lambda month: month_numbers.get(month, month)
)

print("\n\nHead of final_dataset:\n")
print(final_dataset.head(20))
print(final_dataset.shape)

## Data Exploration & Analysis

In [None]:

# Pairwise correlation between the numeric columns only.
numeric_columns = final_dataset.select_dtypes(include='number')
print("\n\nCorrelation of final_dataset:\n", numeric_columns.corr(), sep="\n")

In [None]:
# Missing values treatment: re-count the missing entries per column.
# The same count is printed under both headings, since pandas treats
# "missing" and "null" identically.
for heading in ("\n\nMissing values in final_dataset:\n", "\n\nNull values in final_dataset:\n"):
    print(heading)
    print(final_dataset.isna().sum())

# No imputation follows; the original author expects the counts to be zero.

#since there are no missing values and null values in the data set there is no need for imputation



In [None]:
# Outlier treatment for item_cnt_month.
# A single mask drives both the report and the removal. Previously the report
# showed rows "> 1000" while the filter kept rows "< 1000", so rows equal to
# exactly 1000 were removed without ever being displayed.
is_outlier = final_dataset['item_cnt_month'] >= 1000

print("\n\nOutliers in item_cnt_month column:\n")
print(final_dataset[is_outlier])

# .copy() makes the filtered frame independent, so a later cell can assign
# into it without raising SettingWithCopyWarning.
final_dataset = final_dataset[~is_outlier].copy()

print("\n\nHead of final_dataset:\n")
print(final_dataset.head(20))
print(final_dataset.shape)

In [None]:
#Descriptive analytics

import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics of the numeric columns after the outlier removal.
print("\nDescriptive statistics of final_dataset:", final_dataset.describe(), sep="\n")

In [None]:
# Seasonality analysis: total item_cnt_month for each month_name value.
grouped_by_month_name = final_dataset.groupby(['month_name'])[['item_cnt_month']].sum()

print(
    "\n\nHead of grouped_by_month_name:\n",
    grouped_by_month_name,
    grouped_by_month_name.shape,
    sep="\n",
)


In [None]:
import statsmodels.api as sm

# Build a real monthly time series: total items sold per calendar month, in
# chronological order. The previous input was the 12 month-name totals pooled
# across all years decomposed with period=1, which has no seasonal component
# by construction.
sale_months = pd.to_datetime(final_dataset['date'], format='%d.%m.%Y').dt.to_period('M')
monthly_item_counts = final_dataset.groupby(sale_months)['item_cnt_month'].sum()
# A timestamp index is the form both statsmodels and matplotlib handle natively.
monthly_item_counts.index = monthly_item_counts.index.to_timestamp()

# Seasonal decomposition with a yearly cycle (12 monthly observations).
# seasonal_decompose needs at least two complete cycles of data.
yearly_period = 12
if len(monthly_item_counts) >= 2 * yearly_period:
    decomposition = sm.tsa.seasonal_decompose(
        monthly_item_counts, model='additive', period=yearly_period
    )

    # Plot the observed, trend, seasonal and residual components.
    fig = decomposition.plot()
    plt.show()
else:
    print(
        f"Seasonal decomposition skipped: {len(monthly_item_counts)} monthly "
        f"observations, at least {2 * yearly_period} are required."
    )

# Data visualization: total item count per month over time.
plt.figure(figsize=(20, 10))
plt.plot(monthly_item_counts.index, monthly_item_counts.to_numpy())
plt.title('Item Count Per Month')
plt.xlabel('Month')
plt.ylabel('Item Count')
plt.show()



In [None]:
# Regulatory analytics: total item_cnt_month per (shop_id, year_num) pair,
# recomputed on the dataset after the outlier removal.
grouped_by_shop_id_and_year_num = (
    final_dataset
    .groupby(['shop_id', 'year_num'])[['item_cnt_month']]
    .sum()
)

print(
    "\n\nHead of grouped_by_shop_id_and_year_num:\n",
    grouped_by_shop_id_and_year_num.head(60),
    sep="\n",
)

In [None]:
# Variable identification: split the column names by dtype family.
numerical_vars = final_dataset.select_dtypes(include=['int64', 'float64']).columns
categorical_vars = final_dataset.select_dtypes(include=['object', 'category']).columns

# Print both lists of column names.
print("Numerical Variables:", numerical_vars, sep="\n")
print("\nCategorical Variables:", categorical_vars, sep="\n")

In [None]:
#univariate analysis

# for column in final_dataset.columns:
#     # Check the data type of the variable
#     variable_type = final_dataset[column].dtype
    
#     # Summary Statistics
#     summary_stats = final_dataset[column].describe()
    
#     # Visualization
#     plt.figure(figsize=(10, 6))
    
#     # For numerical variables, create a histogram
#     if variable_type in ['int64', 'float64']:
#         sns.histplot(data=final_dataset, x=column, kde=True)
#         plt.title(f'Distribution of {column}')
#         plt.xlabel(column)
#         plt.ylabel('Frequency')
    
#     # For categorical variables, create a bar plot
#     else:
#         sns.countplot(data=final_dataset, x=column)
#         plt.title(f'Counts of {column}')
#         plt.xlabel(column)
#         plt.ylabel('Count')
    
#     plt.show()
    
#     # Print summary statistics
#     print(f"Summary Statistics for {column}:")
#     print(summary_stats)

In [None]:
# Bivariate analysis of two columns; the chart type follows their dtypes.

var1 = 'item_price'
var2 = 'item_cnt_month'

var1_type = final_dataset[var1].dtype
var2_type = final_dataset[var2].dtype


def _is_numeric(dtype):
    """Return True for the int64/float64 dtypes."""
    return dtype in ['int64', 'float64']


def _is_categorical(dtype):
    """Return True for the object/category dtypes."""
    return dtype in ['object', 'category']


def _finish_plot(title, y_label):
    """Add title, axis labels and grid to the current figure, then show it."""
    plt.title(title)
    plt.xlabel(var1)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()


if _is_numeric(var1_type) and _is_numeric(var2_type):
    # Numerical vs. numerical: scatter plot, followed by the correlation.
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=final_dataset, x=var1, y=var2)
    _finish_plot(f'Scatter Plot: {var1} vs. {var2}', var2)

    correlation = final_dataset[[var1, var2]].corr().iloc[0, 1]
    print(f'Correlation between {var1} and {var2}: {correlation:.2f}')
elif _is_categorical(var1_type) and _is_numeric(var2_type):
    # Categorical vs. numerical: box plot.
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=final_dataset, x=var1, y=var2)
    _finish_plot(f'Box Plot: {var1} vs. {var2}', var2)
elif _is_categorical(var1_type) and _is_categorical(var2_type):
    # Categorical vs. categorical: stacked bar chart of the cross-tabulation.
    crosstab = pd.crosstab(final_dataset[var1], final_dataset[var2])
    crosstab.plot(kind='bar', stacked=True, figsize=(10, 6))
    _finish_plot(f'Bar Plot: {var1} vs. {var2}', 'Count')


In [None]:
# Exploratory Data Analysis (EDA)

# Work on an independent copy so the EDA steps (currently commented out
# below) cannot modify final_dataset.
df = final_dataset.copy()

# # Display basic information about the dataset
# print("Dataset Overview:")
# print(df.info())

# # Summary statistics for numerical variables
# print("\nSummary Statistics for Numerical Variables:")
# print(df.describe())

# # Check for missing values
# print("\nMissing Values:")
# print(df.isnull().sum())

# # Visualize data distribution using histograms for numerical variables
# numerical_columns = ['month_name', 'year_num', 'shop_id', 'item_id', 'category_id', 'item_price', 'item_cnt_month', 'revenue', 'log_revenue', 'scaled_revenue']

# for column in numerical_columns:
#     plt.figure(figsize=(8, 4))
#     sns.histplot(data=df, x=column, kde=True, bins=20)
#     plt.title(f'Distribution of {column}')
#     plt.xlabel(column)
#     plt.ylabel('Frequency')
#     plt.show()

# # Visualize relationships between variables with a correlation matrix for numerical variables
# correlation_matrix = df[numerical_columns].corr()
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title("Correlation Heatmap for Numerical Variables")
# plt.show()

# # Explore categorical variables with bar plots
# categorical_columns = ['shop_name', 'item_name', 'item_category_name', 'price_range']

# for column in categorical_columns:
#     plt.figure(figsize=(10, 6))
#     sns.countplot(data=df, x=column)
#     plt.title(f'Counts of {column}')
#     plt.xlabel(column)
#     plt.ylabel('Count')
#     plt.xticks(rotation=45)
#     plt.show()



In [None]:
# Inferential analysis: one-sample t-test of a sample mean against a
# reference mean.

import scipy.stats as stats

# Placeholder data: 100 seeded draws from a normal distribution with mean 70
# and standard deviation 10 (replace this with your own data).
np.random.seed(42)
data = np.random.normal(loc=70, scale=10, size=100)
df = pd.DataFrame({'measurement': data})

# Sample statistics.
measurements = df['measurement']
sample_mean, sample_std = measurements.mean(), measurements.std()

# Hypothetical population mean used for the comparison.
population_mean = 75

# t-test of the sample mean against the population mean.
t_statistic, p_value = stats.ttest_1samp(measurements, population_mean)

# Report the results.
for report_line in (
    f"Sample Mean: {sample_mean:.2f}",
    f"Sample Standard Deviation: {sample_std:.2f}",
    f"Population Mean: {population_mean}",
    f"T-Statistic: {t_statistic:.2f}",
    f"P-Value: {p_value:.4f}",
):
    print(report_line)

# Significance decision at level alpha (adjust as needed).
alpha = 0.05
verdict = (
    "Reject the null hypothesis: The sample mean is statistically different from the population mean."
    if p_value < alpha
    else "Fail to reject the null hypothesis: There is no significant difference between the sample mean and the population mean."
)
print(verdict)



In [None]:
# Diagnostic analytics (placeholder: synthetic data, no model fitted yet).

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Seeded synthetic data: y = 3x + 2 plus standard-normal noise, x in [0, 10).
np.random.seed(42)
X = 10 * np.random.rand(100, 1)
y = 3 * X + 2 + np.random.randn(100, 1)

# Both columns flattened to 1-D and stored side by side.
df = pd.DataFrame({'X': X.ravel(), 'y': y.ravel()})

# NOTE(review): nothing is drawn between creating the figure and showing it,
# so this displays an empty figure — the diagnostic plots are still missing.
plt.figure(figsize=(12, 6))
plt.tight_layout()
plt.show()


In [None]:
# Qualitative analytics.

# 1. Number of rows per item category.
category_counts = final_dataset['item_category_name'].value_counts()
print(category_counts)

# 2. Cross-tabulation of shop against item category (row counts).
cross_tab = pd.crosstab(final_dataset['shop_name'], final_dataset['item_category_name'])
print(cross_tab)

# 3. Frequency of one price band. price_range is labelled with numeric bands
#    ('0-100', '100-200', ...); there is no 'Low' label, so counting 'Low'
#    always returned 0. The lowest band is counted instead.
target_price_range = '0-100'
category_frequency = (final_dataset['price_range'] == target_price_range).sum()
print(f"Frequency of '{target_price_range}' price range: {category_frequency}")

# 4. Grouping and aggregation: average item price per category.
average_price_per_category = final_dataset.groupby('item_category_name')['item_price'].mean()
print(average_price_per_category)

# 5. Bar chart of the category counts from step 1.
category_counts.plot(kind='bar', figsize=(10, 6))
plt.title('Item Category Counts')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()


In [None]:
#stationarity analysis

%pip install pandas numpy statsmodels

In [None]:
from statsmodels.tsa.stattools import adfuller

# Parse the day.month.year date strings into real datetimes.
parsed_dates = pd.to_datetime(final_dataset['date'], format='%d.%m.%Y')
final_dataset['date'] = parsed_dates

# One row per calendar month with the summed item count; the month period
# ends up in the 'date' column after reset_index().
month_of_sale = final_dataset['date'].dt.to_period('M')
monthly_data = (
    final_dataset
    .groupby(month_of_sale)[['item_cnt_month']]
    .sum()
    .reset_index()
)

# Check for stationarity using the Augmented Dickey-Fuller test
def adf_test(timeseries, alpha=0.05):
    """Run the Augmented Dickey-Fuller test and print a stationarity verdict.

    Prints the ADF statistic, the p-value, the critical values and whether the
    series is judged stationary. Nothing is returned.

    Args:
        timeseries: The observations to test, passed straight to ``adfuller``.
        alpha: Significance level for the verdict. The series is reported as
            stationary when the p-value is less than or equal to ``alpha``.
            Defaults to 0.05, the threshold this function used before it was
            made configurable.
    """
    # autolag='AIC' lets adfuller pick the lag length by information criterion.
    result = adfuller(timeseries, autolag='AIC')
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    print('ADF Statistic:', adf_statistic)
    print('p-value:', p_value)
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f'   {key}: {value}')

    # The ADF null hypothesis is that the series has a unit root
    # (is non-stationary).
    if p_value <= alpha:
        print("Stationary (Reject the null hypothesis)")
    else:
        print("Non-Stationary (Fail to reject the null hypothesis)")

# Example: check stationarity for 'item_cnt_month'.
item_cnt_month_series = monthly_data['item_cnt_month']

# Plot the monthly totals against their actual month. monthly_data['date']
# holds month periods, which are converted to timestamps for plotting.
# Plotting the series on its own would put the positional row index on the
# axis that is labelled 'Date'.
month_starts = monthly_data['date'].dt.to_timestamp()
plt.figure(figsize=(12, 6))
plt.plot(month_starts, item_cnt_month_series)
plt.title('Monthly Item Count Over Time')
plt.xlabel('Date')
plt.ylabel('Item Count')
plt.show()

# Perform the ADF test for stationarity.
adf_test(item_cnt_month_series)



In [None]:
# #autocorrelation analysis

# import pandas as pd

# # Set the 'date' column as the DataFrame's index
# final_dataset.set_index('date', inplace=True)

# # Sort the DataFrame by date if it's not already sorted
# final_dataset.sort_index(inplace=True)

# # Calculate the autocorrelation for the 'item_cnt_month' column
# autocorrelation = final_dataset['item_cnt_month'].autocorr()

# # Plot the autocorrelation function (ACF)
# plt.figure(figsize=(12, 6))
# pd.plotting.autocorrelation_plot(final_dataset['item_cnt_month'])
# plt.title(f'Autocorrelation for item_cnt_month (lag = 1), Autocorrelation = {autocorrelation:.2f}')
# plt.xlabel('Lag')
# plt.ylabel('Autocorrelation')
# plt.grid(True)
# plt.show()


In [None]:
# Preview at the end of the analysis section: first 20 rows, then the shape.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")

## Model Building

In [None]:
# Model input check: first 20 rows, the shape, then the column summary.
print("\n\nHead of final_dataset:\n", final_dataset.head(20), final_dataset.shape, sep="\n")
# Kept as its own statement: info() prints its report when it is called and
# returns None, so print() emits "None" after the report.
print(final_dataset.info())

### linear regression

In [None]:
# Linear regression

# Independent working copy of the prepared data for this model.
# NOTE(review): the training cell that follows reads final_dataset directly,
# so this copy is currently unused.
df_lin_reg = final_dataset.copy()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Feature selection.
# 'revenue' is deliberately NOT a feature: it is computed as
# item_cnt_month * item_price, so together with item_price it hands the model
# the target itself (target leakage) and makes the scores meaningless.
feature_columns = ['date_num', 'month_name', 'year_num', 'shop_id', 'item_id', 'category_id', 'item_price']

# date_num is still stored as text (sliced out of the date string), so every
# feature is cast to float explicitly rather than relying on implicit
# conversion inside scikit-learn.
X = final_dataset[feature_columns].astype('float64')
y = final_dataset['item_cnt_month']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only,
# so no test-set statistics reach the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an ordinary least-squares linear regression.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test_scaled)

# Evaluate the model on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)



### multiple linear regression

In [None]:
# Multiple linear regression

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_multi_linear = final_dataset.copy()

### SVM

In [None]:
# SVM

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_svm = final_dataset.copy()

### KNN

In [None]:
# KNN

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_knn = final_dataset.copy()

### Decision Tree

In [None]:
# Decision Tree

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_decision_tree = final_dataset.copy()

### random forest

In [None]:
# Random Forest

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_random_forest = final_dataset.copy()

### Extra Trees

In [None]:
# Extra Trees

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_extra_trees = final_dataset.copy()

### XGBoost

In [None]:
# XGBoost

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_xgboost = final_dataset.copy()

### ridge regression

In [None]:
# Ridge regression

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_ridge_regression = final_dataset.copy()

### lasso regression

In [None]:
# Lasso regression

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_lasso_regression = final_dataset.copy()

### ARIMA

In [None]:
# ARIMA

# Placeholder cell: only takes an independent working copy of the prepared
# data; no model is fitted here yet.
df_arima = final_dataset.copy()

## Data Visualization

In [None]:
# # Data Visualization
# # Histograms for numerical columns
# numerical_cols = final_dataset.select_dtypes(include=['number']).columns
# for col in numerical_cols:
#     plt.figure(figsize=(8, 4))
#     sns.histplot(data=final_dataset, x=col, kde=True)
#     plt.title(f'Distribution of {col}')
#     plt.show()

#     # Box plots for numerical columns
# for col in numerical_cols:
#     plt.figure(figsize=(8, 4))
#     sns.boxplot(data=final_dataset, y=col)
#     plt.title(f'Box plot of {col}')
#     plt.show()

# # Correlation Heatmap
# correlation_matrix = numerical_cols.corr()
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title("Correlation Heatmap")
# plt.show()