# **SALES DATA ANALYSIS AND FORECASTING**

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive;
# the dataset path used by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the retail sales CSV on the mounted Drive
# (split across two adjacent literals purely for readability).
data_path = (
    '/content/drive/MyDrive/Colab Notebooks/Algonive_Internship/'
    'SALES DATA ANALYSIS AND FORECASTING/retail_sales_dataset.csv'
)

In [None]:
# Data handling and numerics.
import pandas as pd
import numpy as np
# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings to keep the notebook output clean.
# NOTE(review): this also hides deprecation notices from the libraries above.
warnings.filterwarnings('ignore')


In [None]:
# Read the CSV at data_path into the working DataFrame used by every later cell.
df = pd.read_csv(data_path)

## **Inspect Dataset**

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Preview the last rows as well, to spot truncation or trailing junk.
df.tail()

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Column names, non-null counts and dtypes in one summary.
df.info()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the columns pandas describes by default.
df.describe()

## **Data Cleaning**

In [None]:
# Count missing values per column. The raw boolean mask returned by
# isnull() has one cell per value and cannot be read at a glance.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row. The
# per-row boolean Series from duplicated() does not answer "are there any?".
df.duplicated().sum()

In [None]:
# Drop duplicate rows, if there are any (no-op when every row is unique).
df = df.drop_duplicates()


In [None]:
# Convert the date column to datetime.
# The headers are normalised first: every later cell addresses columns by
# their lower-case, underscore-separated names, and looking up 'date' on
# un-normalised headers raises KeyError (assumes the raw header is spelled
# e.g. 'Date' -- confirm against the CSV). The normalisation is idempotent,
# so running it again afterwards is harmless.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Normalise headers: trim surrounding whitespace, lower-case, spaces -> underscores.
df.columns = [
    header.strip().lower().replace(" ", "_")
    for header in df.columns
]


In [None]:
# Re-check column names and dtypes after the cleaning steps.
df.info()

## **Feature Creation**

In [None]:
# Derive calendar parts from the datetime 'date' column.
# The right-hand side is evaluated in full before the columns are added
# left to right, so the column order is year, month, month_name, day.
df['year'], df['month'], df['month_name'], df['day'] = (
    df['date'].dt.year,
    df['date'].dt.month,
    df['date'].dt.month_name(),
    df['date'].dt.day,
)

In [None]:
# Preview the first rows again to confirm the new date-part columns.
df.head()

## **Univariate Analysis**

In [None]:
# List the column names available for the analysis below.
df.columns

In [None]:
# Total Sales: summary statistics of the total_amount column.
df['total_amount'].describe()

In [None]:
# Histogram of total_amount using 10 bins.
plt.hist(x=df['total_amount'], bins=10)
plt.gca().set(
    title="Distribution of total sale",
    xlabel='Total Sales',
    ylabel="frequency",
)
plt.show()

In [None]:
# Product Category: number of rows per category.
df['product_category'].value_counts()

In [None]:

# Bar chart of the number of rows per product category.
# A histogram bins a numeric axis; on a string column its bars do not
# line up with the category ticks, so the counts are plotted directly.
df['product_category'].value_counts().plot(kind='bar', rot=0)
plt.title("Distribution of Product Category")
plt.xlabel('Product Category')
plt.ylabel("frequency")
plt.show()

In [None]:
# Gender: number of rows per gender value.
df['gender'].value_counts()

In [None]:
# Bar chart of the number of rows per gender value.
# A histogram bins a numeric axis; on a string column its bars do not
# line up with the category ticks, so the counts are plotted directly.
df['gender'].value_counts().plot(kind='bar', rot=0)
plt.title("Distribution of gender")
plt.xlabel('gender')
plt.ylabel("frequency")
plt.show()

## **Bivariate Analysis**

In [None]:
# Monthly Sales Trend: total_amount summed per month number, drawn as a line.
# NOTE(review): grouping on 'month' alone merges the same month from
# different years -- confirm the data covers a single year.
monthly_sales = df.groupby('month')['total_amount'].agg('sum')

monthly_sales.plot.line()
plt.gca().set(title="Monthly Sales Trend", xlabel="Month", ylabel="Total Sales")
plt.show()


In [None]:
# Sales by product category: total_amount summed per category, as a bar chart.
category_sales = df.groupby('product_category')['total_amount'].agg('sum')

category_sales.plot.bar()
plt.gca().set(
    title="Sales by Product Category",
    xlabel="Product Category",
    ylabel="Total Sales",
)
plt.show()


In [None]:
# Quantity vs Sales: total_amount summed for each distinct quantity value.
(
    df.groupby('quantity')['total_amount']
    .sum()
    .plot.bar()
)
plt.gca().set(title="Quantity vs Total Sales", xlabel="Quantity", ylabel="Total Sales")
plt.show()


## **Multivariate Analysis**

In [None]:
# Sales by category and month: rows are month numbers, columns are product
# categories, cells hold the summed total_amount.
category_month_sales = (
    df.groupby(['month', 'product_category'])['total_amount']
    .sum()
    .unstack(level='product_category')
)
category_month_sales


In [None]:
# One line per product category across the month numbers.
category_month_sales.plot.line()
plt.gca().set(
    title="Monthly Sales by Product Category",
    xlabel="Month",
    ylabel="Total Sales",
)
plt.show()


## **Sales Trend Analysis**

In [None]:
# Monthly sales: total_amount summed per month number.
# NOTE(review): the same month from different years is merged -- confirm
# the data covers a single year.
monthly_sales = df.groupby('month')['total_amount'].agg('sum')
monthly_sales


In [None]:
# Line chart of the monthly totals with a marker on every month.
monthly_sales.plot.line(marker='o')
plt.gca().set(
    title="Overall Monthly Sales Trend",
    xlabel="Month",
    ylabel="Total Sales",
)
plt.show()


In [None]:
# Best and worst month: index labels of the largest and smallest monthly totals.
best_month, worst_month = monthly_sales.idxmax(), monthly_sales.idxmin()

(best_month, worst_month)


## **Visualization**

In [None]:
# Total sales by product category, as sky-blue bars.
(
    df.groupby('product_category')['total_amount']
    .sum()
    .plot.bar(color='skyblue')
)
plt.gca().set(
    title="Total Sales by Product Category",
    xlabel="Product Category",
    ylabel="Total Sales",
)
plt.show()


In [None]:
# Sales trend over the month numbers: green line with point markers.
# NOTE(review): the same month from different years is merged -- confirm
# the data covers a single year.
(
    df.groupby('month')['total_amount']
    .sum()
    .plot.line(marker='o', color='green')
)
plt.gca().set(title="Monthly Sales Trend", xlabel="Month", ylabel="Total Sales")
plt.show()


In [None]:
# Top 5 customers by total purchase: sum total_amount per customer_id,
# order from largest to smallest and keep the first five.
top_customers = (
    df.groupby('customer_id')['total_amount']
    .sum()
    .sort_values(ascending=False)
    .iloc[:5]
)

top_customers.plot.bar(color='orange')
plt.gca().set(
    title="Top 5 Customers by Total Purchase",
    xlabel="Customer ID",
    ylabel="Total Sales",
)
plt.show()


In [None]:
# Gender-wise sales: share of the summed total_amount per gender, as a pie
# chart with one-decimal percentage labels.
(
    df.groupby('gender')['total_amount']
    .sum()
    .plot.pie(autopct='%1.1f%%', startangle=90)
)
# The empty ylabel removes the unnecessary y-label next to the pie.
plt.gca().set(title="Sales by Gender", ylabel="")
plt.show()
