<a href="https://colab.research.google.com/github/Sifatkhan-1915020/deeplearning-/blob/main/Demand%20Forcasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'amazon-products-dataset-2023-1-4m-products:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3798081%2F7643327%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241012%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241012T152908Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D37a9410036954557b649b1484d97454b3f8918c8112afc77d7e4c6821e5b3e14cfb597f0455805412508169e981523cf547ae750fc8049a4df5e0387e9edfcb4720b7f5f28492c86dbc3902f5293445b6dc266e3038e2510eb389794173817359c9755f2a08d9a6027e7516d9c307ebf5b3ef0d306ef5cef69e02f374329fa7e0fa2c5eeff364cf8e6ed54340bdb3967df9eca7fa5f3c02c01241e63d63f8cbaca812fd77f3bdc41f6fcc7b3baf261a7095de9dad80e0bdbfa96113efc98538a5a2d0a0bacd3c9f68478773efbc51c246e5c1980890f2a8bb323bacc09d37339eba4c94f6b02989141708327fb075ce1ef4df6c8a8a1abc650ec7eea4947c6b3'

# Kaggle-style directory layout that the rest of the notebook reads from.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

# IPython shell escape (not plain Python): try to unmount any existing
# /kaggle/input mount, discarding the error output if nothing is mounted.
!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then (re)create both directories.
# NOTE(review): the path below is hard-coded instead of using KAGGLE_INPUT_PATH.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Create ../input and ../working symlinks pointing at the directories above;
# an already existing link is left as it is.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so the URL part stays whole even if it ever
    # contains an unescaped colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be absent; in that case the progress bar simply
            # stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must
            # be on disk before the archive is read.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would overwrite the module and break later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download URLs stop working once they expire.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


# # Amazon Products Dataset 2023
**This dataset contains information on approximately 1.4 million Amazon products, including details such as product titles, prices, ratings, and categories.**

***Import Libraries for Analysis***

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


**Import The Dataset**

In [None]:
# Load the category lookup table and preview its first five rows.
categories_csv = '/kaggle/input/amazon-products-dataset-2023-1-4m-products/amazon_categories.csv'
c_df = pd.read_csv(categories_csv)
c_df.head(n=5)

In [None]:
# Print pandas' structural summary of the category table.
c_df.info()

In [None]:
# Load the product table, print its structural summary, then preview the
# first five rows.
products_csv = '/kaggle/input/amazon-products-dataset-2023-1-4m-products/amazon_products.csv'
p_df = pd.read_csv(products_csv)
p_df.info()
p_df.head(n=5)

In [None]:

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
p_df.info()

In [None]:
# Count the missing values in each column of the product table.
p_df.isnull().sum()

In [None]:
# Replace missing product titles with a fixed placeholder string.
p_df['title'] = p_df['title'].fillna('Untitled Product')

> Removing the `reviews` column from the table, as it is not needed for this analysis.

In [None]:
# Drop the 'reviews' column from the product table in place.
p_df.drop(columns=['reviews'], inplace=True)

**Merging Categories and Products**

In [None]:
# Give the category table's key the same name as the products' foreign key
# ('category_id') so the two tables can be merged on it.
c_df = c_df.rename({'id': 'category_id'}, axis='columns')

In [None]:
# Left-join the categories onto the products so every product row is kept,
# then preview the combined table.
df = p_df.merge(c_df, how='left', on='category_id')
df.head()

In [None]:
# Render floats with two decimals, then show the summary statistics of the
# merged table.
two_decimals = '{:.2f}'.format
pd.set_option('display.float_format', two_decimals)
df.describe()

In [None]:
# Missing values still present in each column after the merge.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of distinct category ids among the products.
distinct_categories = df['category_id'].nunique()
print(distinct_categories)

# Smallest and largest value of boughtInLastMonth.
bought = df['boughtInLastMonth']
print(bought.min(), bought.max())

> Graphically showing the Top 10 Categories by Product Count using Bar Chart

In [None]:
# Horizontal bar chart of the ten categories with the most products.
custom_palette = sns.color_palette("Blues_r", n_colors=10)

plt.figure(figsize=(12, 6))
top_categories = df['category_name'].value_counts().nlargest(10)
# seaborn 0.13 deprecates `palette` without `hue`: map hue to the same
# variable as y and hide the redundant legend to keep one colour per bar.
sns.barplot(
    x=top_categories.values,
    y=top_categories.index,
    hue=top_categories.index,
    palette=custom_palette,
    legend=False,
)
plt.title('Top 10 Categories by Product Count')
plt.xlabel('Number of Products')
plt.ylabel('Category')
plt.show()

> Price Distribution by Category (Top 5 categories) using BoxPlot

In [None]:
# Box plot of prices, restricted to the five categories with the most products.
fig, ax = plt.subplots(figsize=(12, 6))
top_5_categories = df['category_name'].value_counts().nlargest(5).index
in_top_5 = df['category_name'].isin(top_5_categories)
sns.boxplot(data=df[in_top_5], x='category_name', y='price')
ax.set_title('Price Distribution by Category (Top 5)')
ax.set_xlabel('Category')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)
plt.show()

* The ‘Toys & Games’ category shows an outlier, which suggests that there might be a unique or premium product within that category.

>Average Rating vs. Best Seller Status

In [None]:
# Box plot of star ratings split by best-seller status.
# NOTE(review): this palette is assigned but not passed to the plot below.
custom_palette = sns.color_palette("Blues_r", n_colors=2)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='isBestSeller', y='stars')
ax.set_title('Average Rating vs. Best Seller Status')
ax.set_xlabel('Is Best Seller')
ax.set_ylabel('Rating (Stars)')
plt.show()

* Best sellers tend to have higher median ratings.
* Non-best sellers have more outliers, indicating greater variation in ratings.

> DASHBOARD

In [None]:
# Headline numbers shown in the bottom-right panel of the dashboard.
total_products = len(df)
avg_price = df['price'].mean()
# assumes isBestSeller is boolean / 0-1 so the sum is a count — TODO confirm
best_sellers = df['isBestSeller'].sum()
top_category = df['category_name'].mode()[0]

# 2x2 grid: three plots plus one text-only summary panel.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
fig.suptitle('Amazon Products Analysis Dashboard', fontsize=20)

# Top 10 Categories by Product Count
custom_palette = sns.color_palette("Blues_r", n_colors=10)

top_categories = df['category_name'].value_counts().nlargest(10)
# seaborn 0.13 deprecates `palette` without `hue`: map hue to the same
# variable as y and hide the redundant legend to keep one colour per bar.
sns.barplot(
    x=top_categories.values,
    y=top_categories.index,
    hue=top_categories.index,
    palette=custom_palette,
    legend=False,
    ax=axs[0, 0],
)
axs[0, 0].set_title('Top 10 Categories by Product Count')
axs[0, 0].set_xlabel('Number of Products')
axs[0, 0].set_ylabel('Category')


# Price Distribution by Category (Top 5)
top_5_categories = df['category_name'].value_counts().nlargest(5).index
sns.boxplot(x='category_name', y='price', data=df[df['category_name'].isin(top_5_categories)], ax=axs[0, 1])
axs[0, 1].set_title('Price Distribution by Category (Top 5)')
axs[0, 1].set_xlabel('Category')
axs[0, 1].set_ylabel('Price ($)')
axs[0, 1].tick_params(axis='x', rotation=45)

# Average Rating vs. Best Seller Status
sns.boxplot(x='isBestSeller', y='stars', data=df, ax=axs[1, 0])
axs[1, 0].set_title('Average Rating vs. Best Seller Status')
axs[1, 0].set_xlabel('Is Best Seller')
axs[1, 0].set_ylabel('Rating (Stars)')

# Summary statistics: the axes are hidden so only the text lines are visible.
axs[1, 1].axis('off')
axs[1, 1].text(0.1, 0.9, f'Total Products: {total_products:,}', fontsize=18)
axs[1, 1].text(0.1, 0.7, f'Average Price: ${avg_price:.2f}', fontsize=18)
axs[1, 1].text(0.1, 0.5, f'Number of Best Sellers: {best_sellers:,}', fontsize=18)
axs[1, 1].text(0.1, 0.3, f'Top Category: {top_category}', fontsize=18)

plt.tight_layout()
plt.show()

# > Conclusion
From this Amazon products dataset, I found that girls' clothing has the most products; it is the top category. The total number of best sellers is 8520. Best sellers tend to have higher median ratings, and non-best sellers have more outliers, indicating greater variation in ratings.
When it comes to price distribution by category, the ‘Toys & Games’ category has an outlier. It suggests that there might be a unique or premium product within the ‘Toys & Games’ category.