In [1]:
import pandas as pd
import numpy as np 
import random
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots

## Dataset Loading and Preparation

In [2]:
# Read the product catalogue and the ratings table from CSV files in the working directory.
products, ratings = (
    pd.read_csv(csv_name)
    for csv_name in ("bigbasket_products.csv", "ratings.csv")
)

In [3]:
# Give the four ratings columns explicit names (replaces whatever header read_csv produced).
rating_column_names = ['UserId', 'ProductId', 'Rating', 'Timestamp']
ratings.columns = rating_column_names

In [4]:
def print_top_unique(listItem, n):
    """Return the first ``n`` elements of ``listItem`` as a new list.

    Despite its name, this function prints nothing and does no de-duplication;
    callers pass an already-unique sequence.

    Parameters
    ----------
    listItem : sequence
        Any sliceable sequence (list, tuple, NumPy array, ...).
    n : int
        Maximum number of leading elements to keep.

    Returns
    -------
    list
        At most ``n`` leading elements, in their original order. When ``n``
        exceeds ``len(listItem)`` the whole sequence is returned instead of
        raising ``IndexError``; a non-positive ``n`` yields an empty list.
    """
    # Clamp at zero: a negative stop in a slice would mean "all but the last k",
    # whereas the intended result for n <= 0 is an empty list.
    return list(listItem[:max(n, 0)])

# Keep the first 27555 distinct product ids, in order of first appearance in `ratings`.
unique_product_ids = ratings['ProductId'].unique()
rating_filter_list = print_top_unique(unique_product_ids, 27555)

In [5]:
# Keep only the ratings whose product id was selected in rating_filter_list.
keep_product = ratings['ProductId'].isin(rating_filter_list)
ratings = ratings[keep_product]

# First 20000 distinct user ids among the remaining ratings.
distinct_user_ids = ratings['UserId'].unique()
rating_user_filter_list = print_top_unique(distinct_user_ids, 20000)

In [6]:
# Attach one distinct rated product id to each catalogue row, by position.
# NOTE(review): this requires len(products) to equal the number of distinct
# product ids left in `ratings`; otherwise the assignment raises.
products['ProductId'] = ratings['ProductId'].unique()

In [7]:
# Draw one synthetic user id in [1, 20000] (both ends inclusive) for every rating row.
# The count comes from the frame itself rather than a hard-coded row total, so the
# assignment cannot fail with a length mismatch when the filtered data changes size.
# NOTE(review): no random seed is set, so the ids differ on every run.
randomlist = [random.randint(1, 20000) for _ in range(len(ratings))]

# `assign` returns a new frame, so nothing is written into the filtered slice that
# `ratings` currently is; the original identifier and timestamp are then dropped.
ratings = ratings.assign(uid=randomlist).drop(columns=['UserId', 'Timestamp'])

# Drop the catalogue columns that are not used in the analysis.
products = products.drop(columns=['p_url', 'eancode'])

In [8]:
# Spot checks: ratings carrying synthetic uid 20000, then the catalogue row for one product id.
# Only the last expression of a cell is rendered, so just the product lookup is shown.
ratings.loc[ratings['uid'] == 20000]
products.loc[products['ProductId'] == 'B00004ZCJE']

Unnamed: 0.1,Unnamed: 0,product,category,sub_category,brand,sale_price,market_price,image_url,type,rating,description,ProductId
5129,5129,Underarm Roll On Deodorant For Women - Powder Dry,Beauty & Hygiene,Fragrances & Deos,Rexona,114.75,135.0,https://www.bigbasket.com/media/uploads/p/s/40...,Women's Deodorants,4.2,"Did you know, sweat trapped in your underarms ...",B00004ZCJE


In [None]:
# Mean rating per product id, computed in one grouped pass over `ratings`
# instead of re-scanning the whole ratings frame once per product.
mean_rating_by_product = ratings.groupby('ProductId')['Rating'].mean()

# Align the means with the catalogue rows, in catalogue order. A product id with
# no ratings maps to NaN rather than raising StatisticsError.
average_mean = products['ProductId'].map(mean_rating_by_product).tolist()

In [None]:
# Store the per-product mean, rounded to one decimal, as `overall_rating`.
average_mean = [round(value, 1) for value in average_mean]
products['overall_rating'] = average_mean

# The catalogue's own `rating` column is replaced by `overall_rating`.
products = products.drop(columns=['rating'])

# Remove every column whose name contains "unnamed" (case-insensitive).
unnamed_columns = products.columns[products.columns.str.contains('unnamed', case=False)]
products.drop(columns=unnamed_columns, inplace=True)

In [None]:
# Display the catalogue after the rating and column clean-up above.
products

## Exploratory data analysis

In [None]:
# Summary statistics of the catalogue columns.
products.describe()

In [None]:
# Column dtypes and non-null counts.
products.info()

In [None]:
# Number of missing values per column, before any rows are removed.
products.isnull().sum()

In [None]:
# Discard catalogue rows that have no product name (the row index is not reset).
products = products.dropna(subset=['product'])

In [None]:
# Re-check missing values per column after dropping rows with no product name.
products.isnull().sum()

In [None]:
# Distinct values of the `type` column.
products['type'].unique()

In [None]:

# Side-by-side bar charts per category: number of catalogue rows (left) and
# summed sale_price (right).
# px.bar keeps its title in the figure layout, and only the bar trace is copied
# into the combined figure below, so the panel titles are supplied through
# `subplot_titles` instead of being silently dropped.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Category wise sales count", "Category wise Total sales"),
)

# Left panel: value_counts already orders categories from most to least frequent.
category_counts = products['category'].value_counts()
category_counts_df = pd.DataFrame({'Category': category_counts.index, "product": category_counts.values})
fig1 = px.bar(category_counts_df, x="Category", y="product", color='product',
              color_continuous_scale='oranges', text_auto=True)

# Right panel: total sale_price per category, largest first.
category_amount_sales = products.groupby('category')['sale_price'].sum()
category_amount_sales = pd.DataFrame({'Category': category_amount_sales.index, "sales_price": category_amount_sales.values})
category_amount_sales = category_amount_sales.sort_values('sales_price', ascending=False)
fig2 = px.bar(category_amount_sales, x='Category', y="sales_price", color='sales_price',
              color_continuous_scale='oranges', text_auto=True)

fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.update_layout(
    title='Category wise sales count and total sales chart',
)

fig.show()

In [None]:
# Side-by-side bar charts for the top 15 sub-categories: number of catalogue rows
# (left) and summed sale_price (right).
# px.bar keeps its title in the figure layout, and only the bar trace is copied
# into the combined figure below, so the panel titles are supplied through
# `subplot_titles` instead of being silently dropped.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Sub-category wise sales count", "Sub-category wise Total sales"),
)

# Left panel: value_counts is already ordered most-frequent-first, so the first
# 15 rows are the 15 most frequent sub-categories.
category_counts = products['sub_category'].value_counts()
category_counts_df = pd.DataFrame({'sub_category': category_counts.index, "Value": category_counts.values})[0:15]
fig1 = px.bar(category_counts_df, x="sub_category", y="Value", color='Value',
              color_continuous_scale='oranges', text_auto=True)

# Right panel: groupby returns groups in key order, so the totals must be sorted
# BEFORE taking the first 15; slicing first would keep the first 15 sub-categories
# in key order rather than the 15 largest totals.
category_amount_sales = products.groupby('sub_category')['sale_price'].sum()
category_amount_sales = pd.DataFrame({'sub_category': category_amount_sales.index, "sales_price": category_amount_sales.values})
category_amount_sales = category_amount_sales.sort_values('sales_price', ascending=False)[0:15]
fig2 = px.bar(category_amount_sales, x='sub_category', y="sales_price", color='sales_price',
              color_continuous_scale='oranges', text_auto=True)

fig.add_trace(fig1['data'][0], row=1, col=1)
fig.add_trace(fig2['data'][0], row=1, col=2)
fig.update_layout(
    title='Sub-category wise sales count and total sales chart',
)

fig.show()

In [None]:
# Summed sale_price per brand as a two-column frame (brand, sales_price).
brand_totals = (
    products.groupby('brand')['sale_price']
    .sum()
    .rename('sales_price')
    .reset_index()
)

# Keep the 30 brands with the largest totals, largest first.
category_amount_sales = brand_totals.sort_values('sales_price', ascending=False).head(30)

fig = px.bar(
    category_amount_sales,
    x='brand',
    y="sales_price",
    color='sales_price',
    title="Brand wise Total sales",
    color_continuous_scale='oranges',
    text_auto=True,
)
fig.show()

In [None]:
# Summed sale_price per product name as a two-column frame (product, sales_price).
product_totals = (
    products.groupby('product')['sale_price']
    .sum()
    .rename('sales_price')
    .reset_index()
)

# Keep the 20 product names with the largest totals, largest first.
category_amount_sales = product_totals.sort_values('sales_price', ascending=False).head(20)

fig = px.bar(
    category_amount_sales,
    x='product',
    y="sales_price",
    color='sales_price',
    title="Product wise Total sales",
    color_continuous_scale='oranges',
    text_auto=True,
)
fig.show()

In [None]:
# Number of products at each overall_rating value, as a two-column frame (rating, Value).
rating_counts = (
    products['overall_rating']
    .value_counts()
    .rename_axis('rating')
    .reset_index(name='Value')
)

# Most common rating first.
category_amount_sales = rating_counts.sort_values('Value', ascending=False)

fig = px.bar(
    category_amount_sales,
    x='rating',
    y="Value",
    color='Value',
    title="Rating wise product count",
    color_continuous_scale='oranges',
    text_auto=True,
)
fig.show()

In [None]:
# Rows whose market price and sale price differ (any non-zero gap, in either direction).
price_gap = products['market_price'].sub(products['sale_price'])
discounted_product = products[price_gap.ne(0)]
discounted_product

In [None]:
# Products whose overall rating is strictly greater than 4.
highrated = products[products['overall_rating'] > 4]
print("Number of products with more than 4 rating is", len(highrated))

In [None]:
# Overlay the rating density of discounted products on that of the whole catalogue.
fig = plt.figure(figsize=(20,10))

# `fill=True` replaces the deprecated `shade=True`. Each curve gets a label so that
# the legend below has entries to show; without labels it has nothing to display.
sns.kdeplot(discounted_product['overall_rating'], fill=True, label="Discounted products")
sns.kdeplot(products['overall_rating'], fill=True, label="All products")
plt.xlabel("Ratings",fontsize=15, weight='semibold')
plt.ylabel("Density",fontsize=15, weight='semibold')
plt.title("Relative distribution of all products with discounted products",fontsize=15, weight='semibold')
fig.legend()