# Code

### Importing libraries and loading the dataset

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Read the pre-cleaned Diwali sales export into a DataFrame.
DIWALI_SALES_CSV = 'cleaned_Diwali_sales.csv'
diwali_sales = pd.read_csv(DIWALI_SALES_CSV)

### Checking and fixing any inconsistencies

In [3]:
# Summarise column names, non-null counts and dtypes of the freshly loaded frame.
diwali_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        11251 non-null  int64  
 1   User_ID           11251 non-null  int64  
 2   Cust_name         11251 non-null  object 
 3   Product_ID        11251 non-null  object 
 4   Gender            11251 non-null  object 
 5   Age Group         11251 non-null  object 
 6   Age               11251 non-null  int64  
 7   Marital_Status    11251 non-null  int64  
 8   State             11251 non-null  object 
 9   Zone              11251 non-null  object 
 10  Occupation        11251 non-null  object 
 11  Product_Category  11251 non-null  object 
 12  Orders            11251 non-null  int64  
 13  Amount            11239 non-null  float64
 14  age_group         11115 non-null  object 
dtypes: float64(1), int64(5), object(9)
memory usage: 1.3+ MB


In [4]:
# 'Unnamed: 0' is a leftover index that was written out as a regular column
# when the CSV was saved, so it carries no information and is removed.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (a plain drop would raise KeyError on the second execution).
diwali_sales = diwali_sales.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Spell out the single-letter gender codes as full labels.
gender_labels = {'F': 'Female', 'M': 'Male'}
diwali_sales['Gender'] = diwali_sales['Gender'].replace(gender_labels)

In [6]:
# Re-inspect the schema after dropping 'Unnamed: 0' and relabelling Gender.
diwali_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   User_ID           11251 non-null  int64  
 1   Cust_name         11251 non-null  object 
 2   Product_ID        11251 non-null  object 
 3   Gender            11251 non-null  object 
 4   Age Group         11251 non-null  object 
 5   Age               11251 non-null  int64  
 6   Marital_Status    11251 non-null  int64  
 7   State             11251 non-null  object 
 8   Zone              11251 non-null  object 
 9   Occupation        11251 non-null  object 
 10  Product_Category  11251 non-null  object 
 11  Orders            11251 non-null  int64  
 12  Amount            11239 non-null  float64
 13  age_group         11115 non-null  object 
dtypes: float64(1), int64(4), object(9)
memory usage: 1.2+ MB


In [7]:
# Compare the labels used by the two overlapping age-bucket columns.
# 'Age Group' has more buckets than 'age_group' and contains no null values.
for age_column in ('Age Group', 'age_group'):
    print(diwali_sales[age_column].unique())

['26-35' '0-17' '18-25' '55+' '46-50' '51-55' '36-45']
['19-30' '31-40' '11-18' '51-80' '41-50' nan]


In [8]:
# Remove the redundant 'age_group' column; 'Age Group' is kept because it is
# finer-grained and has no missing values.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (a plain drop would raise KeyError on the second execution).
diwali_sales = diwali_sales.drop(columns='age_group', errors='ignore')

In [9]:
# Spot-check one randomly chosen row after the column clean-up.
diwali_sales.sample(n=1)

Unnamed: 0,User_ID,Cust_name,Product_ID,Gender,Age Group,Age,Marital_Status,State,Zone,Occupation,Product_Category,Orders,Amount
3879,1005526,Black,P00109542,Male,36-45,43,0,Uttar Pradesh,Central,Hospitality,Sports Products,2,10865.0


In [10]:
# Turn the 0/1 marital-status flag into readable category labels.
marital_labels = {0: 'Not married', 1: 'Married'}
diwali_sales['Marital_Status'] = diwali_sales['Marital_Status'].replace(marital_labels)

In [11]:
# Spot-check one randomly chosen row to confirm the new Marital_Status labels.
diwali_sales.sample(n=1)

Unnamed: 0,User_ID,Cust_name,Product_ID,Gender,Age Group,Age,Marital_Status,State,Zone,Occupation,Product_Category,Orders,Amount
2606,1001451,Sink,P00016742,Female,36-45,41,Not married,Maharashtra,Western,Media,Games & Toys,2,13143.0


### First analysis

In [12]:
# Total purchase amount per 'Age Group' bucket, largest first.
total_spent_by_age_group = (
    diwali_sales
    .groupby('Age Group')['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values('Total spent', ascending=False)
)

In [13]:
# Total purchase amount per individual age, largest first.
total_spent_by_age = (
    diwali_sales
    .groupby('Age')['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values('Total spent', ascending=False)
)

In [20]:
# Display the per-age totals (already sorted by 'Total spent', descending).
total_spent_by_age

Unnamed: 0,Age,Total spent
18,30,4749880.00
14,26,4686508.99
16,28,4351575.45
23,35,4344816.00
17,29,4276587.50
...,...,...
48,60,72321.00
45,57,68553.00
79,91,66715.00
50,62,54842.00


In [31]:
# Histogram of total spend over age, with abbreviated value labels
# (".2s" format) drawn outside each bar.
fig_by_age = px.histogram(
    total_spent_by_age,
    x='Age',
    y='Total spent',
    text_auto=".2s",
).update_traces(textposition='outside')
fig_by_age

In [14]:
# Total purchase amount per gender, largest first.
total_spent_by_gender = (
    diwali_sales
    .groupby('Gender')['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values('Total spent', ascending=False)
)

In [15]:
# Total purchase amount per marital status, largest first.
total_spent_by_marital_status = (
    diwali_sales
    .groupby('Marital_Status')['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values('Total spent', ascending=False)
)

In [16]:
# Total purchase amount per product category, largest first.
total_spent_by_category = (
    diwali_sales
    .groupby('Product_Category')['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values('Total spent', ascending=False)
)

In [17]:
# Total purchase amount per state, largest first.
total_spent_by_state = (
    diwali_sales
    .groupby('State')['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values('Total spent', ascending=False)
)

In [18]:
# Total purchase amount per occupation, largest first.
total_spent_by_occupation = (
    diwali_sales
    .groupby('Occupation')['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values('Total spent', ascending=False)
)

In [19]:
# Total purchase amount per zone, largest first.
# Stored under its own name: the original assignment reused
# `total_spent_by_occupation`, which silently replaced the per-occupation
# totals with per-zone totals.
total_spent_by_zone = (
    diwali_sales
    .groupby(['Zone'])['Amount']
    .sum()
    .reset_index(name='Total spent')
    .sort_values(by='Total spent', ascending=False)
)