In [136]:
import numpy as np
import pandas as pd
from datetime import datetime
import plotly.express as px

In [137]:
campaign_data = pd.read_csv('./dataset/dim_campaigns.csv')
product_data=pd.read_csv('./dataset/dim_products.csv')
stores_data=pd.read_csv('./dataset/dim_stores.csv')

event_data=pd.read_csv('./dataset/fact_events.csv')

In [138]:
campaign_data.info()

campaign_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   campaign_id    2 non-null      object
 1   campaign_name  2 non-null      object
 2   start_date     2 non-null      object
 3   end_date       2 non-null      object
dtypes: object(4)
memory usage: 196.0+ bytes


Unnamed: 0,campaign_id,campaign_name,start_date,end_date
0,CAMP_DIW_01,Diwali,12-11-2023,18-11-2023
1,CAMP_SAN_01,Sankranti,10-01-2024,16-01-2024


In [139]:
#checking the datatype of the date columns

print(type(campaign_data.start_date[0]))
print(type( campaign_data.end_date))

<class 'str'>
<class 'pandas.core.series.Series'>


In [140]:
#changing the datatype of the date columns to datetime
campaign_data['start_date'] = campaign_data['start_date'].apply(pd.to_datetime, format = "%d-%m-%Y")
campaign_data['end_date'] = campaign_data['end_date'].apply(pd.to_datetime, format = "%d-%m-%Y")
print(type(campaign_data.start_date[0]))
print(type( campaign_data.end_date[1]))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [141]:
campaign_data.info()
campaign_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   campaign_id    2 non-null      object        
 1   campaign_name  2 non-null      object        
 2   start_date     2 non-null      datetime64[ns]
 3   end_date       2 non-null      datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 196.0+ bytes


Unnamed: 0,campaign_id,campaign_name,start_date,end_date
0,CAMP_DIW_01,Diwali,2023-11-12,2023-11-18
1,CAMP_SAN_01,Sankranti,2024-01-10,2024-01-16


In [142]:
product_data.info()

product_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_code  15 non-null     object
 1   product_name  15 non-null     object
 2   category      15 non-null     object
dtypes: object(3)
memory usage: 492.0+ bytes


Unnamed: 0,product_code,product_name,category
0,P01,Atliq_Masoor_Dal (1KG),Grocery & Staples
1,P02,Atliq_Sonamasuri_Rice (10KG),Grocery & Staples
2,P03,Atliq_Suflower_Oil (1L),Grocery & Staples
3,P04,Atliq_Farm_Chakki_Atta (1KG),Grocery & Staples
4,P05,Atliq_Scrub_Sponge_For_Dishwash,Home Care


In [143]:
stores_data.info()
stores_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   store_id  50 non-null     object
 1   city      50 non-null     object
dtypes: object(2)
memory usage: 932.0+ bytes


Unnamed: 0,store_id,city
0,STTRV-0,Trivandrum
1,STMDU-3,Madurai
2,STHYD-6,Hyderabad
3,STVSK-1,Visakhapatnam
4,STCBE-3,Coimbatore


In [144]:
event_data.info()
event_data.head()
event_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   event_id                     1500 non-null   object
 1   store_id                     1500 non-null   object
 2   campaign_id                  1500 non-null   object
 3   product_code                 1500 non-null   object
 4   base_price                   1500 non-null   int64 
 5   promo_type                   1500 non-null   object
 6   quantity_sold(before_promo)  1500 non-null   int64 
 7   quantity_sold(after_promo)   1500 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 93.9+ KB


Unnamed: 0,base_price,quantity_sold(before_promo),quantity_sold(after_promo)
count,1500.0,1500.0,1500.0
mean,551.966667,139.366667,290.315333
std,741.394953,131.596829,363.444413
min,50.0,10.0,9.0
25%,110.0,45.0,74.0
50%,295.0,78.0,163.0
75%,860.0,217.25,337.0
max,3000.0,642.0,2067.0


We can see that on average the quantity after being sold are more by around 150 units after promotions, in paticular the maximum units sold are a lot higher.

In [145]:
import plotly.graph_objects as go


In [146]:
agg_df = event_data.groupby('base_price', as_index=False).agg({
    'quantity_sold(before_promo)': 'mean',
    'quantity_sold(after_promo)': 'mean'
})


agg_df = agg_df.sort_values('base_price')

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=agg_df['base_price'],
    y=agg_df['quantity_sold(before_promo)'],
    mode='lines+markers',
    name='Before Promo',
    line=dict(color='blue', width=3, shape='spline'),
    marker=dict(size=7, symbol='circle')
))

fig.add_trace(go.Scatter(
    x=agg_df['base_price'],
    y=agg_df['quantity_sold(after_promo)'],
    mode='lines+markers',
    name='After Promo',
    line=dict(color='red', width=3, shape='spline'),
    marker=dict(size=7, symbol='diamond')
))

# Layout polish
fig.update_layout(
    title='Base Price vs Average Quantity Sold (Before & After Promo)',
    xaxis_title='Base Price',
    yaxis_title='Average Quantity Sold',
    template='plotly_white',
    legend_title='Legend',
    hovermode='x unified',
    font=dict(size=14),
    width=900,
    height=500
)

fig.show()

Here we can see that 

In [147]:
fig=px.histogram(event_data,x='promo_type',title='Distribution of Promotion Types',y='quantity_sold(after_promo)')
fig.show()

In [148]:
if 'city' in event_data.columns:
    event_data = event_data.drop(columns=['city'])

event_data = event_data.merge(
    stores_data[['store_id', 'city']],
    on='store_id',
    how='left'
)
event_data


Unnamed: 0,event_id,store_id,campaign_id,product_code,base_price,promo_type,quantity_sold(before_promo),quantity_sold(after_promo),city
0,7f650b,STCBE-2,CAMP_SAN_01,P11,190,50% OFF,34,52,Coimbatore
1,a21f91,STBLR-8,CAMP_DIW_01,P03,156,25% OFF,393,322,Bengaluru
2,78bc80,STVJD-0,CAMP_SAN_01,P07,300,BOGOF,22,85,Vijayawada
3,a1503f,STCBE-1,CAMP_DIW_01,P15,3000,500 Cashback,329,1000,Coimbatore
4,1091cf,STBLR-6,CAMP_DIW_01,P05,55,25% OFF,108,93,Bengaluru
...,...,...,...,...,...,...,...,...,...
1495,1e8961,STBLR-3,CAMP_SAN_01,P12,62,50% OFF,61,84,Bengaluru
1496,f957f1,STMDU-1,CAMP_SAN_01,P10,50,25% OFF,22,18,Madurai
1497,a9ae21,STCBE-0,CAMP_DIW_01,P12,62,50% OFF,80,119,Coimbatore
1498,e5d28d,STVSK-0,CAMP_SAN_01,P13,350,BOGOF,73,282,Visakhapatnam


In [149]:
def clean_data(event_data):
    # Drop column: 'city_y'
    event_data = event_data.drop(columns=['city_y'])
    # Drop column: 'city_x'
    event_data = event_data.drop(columns=['city_x'])
    return event_data

event_data_clean = clean_data(event_data.copy())
event_data_clean.head()

KeyError: "['city_y'] not found in axis"

In [150]:
total_quantity_sold = event_data_clean['quantity_sold(before_promo)'] + event_data_clean['quantity_sold(after_promo)']
event_data_clean['total_quantity_sold'] = total_quantity_sold
event_data_clean.head()


Unnamed: 0,event_id,store_id,campaign_id,product_code,base_price,promo_type,quantity_sold(before_promo),quantity_sold(after_promo),city,total_quantity_sold
0,7f650b,STCBE-2,CAMP_SAN_01,P11,190,50% OFF,34,52,Coimbatore,86
1,a21f91,STBLR-8,CAMP_DIW_01,P03,156,25% OFF,393,322,Bengaluru,715
2,78bc80,STVJD-0,CAMP_SAN_01,P07,300,BOGOF,22,85,Vijayawada,107
3,a1503f,STCBE-1,CAMP_DIW_01,P15,3000,500 Cashback,329,1000,Coimbatore,1329
4,1091cf,STBLR-6,CAMP_DIW_01,P05,55,25% OFF,108,93,Bengaluru,201


In [151]:
print(event_data_clean.city.unique())

['Coimbatore' 'Bengaluru' 'Vijayawada' 'Visakhapatnam' 'Hyderabad'
 'Madurai' 'Mysuru' 'Chennai' 'Trivandrum' 'Mangalore']


In [152]:
# Load your event data and city coordinates data
city_coords = pd.read_csv('./dataset/Indian_cities.csv')
city_coords.head()


Unnamed: 0,name_of_city,state_code,state_name,dist_code,population_total,population_male,population_female,0-6_population_total,0-6_population_male,0-6_population_female,...,literates_female,sex_ratio,child_sex_ratio,effective_literacy_rate_total,effective_literacy_rate_male,effective_literacy_rate_female,location,total_graduates,male_graduates,female_graduates
0,Abohar,3,PUNJAB,9,145238,76840,68398,15870,8587,7283,...,44972,890,848,79.86,85.49,73.59,"30.1452928,74.1993043",16287,8612,7675
1,Achalpur,27,MAHARASHTRA,7,112293,58256,54037,11810,6186,5624,...,43086,928,909,91.99,94.77,89.0,"21.257584,77.5086754",8863,5269,3594
2,Adilabad,28,ANDHRA PRADESH,1,117388,59232,58156,13103,6731,6372,...,37660,982,947,80.51,88.18,72.73,"19.0809075,79.560344",10565,6797,3768
3,Adityapur,20,JHARKHAND,24,173988,91495,82493,23042,12063,10979,...,54515,902,910,83.46,89.98,76.23,"22.7834741,86.1576889",19225,12189,7036
4,Adoni,28,ANDHRA PRADESH,21,166537,82743,83794,18406,9355,9051,...,45089,1013,968,68.38,76.58,60.33,"15.6322227,77.2728368",11902,7871,4031


In [193]:
# --- Step 1: Fix city names in city_coords ---
city_name_map = {
    "mysore": "Mysuru",
    "bangalore": "Bengaluru",
    "bengalooru": "Bengaluru",
    "thiruvananthapuram": "Trivandrum",
    "trivandram": "Trivandrum",
    "greater hyderabad": "Hyderabad",
    "hyderabad city": "Hyderabad",
    "madras": "Chennai",
    "coimbatore city": "Coimbatore",
    "vijayawada town": "Vijayawada",
    "visakhapatnam city": "Visakhapatnam",
    "madurai city": "Madurai",
    "mangalore city": "Mangalore"
}

city_coords['name_of_city'] = city_coords['name_of_city'].replace(city_name_map)



In [194]:
# --- Step 2: Merge city coordinates with event data ---
merged_data = pd.merge(
    event_data_clean, 
    city_coords[['name_of_city', 'location']], 
    left_on='city', 
    right_on='name_of_city', 
    how='left'
)


In [195]:
print(event_data_clean['city'].unique())



['coimbatore' 'bengaluru' 'vijayawada' 'visakhapatnam' 'hyderabad'
 'madurai' 'mysuru' 'chennai' 'trivandrum' 'mangalore']


In [196]:
print(city_coords['name_of_city'].unique())

['abohar' 'achalpur' 'adilabad' 'adityapur' 'adoni' 'agartala' 'agra'
 'ahmadabad' 'ahmadnagar' 'aizawl' 'ajmer' 'akbarpur' 'akola' 'alandur'
 'alappuzha' 'aligarh' 'allahabad' 'alwar' 'ambala' 'ambala sadar'
 'ambarnath' 'ambattur' 'ambikapur' 'ambur' 'amravati' 'amreli' 'amritsar'
 'amroha' 'anand' 'anantapur' 'anantnag' 'arrah' 'asansol'
 'ashoknagar kalyangarh' 'aurangabad' 'avadi' 'azamgarh' 'badlapur'
 'bagaha' 'bagalkot' 'bahadurgarh' 'baharampur' 'bahraich' 'baidyabati'
 'baleshwar town' 'ballia' 'bally' 'bally city' 'balurghat' 'banda'
 'bankura' 'bansberia' 'banswara' 'baran' 'baranagar' 'barasat' 'baraut'
 'barddhaman' 'bareilly' 'baripada town' 'barnala' 'barrackpur' 'barshi'
 'basirhat' 'basti' 'batala' 'bathinda' 'beawar' 'begusarai' 'belgaum'
 'bellary' 'bengaluru' 'bettiah' 'betul' 'bhadrak' 'bhadravati'
 'bhadreswar' 'bhagalpur' 'bhalswa jahangir pur' 'bharatpur' 'bharuch'
 'bhatpara' 'bhavnagar' 'bhilai nagar' 'bhilwara' 'bhimavaram' 'bhind'
 'bhiwadi' 'bhiwandi' 'bhi

In [197]:
event_data_clean['city'] = event_data_clean['city'].str.strip().str.lower()
city_coords['name_of_city'] = city_coords['name_of_city'].str.strip().str.lower()


In [198]:
# --- Step 3: Aggregate total quantity sold per city ---
city_sales = merged_data.groupby(
    ['city', 'location'], 
    as_index=False
)['total_quantity_sold'].sum()


In [199]:
city_sales[['lat', 'lng']] = city_sales['location'].str.split(',', expand=True).astype(float)
city_sales
#split the location column into lat and lng

Unnamed: 0,city,location,total_quantity_sold,lat,lng
0,bengaluru,"12.9716,77.5946",154312,12.9716,77.5946
1,chennai,"13.0826802,80.2707184",122778,13.08268,80.270718
2,coimbatore,"11.0168445,76.9558321",57050,11.016844,76.955832
3,hyderabad,"17.3850,78.4867",103762,17.385,78.4867
4,madurai,"9.9252007,78.1197754",45627,9.925201,78.119775
5,mangalore,"12.9141417,74.8559568",22458,12.914142,74.855957
6,mysuru,"12.2958104,76.6393805",56039,12.29581,76.639381
7,trivandrum,"8.5241391,76.9366376",15003,8.524139,76.936638
8,vijayawada,"16.5061743,80.6480153",16403,16.506174,80.648015
9,visakhapatnam,"17.6868,83.2185",51091,17.6868,83.2185


In [200]:
# --- Step 4: Plot the bubble map for India ---
fig = px.scatter_geo(
    city_sales,
    lat='lat',
    lon='lng',
    scope='asia',  # zooms in on Asia; we'll focus on India
    hover_name='city',
    size='total_quantity_sold',
    projection='natural earth',
    title='Total Sales per City in India',
    size_max=40  # adjust max bubble size
)

In [201]:
fig.update_geos(
    showcountries=True, countrycolor="Black",
    showsubunits=True, subunitcolor="Gray",
    lataxis_range=[6, 38], lonaxis_range=[68, 98]  # India lat/lon bounds
)

fig.show()