In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [2]:
from plotly.subplots import make_subplots
import numpy as np

In [3]:
results = pd.read_csv('results.csv')
advertisers = pd.read_csv('advertisers.csv')
locations = pd.read_csv('locations.csv')

results.head()

Unnamed: 0,_id,Sl No,State,PC_Name,Total Electors,Polled (%),Total Votes,Phase
0,1,1.0,Andaman & Nicobar Islands,Andaman & Nicobar Islands,315148,64.1,202018,1.0
1,2,2.0,Arunachal Pradesh,Arunachal East,375310,83.31,312658,1.0
2,3,3.0,Arunachal Pradesh,Arunachal West,517384,73.6,380783,1.0
3,4,4.0,Assam,Dibrugarh,1659588,76.75,1273744,1.0
4,5,5.0,Assam,Jorhat,1727121,79.89,1379749,1.0


In [4]:
advertisers.head()

Unnamed: 0,Page ID,Page name,Disclaimer,Amount spent (INR),Number of ads in Library
0,121440000000000.0,Bharatiya Janata Party (BJP),Bharatiya Janata Party (BJP),193854342,43455
1,351616000000000.0,Indian National Congress,Indian National Congress,108787100,846
2,132715000000000.0,Ama Chinha Sankha Chinha,Ama Chinha Sankha Chinha,73361399,1799
3,192856000000000.0,Ama Chinha Sankha Chinha,Ama Chinha Sankha Chinha,32294327,680
4,109470000000000.0,Ellorum Nammudan,Populus Empowerment Network Private Limited,22399499,879


In [5]:
results.head()

Unnamed: 0,_id,Sl No,State,PC_Name,Total Electors,Polled (%),Total Votes,Phase
0,1,1.0,Andaman & Nicobar Islands,Andaman & Nicobar Islands,315148,64.1,202018,1.0
1,2,2.0,Arunachal Pradesh,Arunachal East,375310,83.31,312658,1.0
2,3,3.0,Arunachal Pradesh,Arunachal West,517384,73.6,380783,1.0
3,4,4.0,Assam,Dibrugarh,1659588,76.75,1273744,1.0
4,5,5.0,Assam,Jorhat,1727121,79.89,1379749,1.0


The results data has a column named state, and the location data has a column named location name. We will merge these datasets using these columns:

In [6]:
results['State'] = results['State'].str.strip().str.lower()

locations['Location name'] = locations['Location name'].str.strip().str.lower()

merged_data = results.merge(
    locations,
    left_on='State',
    right_on='Location name',
    how='left'
)

merged_data.head()

Unnamed: 0,_id,Sl No,State,PC_Name,Total Electors,Polled (%),Total Votes,Phase,Location name,Amount spent (INR)
0,1,1.0,andaman & nicobar islands,Andaman & Nicobar Islands,315148,64.1,202018,1.0,,
1,2,2.0,arunachal pradesh,Arunachal East,375310,83.31,312658,1.0,arunachal pradesh,1385654.0
2,3,3.0,arunachal pradesh,Arunachal West,517384,73.6,380783,1.0,arunachal pradesh,1385654.0
3,4,4.0,assam,Dibrugarh,1659588,76.75,1273744,1.0,assam,17478091.0
4,5,5.0,assam,Jorhat,1727121,79.89,1379749,1.0,assam,17478091.0


Lets have a look of total state spending

In [7]:
# Set the default template
pio.templates.default = "plotly_dark"

# Group and sum the data
state_ad_spend = merged_data.groupby('State', as_index=False)['Amount spent (INR)'].sum()

# Create the bar plot
fig = px.bar(state_ad_spend,
             x='State',
             y='Amount spent (INR)',
             color='Amount spent (INR)',  # Add color gradient
             color_continuous_scale='Viridis',  # Change color scale
             labels={'State': 'Indian State', 'Amount spent (INR)': 'Advertisement Expenditure (INR)'},
             title='Total Advertisement Expenditure by Indian State')

# Update the layout
fig.update_layout(
    xaxis_categoryorder='total descending',
    xaxis_tickangle=-45,  # Changed from -90 to -45 for better readability
    width=1000,  # Increased width
    height=700,  # Increased height
    showlegend=False,  # Hide the legend
    coloraxis_colorbar=dict(title='Expenditure (INR)'),  # Add colorbar title
)

# Add hover data
fig.update_traces(hovertemplate='<b>%{x}</b><br>Expenditure: ₹%{y:,.0f}<extra></extra>')

# Show the figure
fig.show()

The bar graph shows the total ad spend (in INR) by state. Uttar Pradesh leads significantly with the highest ad spend, followed by Maharashtra and Odisha. States like West Bengal, Tamil Nadu, Andhra Pradesh, and Bihar also show substantial ad expenditures. In contrast, states such as Lakshadweep, Dadra & Nagar Haveli, Daman & Diu, Andaman & Nicobar Islands, and Arunachal Pradesh have the lowest ad spend. It indicates that larger and more populous states tend to spend more on ads, likely reflecting their greater political significance and larger voter base.

Now, let’s have a look at the average voter turnout by state:

In [8]:

# Calculate average voter turnout by state
state_voter_turnout = merged_data.groupby('State', as_index=False)['Polled (%)'].mean()

# Sort the data by voter turnout in descending order
state_voter_turnout = state_voter_turnout.sort_values('Polled (%)', ascending=False)

# Create a color scale based on turnout percentages
color_scale = px.colors.sequential.Viridis

# Create the bar plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=state_voter_turnout['State'],
    y=state_voter_turnout['Polled (%)'],
    marker_color=state_voter_turnout['Polled (%)'],
    marker_colorscale=color_scale,
    text=state_voter_turnout['Polled (%)'].round(2),
    textposition='outside',
    hovertemplate='<b>%{x}</b><br>Turnout: %{y:.2f}%<extra></extra>'
))

# Update the layout
fig.update_layout(
    title={
        'text': 'Average Voter Turnout by State in India',
        'font': {'size': 24},
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title='State',
    yaxis_title='Voter Turnout (%)',
    xaxis_tickangle=-45,
    width=1000,
    height=700,
    plot_bgcolor='rgba(0,0,0,0)',
    yaxis=dict(
        tickmode='linear',
        tick0=0,
        dtick=10,
        range=[0, max(state_voter_turnout['Polled (%)']) + 5]
    ),
    coloraxis_colorbar=dict(title='Turnout (%)'),
)

# Add a reference line for national average
national_avg = state_voter_turnout['Polled (%)'].mean()
fig.add_hline(y=national_avg, line_dash="dash", line_color="red",
              annotation_text=f"National Average: {national_avg:.2f}%",
              annotation_position="top right")

fig.show()

Lakshadweep has the highest average voter turnout at nearly 80%, followed closely by Tripura and Assam. States like Andhra Pradesh, Sikkim, and West Bengal also show high voter engagement, with turnouts above 70%. On the other end of the spectrum, states such as Bihar, Uttar Pradesh, and Uttarakhand have the lowest average voter turnout, around 50-55%. It indicates significant regional variations in voter participation, with some smaller states and union territories exhibiting higher engagement compared to larger states with higher ad spend.

Now, let’s have a look at the top 5 parties by ad spend:

In [9]:
# Convert 'Amount spent (INR)' to numeric and handle NaN values
advertisers['Amount spent (INR)'] = pd.to_numeric(advertisers['Amount spent (INR)'], errors='coerce')
advertisers.dropna(subset=['Amount spent (INR)'], inplace=True)

# Group by 'Page name' and sum the amount spent
party_ad_spend = advertisers.groupby('Page name')['Amount spent (INR)'].sum().sort_values(ascending=False)

# Get top 5 parties
top_5_parties = party_ad_spend.head(5).reset_index()

# Calculate percentage for each party
total_spend = top_5_parties['Amount spent (INR)'].sum()
top_5_parties['Percentage'] = top_5_parties['Amount spent (INR)'] / total_spend * 100

# Custom color palette
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']

# Create the pie chart
fig = go.Figure(data=[go.Pie(
    labels=top_5_parties['Page name'],
    values=top_5_parties['Amount spent (INR)'],
    hole=.3,
    marker_colors=colors,
    textinfo='label+percent',
    hoverinfo='label+value+percent',
    textfont_size=12,
    insidetextorientation='radial'
)])

# Update layout
fig.update_layout(
    title={
        'text': 'Top 5 Political Parties by Advertisement Expenditure',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=20)
    },
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.1,
        xanchor="center",
        x=0.5
    ),
    annotations=[
        dict(
            text=f'Total Spend:<br>₹{total_spend:,.0f}',
            x=0.5, y=0.5,
            font_size=14,
            showarrow=False
        )
    ]
)

# Add hover data with formatted currency
fig.update_traces(
    hovertemplate="<b>%{label}</b><br>Amount: ₹%{value:,.0f}<br>Percentage: %{percent:.1f}%<extra></extra>"
)

fig.show()

The Bharatiya Janata Party (BJP) has the highest ad spend, accounting for 42.3% of the total. This is followed by the Ama Chinha Sankha Chinha party at 24.5% and the Indian National Congress at 23.7%. Ellorum Nammudan and BJP Odisha have significantly lower ad spends, at 5.19% and 4.27%, respectively. It indicates that BJP dominates in terms of ad spending on Facebook and Instagram ads, with nearly half of the total expenditure, suggesting a significant investment in advertising compared to other parties.

Now, let’s have a look at the correlation between ad spend and voter turnout:

In [10]:
# calculate the correlation between ad spend and voter turnout
correlation = merged_data[['Amount spent (INR)', 'Polled (%)']].corr()
print(correlation)

                    Amount spent (INR)  Polled (%)
Amount spent (INR)            1.000000   -0.010688
Polled (%)                   -0.010688    1.000000


The correlation matrix shows that the relationship between the amount spent (INR) and the percentage of votes polled (%) is very weak and slightly negative, with a correlation coefficient of -0.010688. This indicates that there is virtually no linear relationship between ad spend and voter turnout. In other words, increasing the amount spent on advertising does not significantly affect the percentage of voter turnout.

Now, let’s have a look at the relationship between ad spend and voter turnout by parliamentary constituency:

In [11]:
from scipy import stats  # We'll use scipy for the trendline calculation

# Merge the data
merged_constituency_data = results.merge(
    locations,
    left_on='State',
    right_on='Location name',
    how='left'
)

# Create a subplot with a scatter plot and marginal distributions
fig = make_subplots(rows=2, cols=2,
                    column_widths=[0.8, 0.2],
                    row_heights=[0.2, 0.8],
                    specs=[[{"type": "histogram"}, None],
                           [{"type": "scatter"}, {"type": "histogram"}]])

# Add the scatter plot
scatter = go.Scatter(
    x=merged_constituency_data['Amount spent (INR)'],
    y=merged_constituency_data['Polled (%)'],
    mode='markers',
    marker=dict(
        size=8,
        color=merged_constituency_data['Amount spent (INR)'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Ad Spend (INR)')
    ),
    text=merged_constituency_data['State'],
    hovertemplate='<b>%{text}</b><br>Ad Spend: ₹%{x:,.0f}<br>Turnout: %{y:.2f}%<extra></extra>'
)
fig.add_trace(scatter, row=2, col=1)

# Add marginal distributions
fig.add_trace(go.Histogram(x=merged_constituency_data['Amount spent (INR)'], name='Ad Spend Dist.'), row=1, col=1)
fig.add_trace(go.Histogram(y=merged_constituency_data['Polled (%)'], name='Turnout Dist.'), row=2, col=2)

# Update layout
fig.update_layout(
    title={
        'text': 'Ad Spend and Voter Turnout by Parliamentary Constituency',
        'font': {'size': 20},
        'x': 0.5,
        'xanchor': 'center'
    },
    width=1000,
    height=800,
    showlegend=False,
    xaxis_title='Ad Spend (INR)',
    yaxis_title='Voter Turnout (%)',
)

# Update axes
fig.update_xaxes(title_text="Ad Spend (INR)", row=2, col=1)
fig.update_yaxes(title_text="Voter Turnout (%)", row=2, col=1)

# Add a trendline using scipy
x = merged_constituency_data['Amount spent (INR)']
y = merged_constituency_data['Polled (%)']
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
line = slope * x + intercept
fig.add_trace(go.Scatter(x=x, y=line, mode='lines', name='Trendline',
                         line=dict(color='red', dash='dash')), row=2, col=1)

# Show the plot
fig.show()

In [12]:
fig = px.histogram(merged_data, x='Amount spent (INR)', nbins=30, marginal='box',
                   labels={'Amount spent (INR)': 'Ad Spend (INR)'},
                   title='Distribution of Ad Spend')

fig.update_traces(marker=dict(line=dict(color='black', width=1)))
fig.update_layout(bargap=0.1, width=800, height=600)

fig.show()

In [13]:
# Create the main histogram figure
fig = px.histogram(merged_data,
                   x='Amount spent (INR)',
                   nbins=30,
                   marginal='box',
                   labels={'Amount spent (INR)': 'Ad Spend (INR)'},
                   title='Distribution of Ad Spend',
                   color_discrete_sequence=['#3498db'])  # Set a custom color

# Update the main histogram
fig.update_traces(marker=dict(line=dict(color='white', width=0.5)))

# Create a subplot to add a KDE plot
fig2 = make_subplots(rows=2, cols=1, row_heights=[0.7, 0.3], vertical_spacing=0.05)

# Add the main histogram to the subplot
for trace in fig.data:
    fig2.add_trace(trace, row=1, col=1)

# Calculate KDE
kde_x = np.linspace(merged_data['Amount spent (INR)'].min(), merged_data['Amount spent (INR)'].max(), 100)
kde_y = stats.gaussian_kde(merged_data['Amount spent (INR)'].dropna())(kde_x)

# Add KDE plot
kde = go.Scatter(x=kde_x,
                 y=kde_y,
                 mode='lines',
                 name='KDE',
                 line=dict(color='#e74c3c', width=2))
fig2.add_trace(kde, row=1, col=1)

# Add summary statistics
summary_stats = merged_data['Amount spent (INR)'].describe()
stats_text = (f"Mean: ₹{summary_stats['mean']:,.0f}<br>"
              f"Median: ₹{summary_stats['50%']:,.0f}<br>"
              f"Std Dev: ₹{summary_stats['std']:,.0f}<br>"
              f"Max: ₹{summary_stats['max']:,.0f}")

fig2.add_annotation(
    text=stats_text,
    xref="paper", yref="paper",
    x=1.02, y=0.5,
    showarrow=False,
    bordercolor="black",
    borderwidth=1,
    borderpad=4,
    bgcolor="white",
    opacity=0.8
)

# Update layout
fig2.update_layout(
    title={
        'text': 'Distribution of Advertisement Expenditure',
        'font': {'size': 24},
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title='Ad Spend (INR)',
    yaxis_title='Frequency',
    bargap=0.1,
    width=1000,
    height=800,
    showlegend=True,
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)

# Update y-axis of the box plot
fig2.update_yaxes(title_text="", row=2, col=1)

# Show the figure
fig2.show()

The histogram indicates that most constituencies have ad spends clustered around the 50M and 100M INR marks, with fewer constituencies spending less than 10M INR or more than 150M INR. The box plot highlights that the median ad spend is around 70M INR, with the interquartile range (IQR) spanning from approximately 30M to 110M INR. There are a few outliers, particularly a constituency with an exceptionally high ad spend above 150M INR. This distribution suggests that while the majority of ad spends are concentrated within a certain range, there are notable exceptions with significantly higher expenditures.

Now, let’s analyze ad spending and voter turnout by election phase:

In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Data preparation
phase_analysis = merged_data.groupby('Phase').agg({
    'Amount spent (INR)': 'sum',
    'Polled (%)': 'mean'
}).reset_index()

# Sort phases numerically
phase_analysis['Phase'] = pd.to_numeric(phase_analysis['Phase'])
phase_analysis = phase_analysis.sort_values('Phase')

# Create subplots
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add bar chart for Ad Spend
fig.add_trace(
    go.Bar(
        x=phase_analysis['Phase'],
        y=phase_analysis['Amount spent (INR)'],
        name='Ad Spend (INR)',
        marker_color='#3498db',
        opacity=0.7,
        hovertemplate='<b>Phase %{x}</b><br>Ad Spend: ₹%{y:,.0f}<extra></extra>'
    ),
    secondary_y=False,
)

# Add line chart for Voter Turnout
fig.add_trace(
    go.Scatter(
        x=phase_analysis['Phase'],
        y=phase_analysis['Polled (%)'],
        name='Voter Turnout (%)',
        marker_color='#e74c3c',
        mode='lines+markers',
        line=dict(width=3),
        marker=dict(size=8),
        hovertemplate='<b>Phase %{x}</b><br>Turnout: %{y:.2f}%<extra></extra>'
    ),
    secondary_y=True,
)

# Update layout
fig.update_layout(
    title={
        'text': 'Advertisement Expenditure and Voter Turnout by Election Phase',
        'font': {'size': 24},
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis=dict(
        title='Election Phase',
        tickmode='linear',
        tick0=1,
        dtick=1,
        tickfont=dict(size=12)
    ),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1
    ),
    hovermode='x unified',
    width=1000,
    height=700,
    template='plotly_white'
)

# Update y-axes
fig.update_yaxes(
    title_text="Ad Spend (INR)",
    secondary_y=False,
    tickformat=',.0f',
    tickprefix='₹',
    showgrid=True
)
fig.update_yaxes(
    title_text="Voter Turnout (%)",
    secondary_y=True,
    tickformat='.1f',
    ticksuffix='%',
    showgrid=False
)

# Show the figure
fig.show()

There is no consistent trend between ad spend and voter turnout. Election phases 1 and 4 have the highest ad spends, with phase 4 peaking in voter turnout at around 70%. However, phase 1, despite high ad spend, has a lower voter turnout of about 67%. Phases with moderate ad spend (e.g., 2 and 6) have lower voter turnout, while phase 5 has a notably low turnout despite moderate spending.

Conclusion:
Overall, the analyses indicate that higher ad spend does not guarantee higher voter turnout and voter engagement is influenced by various other factors. Larger and more significant states tend to spend more on ads, but this does not necessarily translate to higher voter participation. Political parties, particularly the BJP, invest heavily in advertising, yet the effectiveness of this spending in increasing voter turnout is questionable.