**DATA PREPROCESSING**




### Analyzing and cleaning data


In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [3]:
# Load the cleaned province-level workbook into a DataFrame.
# NOTE(review): "/content/..." looks like a Colab upload location — adjust the
# path when running this notebook elsewhere.
DATA_PATH = "/content/province_cleaned_final.xlsx"
df = pd.read_excel(DATA_PATH)

In [4]:
# Remove the "Unnamed: 0" column (presumably an index saved by an earlier export).
# Rebinding `df` with errors="ignore" instead of using inplace=True keeps this
# cell idempotent: re-running it is a no-op rather than a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()


Unnamed: 0,Year,Islamabad,Punjab,Sindh,KPK,Balochistan,Azad Kashmir,Northern Areas,Tribal Areas,Total
0,1981,23,75506,22800,38485,1755,12094,4,2414,153081
1,1982,65,71401,22571,29697,1461,8945,6,3389,137535
2,1983,245,67731,18997,18035,971,9822,7,4223,120031
3,1984,82,48029,17202,16147,1067,8321,1,2691,93540
4,1985,215,40643,12259,17725,724,5011,18,5738,82333


In [6]:
# Per the original note, a null row at index 45 was removed in an earlier pass
# with df.drop(45, inplace=True, axis=0); that call is kept here for reference only.
# Preview the last rows to check the most recent years.
df.tail()

Unnamed: 0,Year,Islamabad,Punjab,Sindh,KPK,Balochistan,Azad Kashmir,Northern Areas,Tribal Areas,Total
40,2021,2285,157699,21501,76484,2473,10705,1095,16038,288280
41,2022,6554,460302,59337,225272,8018,29512,1180,42164,832339
42,2023,10359,489301,72382,210150,8369,33904,1551,36609,862625
43,2024,8621,404345,60424,187103,5668,29591,1692,29937,727381
44,2025,1378,59502,9112,32325,859,4676,291,5446,113589


Data is sorted chronologically.

In [7]:
# List the column labels that remain after dropping "Unnamed: 0".
df.columns

Index(['Year', 'Islamabad', 'Punjab', 'Sindh', 'KPK', 'Balochistan',
       'Azad Kashmir', 'Northern Areas', 'Tribal Areas', 'Total'],
      dtype='object')

In [8]:
# Summarise non-null counts, dtypes and memory usage for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Year            45 non-null     int64
 1   Islamabad       45 non-null     int64
 2   Punjab          45 non-null     int64
 3   Sindh           45 non-null     int64
 4   KPK             45 non-null     int64
 5   Balochistan     45 non-null     int64
 6   Azad Kashmir    45 non-null     int64
 7   Northern Areas  45 non-null     int64
 8   Tribal Areas    45 non-null     int64
 9   Total           45 non-null     int64
dtypes: int64(10)
memory usage: 3.6 KB


In [9]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,Year,Islamabad,Punjab,Sindh,KPK,Balochistan,Azad Kashmir,Northern Areas,Tribal Areas,Total
count,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0
mean,2003.0,2443.244444,157015.511111,27906.644444,77326.422222,3167.155556,17797.333333,668.622222,14938.511111,301263.444444
std,13.133926,3072.930302,139388.552206,25956.7838,66127.384538,2392.587046,14823.350452,943.846465,12461.09805,260853.963265
min,1981.0,23.0,28362.0,6373.0,14575.0,389.0,3573.0,1.0,2414.0,58002.0
25%,1992.0,536.0,59769.0,10607.0,27695.0,1441.0,7691.0,19.0,5655.0,113781.0
50%,2003.0,933.0,81193.0,15461.0,42946.0,2127.0,10352.0,256.0,9030.0,154529.0
75%,2014.0,2471.0,199692.0,40171.0,114633.0,4528.0,30158.0,780.0,21831.0,430314.0
max,2025.0,10359.0,489301.0,116935.0,225272.0,9293.0,64586.0,3417.0,45798.0,946571.0


In [10]:
# Show the dtype of each column.
df.dtypes

Unnamed: 0,0
Year,int64
Islamabad,int64
Punjab,int64
Sindh,int64
KPK,int64
Balochistan,int64
Azad Kashmir,int64
Northern Areas,int64
Tribal Areas,int64
Total,int64


**Checking for null values.**

In [11]:
# Count missing entries per column; all zeros means there are no gaps to fill.
df.isna().sum()

Unnamed: 0,0
Year,0
Islamabad,0
Punjab,0
Sindh,0
KPK,0
Balochistan,0
Azad Kashmir,0
Northern Areas,0
Tribal Areas,0
Total,0


In [12]:
# Shape of the dataset as (rows, columns).
df.shape

(45, 10)

Up to this point, the DataFrame is already clean: there are no null values and every column is numeric.

Data cleaning and initial data analysis are complete at this stage.

## **Data Visualization:**

In [13]:

# Region columns compared in the charts (every column except 'Year' and 'Total').
region_cols = ['Islamabad', 'Punjab', 'Sindh', 'KPK', 'Balochistan',
               'Azad Kashmir', 'Northern Areas', 'Tribal Areas']

# Regional view of the data: drop the aggregate 'Total' column.
df_regions = df.drop(columns=['Total'])

# Long format (one row per Year/Region pair) so plotly express can colour by
# region; df_melted is reused by the plotting cells further down.
df_melted = df_regions.melt(id_vars='Year', value_vars=region_cols,
                            var_name='Region', value_name='Population')

# Build the year span from the data so the title always matches what is plotted
# (the Year column runs from 1981 to 2025, not 1981 to 1985).
year_span = f"{df['Year'].min()}–{df['Year'].max()}"

# 1. Scatter plot: one marker per region per year.
fig1 = px.scatter(df_melted, x='Year', y='Population', color='Region',
                  title=f'Population Trends by Region in Pakistan ({year_span})',
                  labels={'Population': 'Population Count'},
                  template='plotly_dark')
fig1.update_layout(title_x=0.5)
fig1.show()


In [14]:
# 2. Pie chart of regional shares with a dropdown to switch between years.
# `go` (plotly.graph_objects) is already imported in the notebook's import cell,
# so it is not re-imported here.

# Region columns shown as pie slices (every column except 'Year' and 'Total').
region_cols = ['Islamabad', 'Punjab', 'Sindh', 'KPK', 'Balochistan',
               'Azad Kashmir', 'Northern Areas', 'Tribal Areas']
title_template = 'Regional Population Shares in Pakistan ({})'

# One pie trace and one dropdown button per distinct year.
years = df['Year'].unique()

data = []
buttons = []

for i, year in enumerate(years):
    # Regional values for this year, taken from the first matching row.
    year_values = df.loc[df['Year'] == year, region_cols].iloc[0]

    # Only the first year's trace is visible until a button is pressed.
    data.append(go.Pie(labels=region_cols,
                       values=year_values.to_numpy(),
                       name=str(year),
                       visible=(i == 0)))

    # The button shows exactly trace i and updates the title to match.
    buttons.append(dict(label=str(year),
                        method='update',
                        args=[{'visible': [j == i for j in range(len(years))]},
                              {'title': title_template.format(year)}]))

# Take the initial title from the first year so it cannot drift from the trace
# that is initially visible; fall back to 1981 if the frame is empty.
initial_year = years[0] if len(years) else 1981

# Layout with the dropdown centred above the chart.
layout = go.Layout(
    title=title_template.format(initial_year),
    template='plotly_dark',
    updatemenus=[dict(
        buttons=buttons,
        direction='down',
        showactive=True,
        x=0.5,           # centre horizontally
        xanchor='center',
        y=1.11,          # just above the plotting area
        yanchor='top'
    )]
)

fig = go.Figure(data=data, layout=layout)
fig.update_layout(title_x=0.5)
fig.show()


In [15]:

# 3. Normalized stacked bar: each region's percentage of the yearly 'Total'.
share_cols = ['Islamabad', 'Punjab', 'Sindh', 'KPK', 'Balochistan',
              'Azad Kashmir', 'Northern Areas', 'Tribal Areas']

# Replace every regional count with its percentage of that row's Total;
# assign() returns a new frame, so df_regions itself is left untouched.
df_normalized = df_regions.assign(
    **{region: df_regions[region] / df['Total'] * 100 for region in share_cols}
)

# Long format: one row per (Year, Region) with the percentage as the value.
df_normalized_melted = df_normalized.melt(
    id_vars='Year',
    value_vars=share_cols,
    var_name='Region',
    value_name='Percentage',
)

fig3 = px.bar(
    df_normalized_melted,
    x='Year',
    y='Percentage',
    color='Region',
    barmode='stack',
    title='Normalized Regional Contributions to Total Population (1981–2025)',
    labels={'Percentage': 'Percentage of Total Population (%)'},
    template='plotly_dark',
)
fig3.update_layout(title_x=0.5)
fig3.show()


In [16]:

# 4. Area chart on a logarithmic y-axis, intended to make the smaller regions visible.
# NOTE(review): px.area stacks the regions, so on a log axis the band heights
# are not directly comparable — confirm this is the intended reading.
log_area_title = 'Population Trends by Region in Pakistan (Log Scale, 1981–2025)'
fig4 = px.area(
    df_melted,
    x='Year',
    y='Population',
    color='Region',
    title=log_area_title,
    labels={'Population': 'Population Count (Log Scale)'},
    template='plotly_dark',
)
# Switch the y-axis to log scale and centre the title.
fig4.update_layout(title_x=0.5, yaxis_type='log')
fig4.show()


In [17]:

# 5. Violin plot: spread of the yearly values within each region.
violin_title = 'Population Distribution by Region in Pakistan (1981–2025)'
fig5 = px.violin(
    df_melted,
    x='Region',
    y='Population',
    title=violin_title,
    labels={'Population': 'Population Count'},
    template='plotly_dark',
)
fig5.update_layout(title_x=0.5)  # centre the title
fig5.show()


In [18]:

# 6. Line plot: one line per region showing its trend over the years.
line_title = 'Population Trends by Region in Pakistan (1981–2025)'
fig6 = px.line(
    df_melted,
    x='Year',
    y='Population',
    color='Region',
    title=line_title,
    labels={'Population': 'Population Count'},
    template='plotly_dark',
)
fig6.update_layout(title_x=0.5)  # centre the title
fig6.show()


In [19]:

# 7. Stacked area plot: regional values layered on top of each other per year.
stacked_area_title = 'Stacked Area: Regional Population Contributions in Pakistan (1981–2025)'
fig7 = px.area(
    df_melted,
    x='Year',
    y='Population',
    color='Region',
    title=stacked_area_title,
    labels={'Population': 'Population Count'},
    template='plotly_dark',
)
fig7.update_layout(title_x=0.5)  # centre the title
fig7.show()


In [20]:

# 8. Grouped bar plot: regions side by side within each year.
grouped_bar_options = dict(
    x='Year',
    y='Population',
    color='Region',
    barmode='group',
    title='Population Comparison by Region and Year in Pakistan (1981–2025)',
    labels={'Population': 'Population Count'},
    template='plotly_dark',
)
fig8 = px.bar(df_melted, **grouped_bar_options)
fig8.update_layout(title_x=0.5)  # centre the title
fig8.show()


In [21]:

# 9. Box plot: quartiles and outliers of the yearly values for each region.
box_options = dict(
    x='Region',
    y='Population',
    title='Population Distribution by Region in Pakistan (1981–2025)',
    labels={'Population': 'Population Count'},
    template='plotly_dark',
)
fig9 = px.box(df_melted, **box_options)
fig9.update_layout(title_x=0.5)  # centre the title
fig9.show()


In [22]:

# 10. Heatmap: regions on the y-axis, years on the x-axis, colour = value.
heatmap_cols = ['Islamabad', 'Punjab', 'Sindh', 'KPK', 'Balochistan',
                'Azad Kashmir', 'Northern Areas', 'Tribal Areas']

# Index by Year and keep only the regional columns (rows = years here).
df_heatmap = df.set_index('Year')[heatmap_cols]

# Transpose so each region becomes a row of the image.
region_by_year = df_heatmap.T
fig10 = px.imshow(
    region_by_year,
    title='Heatmap of Population by Region and Year in Pakistan (1981–2025)',
    labels={'color': 'Population Count'},
    template='plotly_dark',
)
fig10.update_layout(xaxis_title='Year', yaxis_title='Region', title_x=0.5)
fig10.show()

## **Modelling**

#### **Model Development**

In [23]:
# Monte Carlo forecast of the 'Total' series.
#
# Model: the growth rate of each future year is drawn independently from a
# normal distribution fitted to the historical year-over-year percentage
# changes; every simulated path compounds those rates from the last known value.

# Fixed seed so a fresh "Restart & Run All" reproduces the same forecast.
RANDOM_SEED = 42

# Yearly totals indexed by year.
df_mc = df[['Year', 'Total']].set_index('Year')

# Year-over-year percentage change; the first row is NaN and is skipped by
# mean()/std().
df_mc['Pct_Change'] = df_mc['Total'].pct_change()
historical_mean = df_mc['Pct_Change'].mean()
historical_std = df_mc['Pct_Change'].std()

# Monte Carlo parameters
num_simulations = 1000
num_years = 5
last_known_year = int(df_mc.index[-1])
# NOTE(review): the 2025 total in df.tail() is far below 2024; if 2025 is a
# partial year, it skews both the growth statistics above and this starting
# value — confirm before relying on the forecast.
last_known_value = df_mc['Total'].iloc[-1]

# Draw all growth rates at once: one row per simulation, one column per year.
rng = np.random.default_rng(RANDOM_SEED)
growth_rates = rng.normal(historical_mean, historical_std,
                          size=(num_simulations, num_years))
# A count cannot shrink by more than 100%. Without this floor, a draw below -1
# would make the path negative and flip its sign in later years.
growth_rates = np.clip(growth_rates, -1.0, None)

# Compound the growth factors along each path; shape (num_simulations, num_years).
simulated_data = last_known_value * np.cumprod(1.0 + growth_rates, axis=1)

# Per-year summary across simulations: mean and the central 95% interval.
forecast_years = list(range(last_known_year + 1, last_known_year + 1 + num_years))
mean_forecast = simulated_data.mean(axis=0)
lower_bound = np.percentile(simulated_data, 2.5, axis=0)
upper_bound = np.percentile(simulated_data, 97.5, axis=0)

# Tidy frame used for plotting.
df_forecast = pd.DataFrame({
    'Year': forecast_years,
    'Mean Forecast': mean_forecast,
    'Lower Bound (95%)': lower_bound,
    'Upper Bound (95%)': upper_bound
})

# Plot the mean forecast. The label mapping is keyed on the plotted column
# ('Mean Forecast') so the y-axis title is actually applied.
forecast_title = (f'Monte Carlo Forecast: Pakistan Total Population '
                  f'({forecast_years[0]}–{forecast_years[-1]})')
fig = px.line(df_forecast, x='Year', y='Mean Forecast', title=forecast_title,
              labels={'Mean Forecast': 'Population Count', 'Year': 'Year'},
              template='plotly_dark')
# Give the mean line a legend entry like the other traces. This runs before the
# other traces are added, so it only affects the mean line.
fig.update_traces(name='Mean Forecast', showlegend=True)

# 95% interval bounds
fig.add_scatter(x=forecast_years, y=lower_bound, mode='lines', name='Lower Bound (95%)',
                line=dict(dash='dot', color='gray'))
fig.add_scatter(x=forecast_years, y=upper_bound, mode='lines', name='Upper Bound (95%)',
                line=dict(dash='dot', color='gray'))

# Add historical actuals for context.
df_actual = df_mc.reset_index()
fig.add_scatter(x=df_actual['Year'], y=df_actual['Total'], mode='lines+markers', name='Actual')

fig.update_layout(title_x=0.5)
fig.show()