In [101]:
import pandas as pd
import numpy as np

In [102]:
parliament = pd.read_csv('women_representation_1997_2025.csv')
parliament.head()

Unnamed: 0,Year,Month,Country,Country Code,Region,Subregion,Lower Total Seat,Lower Total Women,Lower Chamber Women (%),Upper Total Seat,Upper Total Women,Upper Chamber Women (%),Rank
0,1997,1,Sweden,SE,europe,nordic_countries,349.0,141.0,40.4,,,,1.0
1,1997,1,Norway,NO,europe,nordic_countries,165.0,65.0,39.4,,,,2.0
2,1997,1,Denmark,DK,europe,nordic_countries,179.0,60.0,33.5,,,,3.0
3,1997,1,Finland,FI,europe,nordic_countries,200.0,67.0,33.5,,,,3.0
4,1997,1,Netherlands,NL,europe,western_europe,150.0,49.0,32.7,75.0,17.0,22.7,5.0


In [103]:
parliament.shape

(65234, 13)

In [104]:
parliament_meta = pd.read_csv('ipu_wip_meta_data_1997_2025.csv')
parliament_meta.head()

  parliament_meta = pd.read_csv('ipu_wip_meta_data_1997_2025.csv')


Unnamed: 0,Year,Month,Country,Suspended Parliament,Structure of Parliament,Region,IPU Membership,Population in thousands,Men Number,Women Number,Women Percent,Political System,subregion,Transitional Status,gender_equality,themes,gender_quota
0,1997,1,Afghanistan,False,bicameral,asia,False,,,,,,south_asia,False,yes,,No
1,1997,1,Afghanistan,False,bicameral,asia,False,,,,,,south_asia,False,yes,,No
2,1997,1,Afghanistan,False,bicameral,asia,False,,,,,,south_asia,False,yes,,No
3,1997,1,Afghanistan,False,bicameral,asia,False,,,,,,south_asia,False,yes,,No
4,1997,1,Afghanistan,False,bicameral,asia,False,,,,,,south_asia,False,yes,,No


In [105]:
list1 = parliament['Country'].unique()
list2 = parliament_meta['Country'].unique()
missing_lsit1 = set(list1) - set(list2)
missing_lsit2 = set(list2) - set(list1)

print(missing_lsit1)
print(missing_lsit2)

{'Brunei Darussalam', 'Antigua and Barbuda', 'Yemen', 'Venezuela (Bolivarian Republic of)', 'Saint Kitts and Nevis', 'Guinea-Bissau', 'Saint Lucia', 'Eritrea'}
{'Swaziland', 'The former Yugoslav Republic of Macedonia', 'Czech Republic', 'Turkey'}


In [106]:
country_map = {
    'Swaziland' : 'Switzerland',
    'The former Yugoslav Republic of Macedonia' : 'North Macedonia',
    'Czech Republic': 'Czechia',
}
parliament_meta['Country'].replace(country_map, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  parliament_meta['Country'].replace(country_map, inplace=True)


In [107]:
parliament_merged = parliament.merge(parliament_meta, on=['Year','Month','Country'],how='left', suffixes=("","_meta"))
parliament_merged.shape

(423564, 27)

In [108]:
# to ensure you keep the most informative row when duplicates exist
parliament_merged['non_null_count'] = parliament_merged.notnull().sum(axis=1)
parliament_merged = parliament_merged.sort_values(by=['Year', 'Month', 'Country', 'non_null_count'], ascending=[True, True, True, False])
parliament_merged = parliament_merged.drop_duplicates(subset=['Year', 'Month', 'Country'], keep='first')
parliament_merged = parliament_merged.drop(columns=['non_null_count'])  # Remove temp column

In [109]:
parliament_merged.shape

(65234, 27)

In [146]:
def fillna_with_mode(columns_to_fill, merged_df):
    
    # Step 1: Group data by Year and Country to identify fully missing years
    missing_years = (
        merged_df.groupby(["Year", "Country"])[columns_to_fill]
        .apply(lambda group: group.isna().all(axis=0).any())  # True if any column is fully NaN for the year and country
        .reset_index(name="Whole_Year_Missing")
    )

    # Step 2: Fill partial missing values for the same year and country
    for col in columns_to_fill:
        merged_df[col] = (
            merged_df.groupby(["Year", "Country"])[col]
            .transform(lambda group: group.fillna(group.mode().iloc[0] if not group.mode().empty else np.nan))
        )

    # Step 3: Fill completely missing years with mode for the country (ignoring year)
    for index, row in missing_years.iterrows():
        year, country, whole_year_missing = row["Year"], row["Country"], row["Whole_Year_Missing"]
        
        if whole_year_missing:
            # Get mode for the country across all years
            country_mode_df = merged_df[merged_df["Country"] == country][columns_to_fill].mode()

            if not country_mode_df.empty:  # Check if mode exists
                country_mode = country_mode_df.iloc[0]  # First mode if multiple
                
                # Fill missing values for the entire year
                merged_df.loc[
                    (merged_df["Year"] == year) & (merged_df["Country"] == country),
                    columns_to_fill
                ] = merged_df.loc[
                    (merged_df["Year"] == year) & (merged_df["Country"] == country),
                    columns_to_fill
                ].fillna(country_mode)

    return merged_df

In [121]:
parliament_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65234 entries, 996 to 422814
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Year                     65234 non-null  int64  
 1   Month                    65234 non-null  int64  
 2   Country                  65234 non-null  object 
 3   Country Code             64896 non-null  object 
 4   Region                   65234 non-null  object 
 5   Subregion                65234 non-null  object 
 6   Lower Total Seat         63567 non-null  float64
 7   Lower Total Women        63298 non-null  float64
 8   Lower Chamber Women (%)  63298 non-null  float64
 9   Upper Total Seat         24381 non-null  float64
 10  Upper Total Women        24373 non-null  float64
 11  Upper Chamber Women (%)  24373 non-null  float64
 12  Rank                     64555 non-null  float64
 13  Suspended Parliament     61146 non-null  object 
 14  Structure of Parliament 

In [122]:
#fill the missing value for lower chamber data for both unicameral and bicameral since all countries should have lower data
columns_to_fill = ['Lower Total Seat','Lower Total Women', 'Lower Chamber Women (%)']
parliament_merged = fillna_with_mode(columns_to_fill=columns_to_fill, merged_df=parliament_merged)

# Check the number of missing values for each column after filling
for col in columns_to_fill:
    print(f"Missing values in '{col}': {parliament_merged[col].isna().sum()}")

Missing values in 'Lower Total Seat': 0
Missing values in 'Lower Total Women': 0
Missing values in 'Lower Chamber Women (%)': 0


In [123]:
# Filter bicameral countries with missing Upper Chamber Women (%) data
bicameral_missing = parliament_merged[
    (parliament_merged['Structure of Parliament'] == 'bicameral') &
    (parliament_merged['Upper Chamber Women (%)'].isna())]
bicameral_upper_missing_data= bicameral_missing['Country'].unique()


# Filter bicameral countries with missing Upper Chamber Women (%) data
bicameral_missing_lower = parliament_merged[
    (parliament_merged['Structure of Parliament'] == 'bicameral') &
    (parliament_merged['Lower Chamber Women (%)'].isna())]
bicamerl_lower_missing_data = bicameral_missing_lower['Country'].unique()

# Filter unicameral countries with missing Upper Chamber Women (%) data
unicameral_missing_lower = parliament_merged[
    (parliament_merged['Structure of Parliament'] == 'unicameral') &
    (parliament_merged['Lower Chamber Women (%)'].isna())]
unicameral_lower_missing_data = unicameral_missing_lower['Country'].unique()


In [124]:
"""
For unicameral country ,there should not have upper data, so fill with 0
0  represents "no upper chamber" rather than true missingness

For bicameral country, fill missing upper data with respective year,month data 
"""
# Step 1: Filter merged_df to only include rows for the bicameral countries in the list
filtered_df = parliament_merged[parliament_merged['Country'].isin(bicameral_upper_missing_data)]

# Step 2: Apply the fillna_with_mode function for the desired columns
columns_to_fill = ['Upper Total Seat','Upper Total Women','Upper Chamber Women (%)']

# Apply the function to the filtered dataframe
filtered_df = fillna_with_mode(columns_to_fill=columns_to_fill, merged_df=filtered_df)

# Step 3: Update the original merged_df with the filled data from filtered_df
parliament_merged.update(filtered_df)

# Verify the missing values are filled
for col in columns_to_fill:
    print(f"Missing values in '{col}': {parliament_merged[col].isna().sum()}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[col] = (


Missing values in 'Upper Total Seat': 36894
Missing values in 'Upper Total Women': 36894
Missing values in 'Upper Chamber Women (%)': 36894


In [125]:
parliament_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65234 entries, 996 to 422814
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Year                     65234 non-null  int64  
 1   Month                    65234 non-null  int64  
 2   Country                  65234 non-null  object 
 3   Country Code             64896 non-null  object 
 4   Region                   65234 non-null  object 
 5   Subregion                65234 non-null  object 
 6   Lower Total Seat         65234 non-null  float64
 7   Lower Total Women        65234 non-null  float64
 8   Lower Chamber Women (%)  65234 non-null  float64
 9   Upper Total Seat         28340 non-null  float64
 10  Upper Total Women        28340 non-null  float64
 11  Upper Chamber Women (%)  28340 non-null  float64
 12  Rank                     64555 non-null  float64
 13  Suspended Parliament     61146 non-null  object 
 14  Structure of Parliament 

In [149]:
meta_col_to_fill = ["Political System","Structure of Parliament","gender_equality","themes","gender_quota"]

for col in meta_col_to_fill:
    parliament_merged = fillna_with_mode(columns_to_fill=[col], merged_df=parliament_merged)
    #print(parliament_merged["Political System"].isna().sum())  # Should return 0 if all missing values are filled

  .transform(lambda group: group.fillna(group.mode().iloc[0] if not group.mode().empty else np.nan))
  .transform(lambda group: group.fillna(group.mode().iloc[0] if not group.mode().empty else np.nan))
  .transform(lambda group: group.fillna(group.mode().iloc[0] if not group.mode().empty else np.nan))
  .transform(lambda group: group.fillna(group.mode().iloc[0] if not group.mode().empty else np.nan))
  .transform(lambda group: group.fillna(group.mode().iloc[0] if not group.mode().empty else np.nan))


In [151]:
missing_par_struct = parliament_merged[parliament_merged["Structure of Parliament"].isna()][['Country']]
print(missing_par_struct['Country'].unique())

['Antigua and Barbuda' 'Brunei Darussalam' 'Eritrea' 'Guinea-Bissau'
 'Saint Kitts and Nevis' 'Saint Lucia'
 'Venezuela (Bolivarian Republic of)' 'Yemen']


In [152]:
parliament_merged = parliament_merged.drop(columns=['Population in thousands','Men Number','Women Number','Women Percent'])

In [154]:
wip = parliament_merged.copy()

In [155]:
pvtw = pd.read_csv('ACLED_DATA.csv')
pvtw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78112 entries, 0 to 78111
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   event_id_cnty       78112 non-null  object 
 1   event_date          78112 non-null  object 
 2   year                78112 non-null  int64  
 3   time_precision      78112 non-null  int64  
 4   disorder_type       78112 non-null  object 
 5   event_type          78112 non-null  object 
 6   sub_event_type      78112 non-null  object 
 7   actor1              78112 non-null  object 
 8   assoc_actor_1       57339 non-null  object 
 9   inter1              78112 non-null  object 
 10  actor2              28030 non-null  object 
 11  assoc_actor_2       24830 non-null  object 
 12  inter2              28030 non-null  object 
 13  interaction         78112 non-null  object 
 14  civilian_targeting  22229 non-null  object 
 15  iso                 78112 non-null  int64  
 16  regi

In [156]:
total_country_wip = len(wip['Country'].value_counts().index)
total_country_pvtw = len(pvtw['country'].value_counts().index)
print(f"total number of countries in women in parliament: {total_country_wip}")
print(f"total number of countries in PVTW: {total_country_pvtw}")

total number of countries in women in parliament: 193
total number of countries in PVTW: 194


In [157]:
# Standardize country names to title case
wip['Country'] = wip['Country'].str.title()
pvtw['country'] = pvtw['country'].str.title()

# Get unique country names from both datasets
countries_wip = set(wip['Country'].unique())
countries_pvtw = set(pvtw['country'].unique())

# Find differences
only_in_wip = countries_wip - countries_pvtw
only_in_pvtw = countries_pvtw - countries_wip

# Print results
print("total missing countries  in WIP but not in PVTW", len(only_in_wip))
print("Countries in WIP but not in PVTW:", only_in_wip)
print("total missing countries  in PVTW but not in WIP", len(only_in_pvtw))
print("Countries in PVTW but not in WIP:", only_in_pvtw)

total missing countries  in WIP but not in PVTW 37
Countries in WIP but not in PVTW: {'Democratic Republic Of The Congo', 'Bolivia (Plurinational State Of)', 'Kiribati', 'United States Of America', 'Tuvalu', 'United Republic Of Tanzania', 'Seychelles', 'Saint Lucia', 'Saint Kitts And Nevis', 'Congo', 'Monaco', 'Cabo Verde', 'Marshall Islands', 'Micronesia (Federated States Of)', "Côte D'Ivoire", 'Tonga', 'Gambia (The)', 'Republic Of Korea', 'Viet Nam', 'Bhutan', 'Venezuela (Bolivarian Republic Of)', 'Russian Federation', 'Grenada', "Democratic People'S Republic Of Korea", 'Iran (Islamic Republic Of)', "Lao People'S Democratic Republic", 'Nauru', 'Brunei Darussalam', 'Türkiye', 'Saint Vincent And The Grenadines', 'Palau', 'Czechia', 'United Arab Emirates', 'Syrian Arab Republic', 'Oman', 'Timor-Leste', 'Republic Of Moldova'}
total missing countries  in PVTW but not in WIP 38
Countries in PVTW but not in WIP: {'Mayotte', 'Gambia', 'New Caledonia', 'French Polynesia', 'Sint Maarten', 'Pal

In [158]:
# Mapping of WIP country names to PVTW names (shorter versions)
# Standardize the countries name
country_mapping = {
    'Viet Nam': 'Vietnam',
    'Türkiye': 'Turkey',
    'Venezuela (Bolivarian Republic Of)': 'Venezuela',
    'Syrian Arab Republic': 'Syria',
    'Saint Kitts And Nevis': 'Saint Kitts and Nevis',
    'Democratic Republic Of The Congo': 'Democratic Republic Of Congo',
    'Timor-Leste': 'East Timor',
    'Saint Lucia': 'Saint Lucia',
    'Congo': 'Republic Of Congo',
    'Tuvalu': 'Tuvalu',
    'Seychelles': 'Seychelles',
    'Russian Federation': 'Russia',
    'United Arab Emirates': 'United Arab Emirates',
    "Democratic People'S Republic Of Korea": 'North Korea',
    'Gambia (The)': 'Gambia',
    "Côte D'Ivoire": 'Ivory Coast',
    'Bolivia (Plurinational State Of)': 'Bolivia',
    'Republic Of Moldova': 'Moldova',
    'Grenada': 'Grenada',
    'Iran (Islamic Republic Of)': 'Iran',
    'Republic Of Korea': 'South Korea',
    'Oman': 'Oman',
    'Tonga': 'Tonga',
    'Marshall Islands': 'Marshall Islands',
    'Nauru': 'Nauru',
    'Saint Vincent And The Grenadines': 'Saint Vincent and the Grenadines',
    'United Republic Of Tanzania': 'Tanzania',
    "Lao People'S Democratic Republic": 'Laos',
    'Bhutan': 'Bhutan',
    'Monaco': 'Monaco',
    'United States Of America': 'United States',
    'Cabo Verde': 'Cape Verde',
    'Micronesia (Federated States Of)': 'Micronesia',
    'Palau': 'Palau',
    'Czechia': 'Czech Republic',
    'Brunei Darussalam': 'Brunei'
}

# Apply renaming in WIP dataset
wip['Country'].replace(country_mapping, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  wip['Country'].replace(country_mapping, inplace=True)


In [159]:
# Recheck differences after renaming
countries_wip = set(wip['Country'].unique())
countries_pvtw = set(pvtw['country'].unique())

# Find remaining differences
only_in_wip = countries_wip - countries_pvtw
only_in_pvtw = countries_pvtw - countries_wip

# Print results
print(len(only_in_wip))
print("Remaining Countries in WIP but not in PVTW:", only_in_wip)
print(len(only_in_pvtw))
print("Remaining Countries in PVTW but not in WIP:", only_in_pvtw)

17
Remaining Countries in WIP but not in PVTW: {'Kiribati', 'Monaco', 'Nauru', 'Marshall Islands', 'Bhutan', 'Brunei', 'Saint Vincent and the Grenadines', 'Palau', 'Grenada', 'Tuvalu', 'Saint Kitts and Nevis', 'United Arab Emirates', 'Seychelles', 'Saint Lucia', 'Tonga', 'Micronesia', 'Oman'}
18
Remaining Countries in PVTW but not in WIP: {'Caribbean Netherlands', 'Guam', 'Guadeloupe', 'Mayotte', 'Martinique', 'Curacao', 'French Guiana', 'French Polynesia', 'New Caledonia', 'Puerto Rico', 'Sint Maarten', 'Palestine', 'Bailiwick Of Guernsey', 'Saint-Martin', 'Kosovo', 'Greenland', 'Reunion', 'Taiwan'}


✅ Countries in WIP but not in PVTW (18 total)
These are mostly small island nations and microstates:

Seychelles, Kiribati, United Arab Emirates, Saint Kitts and Nevis, Monaco, Oman, Micronesia, Tonga, Marshall Islands, Brunei, Nauru, Bhutan, Palau, Saint Lucia, Grenada, Tuvalu, Saint Vincent and the Grenadines
🔹 Possible Explanation:
These countries may not have recorded violent incidents in PVTW.

✅ Countries in PVTW but not in WIP (18 total)
These are mostly territories, regions, and disputed areas:

Reunion, Bailiwick Of Guernsey, Guam, Taiwan, Sint Maarten, French Polynesia, Martinique, Palestine, New Caledonia, Kosovo, French Guiana, Mayotte, Puerto Rico, Greenland, Caribbean Netherlands, Guadeloupe, Curacao, Saint-Martin
🔹 Possible Explanation:
WIP might only include recognized sovereign states, whereas PVTW includes territories and disputed regions.

In [160]:
pvtw['Month'] = pd.to_datetime(pvtw['event_date']).dt.month
pvtw['Month'].head()

0    11
1    11
2    11
3    11
4    11
Name: Month, dtype: int32

In [178]:
pvtw.columns

Index(['event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp', 'Month'],
      dtype='object')

In [161]:
#Extract parliament structured data from 2019-2024
parliment_structure = pd.read_csv('cleaned_women_parliament_data.csv')

In [162]:
# Apply renaming in WIP dataset
parliment_structure['Country'].replace(country_mapping, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  parliment_structure['Country'].replace(country_mapping, inplace=True)


In [177]:
wip.columns

Index(['Year', 'Month', 'Country', 'Country Code', 'Region', 'Subregion',
       'Lower Total Seat', 'Lower Total Women', 'Lower Chamber Women (%)',
       'Upper Total Seat', 'Upper Total Women', 'Upper Chamber Women (%)',
       'Rank', 'Suspended Parliament', 'Structure of Parliament',
       'Region_meta', 'IPU Membership', 'Political System', 'subregion',
       'Transitional Status', 'gender_equality', 'themes', 'gender_quota'],
      dtype='object')

In [179]:
pvtw_merged = pvtw.merge(wip, left_on=['country','year','Month'], right_on=['Country','Year','Month'], how='left')
pvtw_merged.shape

(78112, 54)

In [180]:
pvtw_merged.columns

Index(['event_id_cnty', 'event_date', 'year', 'time_precision',
       'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'iso', 'region', 'country',
       'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude',
       'geo_precision', 'source', 'source_scale', 'notes', 'fatalities',
       'tags', 'timestamp', 'Month', 'Year', 'Country', 'Country Code',
       'Region', 'Subregion', 'Lower Total Seat', 'Lower Total Women',
       'Lower Chamber Women (%)', 'Upper Total Seat', 'Upper Total Women',
       'Upper Chamber Women (%)', 'Rank', 'Suspended Parliament',
       'Structure of Parliament', 'Region_meta', 'IPU Membership',
       'Political System', 'subregion', 'Transitional Status',
       'gender_equality', 'themes', 'gender_quota'],
      dtype='object')

In [181]:
col_to_check = ['Region_meta',
       'Subregion', 'Lower Total Seat', 'Lower Total Women',
       'Lower Chamber Women (%)', 'Upper Total Seat', 'Upper Total Women',
       'Upper Chamber Women (%)', 'Rank', 'Suspended Parliament',
       'Structure of Parliament', 'IPU Membership', 
       'Political System', 'Transitional Status', 'gender_equality', 'themes',
       'gender_quota']
pvtw_merged[col_to_check].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78112 entries, 0 to 78111
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Region_meta              73451 non-null  object 
 1   Subregion                77356 non-null  object 
 2   Lower Total Seat         77356 non-null  float64
 3   Lower Total Women        77356 non-null  float64
 4   Lower Chamber Women (%)  77356 non-null  float64
 5   Upper Total Seat         57507 non-null  float64
 6   Upper Total Women        57507 non-null  float64
 7   Upper Chamber Women (%)  57507 non-null  float64
 8   Rank                     74986 non-null  float64
 9   Suspended Parliament     73451 non-null  object 
 10  Structure of Parliament  75466 non-null  object 
 11  IPU Membership           73442 non-null  object 
 12  Political System         75466 non-null  object 
 13  Transitional Status      73429 non-null  object 
 14  gender_equality       

In [171]:
# # Define column groups
# cols_to_fill_structure = ['Upper Total Seat', 'Upper Total Women', 'Upper Chamber Women (%)', 'Rank', 
#                           'Political System', 'Structure of Parliament', 'Suspended Parliament',
#                           'Transitional Status', 'IPU Membership','gender_equality', 'themes', 'gender_quota']

# # Merge with reference data on month, year, and country
# pvtw_merged = pvtw_merged.merge(parliment_structure, on=['Month', 'Year', 'Country'], how='left', suffixes=('', '_struct'))

# # Fill missing values for 2019-2025 using structure data
# for col in cols_to_fill_structure:
#     mask_stru = (pvtw_merged['year'].between(2019, 2025)) & (pvtw_merged[col].isna())
#     pvtw_merged.loc[mask_stru, col] = pvtw_merged.loc[mask_stru, f"{col}_struct"]

# # Drop auxiliary columns
# pvtw_merged.drop(columns=[f"{col}_struct" for col in cols_to_fill_structure if f"{col}_struct" in pvtw_merged.columns], inplace=True)

In [182]:
pvtw_merged .drop(['year', 'Country', 'Country Code','subregion'], axis=1, inplace=True)

In [183]:
pvtw_merged.columns

Index(['event_id_cnty', 'event_date', 'time_precision', 'disorder_type',
       'event_type', 'sub_event_type', 'actor1', 'assoc_actor_1', 'inter1',
       'actor2', 'assoc_actor_2', 'inter2', 'interaction',
       'civilian_targeting', 'iso', 'region', 'country', 'admin1', 'admin2',
       'admin3', 'location', 'latitude', 'longitude', 'geo_precision',
       'source', 'source_scale', 'notes', 'fatalities', 'tags', 'timestamp',
       'Month', 'Year', 'Region', 'Subregion', 'Lower Total Seat',
       'Lower Total Women', 'Lower Chamber Women (%)', 'Upper Total Seat',
       'Upper Total Women', 'Upper Chamber Women (%)', 'Rank',
       'Suspended Parliament', 'Structure of Parliament', 'Region_meta',
       'IPU Membership', 'Political System', 'Transitional Status',
       'gender_equality', 'themes', 'gender_quota'],
      dtype='object')

In [184]:
pvtw_merged['Subregion'].value_counts()

Subregion
south_asia                    15631
north_america                 13652
south_america                  7357
southern_europe                6468
east_asia                      5060
middle_east                    3619
western_europe                 3486
central_and_eastern_europe     3275
central_africa                 2998
south_east_asia                2922
west_africa                    2688
east_africa                    2099
north_africa                   1818
southern_africa                1772
central_america                1529
nordic_countries               1235
caribbean                       852
central_asia                    586
australia_and_new_zealand       214
pacific_islands                  95
Name: count, dtype: int64

In [185]:
pvtw_merged ['region'].value_counts()

region
North America                13652
South Asia                   12669
Europe                       10943
Middle East                   9477
South America                 7361
East Asia                     5248
Eastern Africa                3052
Southeast Asia                2922
Western Africa                2763
Middle Africa                 2514
Northern Africa               1905
Central America               1529
Caucasus and Central Asia     1440
Southern Africa               1325
Caribbean                      987
Oceania                        325
Name: count, dtype: int64

In [186]:
#Drop subregion from wip
pvtw_merged.drop(['Subregion'], axis=1, inplace=True)

In [187]:
#rename region to subregion in pvtw
pvtw_merged.rename(columns={'region':'Subregion'}, inplace=True)

In [188]:
missing_regions = pvtw_merged[pvtw_merged['Region'].isna()][['country', 'Subregion', 'Region']]
print(missing_regions['country'].unique())

['Palestine' 'Taiwan' 'Reunion' 'Kosovo' 'French Guiana' 'Puerto Rico'
 'Sint Maarten' 'Mayotte' 'Bailiwick Of Guernsey' 'Greenland'
 'New Caledonia' 'Martinique' 'Caribbean Netherlands' 'French Polynesia'
 'Guam' 'Saint-Martin' 'Guadeloupe' 'Curacao']


In [189]:
missing_seat = pvtw_merged[pvtw_merged['Lower Total Seat'].isna()][['country', 'Subregion', 'Region']]
print(missing_seat['country'].unique())

['Palestine' 'Taiwan' 'Reunion' 'Kosovo' 'French Guiana' 'Puerto Rico'
 'Sint Maarten' 'Mayotte' 'Bailiwick Of Guernsey' 'Greenland'
 'New Caledonia' 'Martinique' 'Caribbean Netherlands' 'French Polynesia'
 'Guam' 'Saint-Martin' 'Guadeloupe' 'Curacao']


In [190]:
region_mapping = {
    'Palestine': 'middle_east_and_north_africa',
    'Taiwan': 'asia',
    'Reunion': 'africa',  # French overseas territory, but geographically in Africa
    'Kosovo': 'europe',
    'French Guiana': 'americas',
    'Puerto Rico': 'americas',
    'Sint Maarten': 'americas',
    'Mayotte': 'africa',  # French overseas territory near Madagascar
    'Bailiwick Of Guernsey': 'europe',
    'Greenland': 'europe',  # Politically part of Denmark
    'New Caledonia': 'pacific',
    'Martinique': 'americas',
    'Caribbean Netherlands': 'americas',
    'French Polynesia': 'pacific',
    'Guam': 'pacific',
    'Saint-Martin': 'americas',
    'Guadeloupe': 'americas',
    'Curacao': 'americas'
}
pvtw_merged['Region'] = pvtw_merged['Region'].fillna(pvtw_merged['country'].map(region_mapping))

In [191]:
pvtw_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78112 entries, 0 to 78111
Data columns (total 49 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   event_id_cnty            78112 non-null  object 
 1   event_date               78112 non-null  object 
 2   time_precision           78112 non-null  int64  
 3   disorder_type            78112 non-null  object 
 4   event_type               78112 non-null  object 
 5   sub_event_type           78112 non-null  object 
 6   actor1                   78112 non-null  object 
 7   assoc_actor_1            57339 non-null  object 
 8   inter1                   78112 non-null  object 
 9   actor2                   28030 non-null  object 
 10  assoc_actor_2            24830 non-null  object 
 11  inter2                   28030 non-null  object 
 12  interaction              78112 non-null  object 
 13  civilian_targeting       22229 non-null  object 
 14  iso                   

In [192]:
pvtw_merged['event_date'] = pd.to_datetime(pvtw_merged['event_date'] )

In [193]:
pvtw_merged['Region'] = pvtw_merged['Region'].str.title()

In [194]:
pvtw_merged['Region'].value_counts()

Region
Asia                            24387
Americas                        23529
Europe                          14539
Sub_Saharan_Africa               9557
Middle_East_And_North_Africa     5753
Pacific                           325
Africa                             22
Name: count, dtype: int64

In [195]:
pvtw_merged.to_csv('pvtw_merged_with_parliament_1997_2024.csv')

['Palestine' 'Taiwan' 'Reunion' 'Kosovo' 'French Guiana' 'Puerto Rico'
 'Sint Maarten' 'Mayotte' 'Bailiwick Of Guernsey' 'Greenland'
 'New Caledonia' 'Martinique' 'Caribbean Netherlands' 'French Polynesia'
 'Guam' 'Saint-Martin' 'Guadeloupe' 'Curacao']

 These are mostly territories, regions, and disputed areas. These countries do not have parliament data in IPU. So, For the hypothesis, these countries will filter out.

WIP might only include recognized sovereign states, whereas PVTW includes territories and disputed regions.