# Correlation analysis

## 1. Preprocessing

In [1]:
%%capture
%run preprocessing.ipynb

### 1.1 Select relevant data

In [2]:
# filter relevant timeframe
merged_df = merged_df.query('1995 <= year <= 2024')

# exclude irrelevant variables
merged_df = merged_df.drop(columns = ['f_unemployment', 
                            'm_unemployment', 
                            'Tertiary education (levels 5-8)',
                            'Upper secondary and post-secondary non-tertiary education (levels 3 and 4)'])

### 1.2 Combine with fragmentation and polarization index

In [3]:
# deduplicate: if in a year there were 2 elections, keep the first one
elections_dedup = elections.reset_index().sort_values('election_date').drop_duplicates(
    subset=['geo', 'election_year'], keep='first'
)

merged_df = merged_df.reset_index().sort_values('election_date').drop_duplicates(
    subset=['geo', 'year', 'party_code'], keep='first'
).set_index(['geo', 'year'])

# remove party 34020 (SYRIZA) for years >= 2012 (it merged into 34212)
merged_df = merged_df.reset_index()
merged_df = merged_df[~((merged_df['geo'] == 'GR') & 
                        (merged_df['year'] >= 2012) & 
                        (merged_df['party_code'] == 34020))]
merged_df = merged_df.set_index(['geo', 'year'])

df_correlations = pd.merge(
    left=merged_df.reset_index(),
    right=elections_dedup,
    how='left',
    left_on=['geo', 'year'],
    right_on=['geo', 'election_year']
).set_index(['geo', 'year'])

# df_correlations

### 1.3 Imputation

In [4]:
# imputation of missing values in demographic/socioeconomic variables
socioeco_vars = [
    'total_poverty', 'gdp_per_cap', 'consumer_prices','t_unemployment', 
    'dependency_ratio_15_64', 'debt_to_gdp',
    'Less than primary, primary and lower secondary education (levels 0-2)',
    'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)',
    'migration_inflow', 'median_age', 'population_total',
    'urban_population_pct', 'wealth_top10_share'
]

missings_check = df_correlations.groupby(['geo', 'year'])[socioeco_vars].mean()

for geo in sorted(missings_check.index.get_level_values('geo').unique()):
    subset = missings_check.loc[geo]
    missing_rows = subset[subset.isna().any(axis=1)]
    if len(missing_rows) > 0:
        print(f"\n{geo}:")
        for year in missing_rows.index:
            missing_vars = missing_rows.loc[year][missing_rows.loc[year].isna()].index.tolist()
            print(f"  {year}: {missing_vars}")


AT:
  1995: ['consumer_prices', 'migration_inflow']
  2000: ['Less than primary, primary and lower secondary education (levels 0-2)', 'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)']
  2001: ['Less than primary, primary and lower secondary education (levels 0-2)', 'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)']
  2002: ['total_poverty', 'Less than primary, primary and lower secondary education (levels 0-2)', 'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)']
  2003: ['Less than primary, primary and lower secondary education (levels 0-2)', 'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)']
  2024: ['gdp_per_cap', 'migration_inflow', 'urban_population_pct']

BE:
  1995: ['consumer_prices']
  2002: ['total_poverty']
  2008: ['migration_inflow']
  2009: ['migration_inflow']
  2024: ['gdp_per_cap', 'migration_inflow', 'urban_population_pct']

BG:
  1995: 

In [5]:
# create dataset only with country-year combinations
country_year_data = df_correlations.reset_index().groupby(['geo', 'year'])[socioeco_vars].first().reset_index()

# sort and interpolate at country level
country_year_data = country_year_data.sort_values(['geo', 'year'])

country_year_data[socioeco_vars] = country_year_data.groupby('geo')[socioeco_vars].transform(
    lambda x: x.interpolate(method='linear', limit_area='inside', limit=4)
)

# extrapolate
country_year_data[socioeco_vars] = country_year_data.groupby('geo')[socioeco_vars].transform(
    lambda x: x.interpolate(method='linear', limit_area='outside', limit=2, limit_direction='both')
)

# check missings
for geo in sorted(country_year_data['geo'].unique()):
    subset = country_year_data[country_year_data['geo'] == geo]
    missing_rows = subset[subset[socioeco_vars].isna().any(axis=1)]
    if len(missing_rows) > 0:
        print(f"\n{geo}:")
        for _, row in missing_rows.iterrows():
            missing_vars = [v for v in socioeco_vars if pd.isna(row[v])]
            print(f"  {int(row['year'])}: {missing_vars}")


BG:
  1995: ['total_poverty', 't_unemployment', 'Less than primary, primary and lower secondary education (levels 0-2)', 'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)', 'migration_inflow']
  1996: ['total_poverty', 't_unemployment', 'Less than primary, primary and lower secondary education (levels 0-2)', 'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)', 'migration_inflow']
  1997: ['total_poverty', 't_unemployment', 'Less than primary, primary and lower secondary education (levels 0-2)', 'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)', 'migration_inflow']
  1998: ['migration_inflow']
  1999: ['migration_inflow']
  2000: ['migration_inflow']
  2001: ['migration_inflow']
  2002: ['migration_inflow']
  2003: ['migration_inflow']
  2004: ['migration_inflow']

CY:
  1996: ['total_poverty', 't_unemployment', 'Less than primary, primary and lower secondary education (levels 0-2)', 'Upper 

In [6]:
# merge back
df_correlations = df_correlations.reset_index()
df_correlations = df_correlations.drop(columns=socioeco_vars)
df_correlations = pd.merge(df_correlations, country_year_data, on=['geo', 'year'], how='left')

### 1.4 Checks

In [7]:
# Plausibility of values
vote_sums = df_correlations.groupby(['geo', 'year'])['vote_share_percent'].sum().reset_index()
vote_sums.columns = ['geo', 'year', 'total_vote_share']

# Check if any are far from 100
print(vote_sums[vote_sums['total_vote_share'] > 100])  

# if empty, no duplicates

Empty DataFrame
Columns: [geo, year, total_vote_share]
Index: []


# 2. Correlation analysis

In [8]:
# ## aggregate df to party families

# # Which columns have to be summed
# sum_vars = ['vote_share_percent']  

# # Which columns are the same across parties (take first value)
# constant_vars = socioeco_vars + ['election_date', 'is_election', 'vs_AGR', 'vs_CHR', 'vs_CON',
#        'vs_DIV', 'vs_ECO', 'vs_ETH', 'vs_LEF', 'vs_LIB', 'vs_NAT', 'vs_SIP',
#        'vs_SOC'] 

# # Group by country, year, and party family
# df_parfam = df_correlations.groupby(['geo', 'year', 'parfam_code', 'parfam_label']).agg(
#     {**{var: 'sum' for var in sum_vars},
#      **{var: 'first' for var in constant_vars if var in df_correlations.columns}}
# ).reset_index()

# df_parfam.head(20)

In [9]:
df_correlations.columns

Index(['geo', 'year', 'country', 'election_date_x', 'party_name',
       'vote_share_percent', 'party_code', 'parfam', 'right_left_position',
       'planned_economy', 'welfare_state', 'market_economy', 'inter_positive',
       'democracy', 'sustainability_positive', 'parfam_code', 'parfam_label',
       'election_year', 'election_date_y', 'vs_AGR', 'vs_CHR', 'vs_CON',
       'vs_DIV', 'vs_ECO', 'vs_ETH', 'vs_LEF', 'vs_LIB', 'vs_NAT', 'vs_SIP',
       'vs_SOC', 'fragmentation_enp', 'polarization_dalton', 'total_poverty',
       'gdp_per_cap', 'consumer_prices', 't_unemployment',
       'dependency_ratio_15_64', 'debt_to_gdp',
       'Less than primary, primary and lower secondary education (levels 0-2)',
       'Upper secondary, post-secondary non-tertiary and tertiary education (levels 3-8)',
       'migration_inflow', 'median_age', 'population_total',
       'urban_population_pct', 'wealth_top10_share'],
      dtype='object')

In [10]:
df_parfam['parfam_code'].dropna().unique()

NameError: name 'df_parfam' is not defined

In [None]:
# Correlation by party family
corr_by_parfam = {}
df_parfam['parfam_code'].dropna().unique() + ['']

for parfam in df_correlations['parfam_code'].dropna().unique():
    subset = df_correlations[df_correlations['parfam_code'] == parfam]
    corr = subset[socioeco_vars + ['vote_share_percent']].corr()['vote_share_percent'].drop('vote_share_percent')
    corr_by_parfam[parfam] = corr

# Convert to DataFrame
corr_df = pd.DataFrame(corr_by_parfam).T
corr_df = corr_df.sort_index()