In [2]:
import pandas as pd
import os

In [91]:
import chardet 

In [105]:
import pycountry 

Define a function to read in and concat the regional election vote totals

In [56]:
def read_and_join(subfolder):
    notebook_dir = os.path.dirname(os.path.abspath('__file__'))
    data_folder_relative = os.path.join('..', '..', 'data', 'raw', subfolder)
    data_folder_absolute = os.path.abspath(data_folder_relative)
    
    excel_files = [file for file in os.listdir(data_folder_absolute) if file.endswith('.xlsx')]

    dataframes = []

    for excel_file in excel_files:
        excel_path = os.path.join(data_folder_absolute, excel_file)
        print(f"Reading {excel_path}")
        
        country_abbreviation = excel_file[:3] 
        
        df = pd.read_excel(excel_path, sheet_name='votes')  
        df['country_abbrev'] = country_abbreviation  
        dataframes.append(df)

    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        return combined_df
    else:
        print("No Excel files found or error reading files.")
        return None


Another to read in the party info for joining 

In [42]:
def read_and_join_meta(subfolder):
    notebook_dir = os.path.dirname(os.path.abspath('__file__'))
    data_folder_relative = os.path.join('..', '..', 'data', 'raw', subfolder)
    data_folder_absolute = os.path.abspath(data_folder_relative)
    
    excel_files = [file for file in os.listdir(data_folder_absolute) if file.endswith('.xlsx')]

    dataframes = []

    for excel_file in excel_files:
        excel_path = os.path.join(data_folder_absolute, excel_file)
        print(f"Reading {excel_path}")
        
        country_abbreviation = excel_file[:3]  # Extract the first three characters as abbreviation
        
        df = pd.read_excel(excel_path)
        df.columns = df.columns.str.lower()  # Convert column names to lowercase
        
        # Unify "party" and "party name" columns into a single column "party"
        if "party name" in df.columns:
            df["party"] = df["party name"]
            df = df.drop(columns=["party name"])
        
        df['country_abbrev'] = country_abbreviation  # Add the abbreviation as a new column
        dataframes.append(df)
    
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        return combined_df
    else:
        print("No Excel files found or error reading files.")
        return None


In [43]:
party_info = read_and_join_meta('regional_election')

Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\AUT_reg_1945-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\BEL_reg_1974-2014.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\DEN_reg_1946-2013.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\FRA_reg_1986-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\GER_reg_1946-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\ITA_reg_1947-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\POR_ACO_MAD_reg_1976-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\SPA_reg_1980-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\UKI_reg_1945-2012.xlsx


In [65]:
raw_reg = read_and_join('regional_election')

Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\AUT_reg_1945-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\BEL_reg_1974-2014.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\DEN_reg_1946-2013.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\FRA_reg_1986-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\GER_reg_1946-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\ITA_reg_1947-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\POR_ACO_MAD_reg_1976-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\SPA_reg_1980-2015.xlsx
Reading c:\Users\samtg\github\subnational_inequality\data\raw\regional_election\UKI_reg_1945-2012.xlsx


In [35]:
def add_single_to_dataframe(existing_df, new_excel_path):
    new_df = pd.read_excel(new_excel_path)
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    return combined_df

Pivot the vote df longer to join

In [66]:
to_drop = ['elec_day','elec_month','turnout','seats']

In [67]:
raw_reg = raw_reg.drop(columns = to_drop)

In [68]:
id_columns = ['country','country_abbrev','region', 'elec_year', 'electorate','valid','votes']

In [69]:
melted_df = pd.melt(raw_reg, id_vars=id_columns, var_name='party', value_name='vote_count')

In [47]:
party_info.head(5)

Unnamed: 0,abbreviation,notes,party,country_abbrev
0,AHS,,Aktive Heimat - Sozialisten,AUT
1,ALNO,,Alternative Liste NO,AUT
2,ASOK,,Allianz Soziales Kärnten,AUT
3,BCH,,Bill Clinton - Handy Borse Fanclub,AUT
4,BGO,,Burgerliche Grune Osterreich,AUT


In [76]:
melted_df = melted_df.rename(columns = {'party':'party_abbrev'})

In [81]:
melted_df

Unnamed: 0,country,country_abbrev,region,elec_year,electorate,valid,votes,party_abbrev,vote_count
0,Austria,AUT,Burgenland,1945,137222.0,131597.0,132262.0,AHS,
1,Austria,AUT,Burgenland,1949,169836.0,164398.0,165432.0,AHS,
2,Austria,AUT,Burgenland,1953,176396.0,167514.0,169787.0,AHS,
3,Austria,AUT,Burgenland,1956,175955.0,164950.0,167734.0,AHS,
4,Austria,AUT,Burgenland,1960,178255.0,162664.0,165183.0,AHS,
...,...,...,...,...,...,...,...,...,...
2943403,United Kingdom,UKI,Northern Ireland,1982,1048807.0,633120.0,653450.0,WP,17216.0
2943404,United Kingdom,UKI,Northern Ireland,1998,1178556.0,807683.0,824391.0,WP,1989.0
2943405,United Kingdom,UKI,Northern Ireland,2003,1097526.0,692028.0,702249.0,WP,1881.0
2943406,United Kingdom,UKI,Northern Ireland,2007,1107904.0,690313.0,696538.0,WP,975.0


Join the two dfs

In [78]:
joined_df = melted_df.merge(party_info, how = 'left', left_on = ['country_abbrev','party_abbrev'], right_on = ['country_abbrev','abbreviation'])

In [79]:
joined_df

Unnamed: 0,country,country_abbrev,region,elec_year,electorate,valid,votes,party_abbrev,vote_count,abbreviation,notes,party
0,Austria,AUT,Burgenland,1945,137222.0,131597.0,132262.0,AHS,,AHS,,Aktive Heimat - Sozialisten
1,Austria,AUT,Burgenland,1949,169836.0,164398.0,165432.0,AHS,,AHS,,Aktive Heimat - Sozialisten
2,Austria,AUT,Burgenland,1953,176396.0,167514.0,169787.0,AHS,,AHS,,Aktive Heimat - Sozialisten
3,Austria,AUT,Burgenland,1956,175955.0,164950.0,167734.0,AHS,,AHS,,Aktive Heimat - Sozialisten
4,Austria,AUT,Burgenland,1960,178255.0,162664.0,165183.0,AHS,,AHS,,Aktive Heimat - Sozialisten
...,...,...,...,...,...,...,...,...,...,...,...,...
2943403,United Kingdom,UKI,Northern Ireland,1982,1048807.0,633120.0,653450.0,WP,17216.0,WP,,Workers Party
2943404,United Kingdom,UKI,Northern Ireland,1998,1178556.0,807683.0,824391.0,WP,1989.0,WP,,Workers Party
2943405,United Kingdom,UKI,Northern Ireland,2003,1097526.0,692028.0,702249.0,WP,1881.0,WP,,Workers Party
2943406,United Kingdom,UKI,Northern Ireland,2007,1107904.0,690313.0,696538.0,WP,975.0,WP,,Workers Party


Join with Manifesto Data to get party family. 

In [84]:
crosswalk_relative = os.path.join('..', '..', 'data', 'raw', 'crosswalks')
crosswalks_folder_absolute = os.path.abspath(crosswalk_relative)

In [86]:
csv_file_name = 'ches_pfid_manifesto.csv'
csv_file_path = os.path.join(crosswalks_folder_absolute, csv_file_name)

In [93]:
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']

crosswalk_relative = os.path.join('..', '..', 'data', 'raw', 'crosswalks')
crosswalks_folder_absolute = os.path.abspath(crosswalk_relative)

csv_file_name = 'ches_pfid_manifesto.csv'
csv_file_path = os.path.join(crosswalks_folder_absolute, csv_file_name)

detected_encoding = detect_encoding(csv_file_path)
print("Detected encoding:", detected_encoding)

try:
    crosswalk_df = pd.read_csv(csv_file_path, encoding=detected_encoding)
    print("CSV file read successfully:", crosswalk_df)
except UnicodeDecodeError:
    print("Unable to read CSV file with detected encoding.")

Detected encoding: ISO-8859-1
CSV file read successfully:     country.x dataset_key.x  dataset_party_id.x name_short.x  \
0         ALB     manifesto               75951       OMONIA   
1         ALB     manifesto               75624           PD   
2         ALB     manifesto               75722         PRSH   
3         ALB     manifesto               75220          PPS   
4         AUT     manifesto               42710         BZï¿   
..        ...           ...                 ...          ...   
390       GBR     manifesto               51421      LibDems   
391       GBR     manifesto               51901           PC   
392       GBR     manifesto               51902          SNP   
393       GBR     manifesto               51210           SF   
394       GBR     manifesto               51951         UKIP   

                                         name.x  \
0    Partia Bashkimi pï¿½ï¿½r tï¿½ï¿½ Drejtat e   
1             Partia Demokratike e Shqipï¿½ï¿½r   
2                  P

In [97]:
joined_df['country_abbrev'].value_counts()

country_abbrev
DEN    694200
ITA    538272
GER    472056
FRA    386616
SPA    371664
AUT    299040
UKI     79032
BEL     57672
POR     44856
Name: count, dtype: int64

In [113]:
crosswalk_df['country_name'] = crosswalk_df['country.x'].apply(lambda x: pycountry.countries.get(alpha_3=x).name if pycountry.countries.get(alpha_3=x) else x)

In [114]:
crosswalk_df['country_name']

0             Albania
1             Albania
2             Albania
3             Albania
4             Austria
            ...      
390    United Kingdom
391    United Kingdom
392    United Kingdom
393    United Kingdom
394    United Kingdom
Name: country_name, Length: 395, dtype: object