In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import pycountry

In [3]:
import re

In [4]:
from fuzzywuzzy import fuzz



In [5]:
import os 

In [6]:
def clean_and_prepare(df_inequality,df_elections):
    df_inequality['country'] = df_inequality['file'].str[:2]
    df_inequality['country_name'] = df_inequality['country'].apply(get_country_name)
    
    df_inequality.dropna(subset=['region'], inplace=True)
    df_elections.dropna(subset=['regionname'], inplace=True)

    df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
    df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

    df_inequality['cleaned_region'] = df_inequality['cleaned_region'].str.strip()

    countries_to_keep = df_inequality['country_name'].unique()
    df_elections = df_elections[df_elections['country'].isin(countries_to_keep)]

    return df_inequality, df_elections

In [5]:
def clean_and_join_data(electoral_data_path, inequality_data_path, similarity_threshold=0.7):
    # Step 1: Import electoral data
    df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')

    # Step 2: Import inequality data
    df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["region", "year", "avg_gini"])

    # Step 3: Clean region names for both datasets
    df_inequality['cleaned_region'] = df_inequality['region'].str.lower().str.replace('[^\w\s]', '')
    df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

    # Calculate TF-IDF vectors and cosine similarity
    tfidf_vectorizer = TfidfVectorizer()
    X_inequality = tfidf_vectorizer.fit_transform(df_inequality['cleaned_region'])
    X_elections = tfidf_vectorizer.transform(df_elections['cleaned_region'])
    similarity_matrix = cosine_similarity(X_elections, X_inequality)

    # Initialize lists to store predicted matches and confidence scores
    predicted_matches = []
    confidence_scores = []

    # Step 4: Iterate through electoral data and find matches
    for i in range(len(df_elections)):
        best_match_index = np.argmax(similarity_matrix[i])
        best_similarity_score = similarity_matrix[i][best_match_index]

        if best_similarity_score > similarity_threshold:
            predicted_match = df_inequality.iloc[best_match_index]['region']
        else:
            predicted_match = None

        predicted_matches.append(predicted_match)
        confidence_scores.append(best_similarity_score)

    df_elections['predicted_region'] = predicted_matches
    df_elections['confidence_score'] = confidence_scores

    # Step 5: Merge the two datasets based on predicted regions and years
    joined_data = df_elections.merge(df_inequality, left_on=['predicted_region', 'year'],
                                     right_on=['region', 'year'], how='left')

    return joined_data

In [7]:
def get_country_name(abbreviation):
    try:
        country = pycountry.countries.get(alpha_2=abbreviation)
        return country.name
    except AttributeError:
        return None

In [80]:
def clean_region(region):
    # Check if the region starts with '[' and ends with ']'
    if re.match(r'\[\d+\](.+)', region):
        # Extract the text within square brackets
        cleaned_region = re.match(r'\[\d+\](.+)', region).group(1)
    else:
        # For non-Spanish regions, remove everything before the last hyphen '-'
        cleaned_region = re.sub(r'^.*-', '', region)
    
    return cleaned_region.strip()

In [90]:
def clean_inequality_region(df):
    df.reset_index(drop=True, inplace=True)
    
    df.loc[df['country_name'] == 'Spain', 'cleaned_region'] = df['region'].str.lower().apply(lambda x: re.sub(r'\[\d+\](.+)', r'\1', x))
    df.loc[df['country_name'] != 'Spain', 'cleaned_region'] = df['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
    df['cleaned_region'] = df['cleaned_region'].str.strip()

    return df

In [8]:
save_folder_relative = os.path.join('..', '..', 'data', 'raw','national_election','eu_ned_national_nuts2(1).csv')
electoral_data_path = os.path.abspath(save_folder_relative)

In [9]:
df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')


In [21]:
ineq_folder_relative = os.path.join('..', '..', 'data', 'raw','lissy','multination_gini_1.txt')
inequality_data_path = os.path.abspath(ineq_folder_relative)

In [83]:

df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["file","region", "year", "avg_gini"])

In [23]:
df1,df2 = clean_and_prepare(df_inequality,df_elections)

In [84]:
df_inequality['country'] = df_inequality['file'].str[:2]

In [85]:
df_inequality['country_name'] = df_inequality['country'].apply(get_country_name)

In [86]:
df_inequality.dropna(subset=['region'], inplace=True)
df_elections.dropna(subset=['regionname'], inplace=True)

In [91]:
df_inequality = clean_inequality_region(df_inequality)

In [101]:
df_inequality[(df_inequality['country_name'] == 'Spain') & (df_inequality['avg_gini'].isna())]

Unnamed: 0,file,region,year,avg_gini,country,country_name,cleaned_region


In [75]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

In [26]:
joined_data = df_elections.merge(df_inequality, left_on=['country','regionname', 'year'],
                                right_on=['country_name','predicted_region', 'year'], how='left')

In [33]:
df_inequality[df_inequality['country_name'] == 'Italy'][['country_name','region', 'predicted_region','cleaned_region', 'confidence_score']]

Unnamed: 0,country_name,region,predicted_region,cleaned_region,confidence_score
864,Italy,[10]ITI2 - Umbria,,umbria,0.0
865,Italy,[11]ITI3 - Marche,,marche,0.0
866,Italy,[12]ITI4 - Lazio,,lazio,0.0
867,Italy,[13]ITF1 - Abruzzo,,abruzzo,0.0
868,Italy,[14]ITF2 - Molise,,molise,0.0
...,...,...,...,...,...
95,Italy,[5]ITH3 - Veneto,,veneto,0.0
96,Italy,[6]ITH4 - Friuli,,friuli,0.0
97,Italy,[7]ITC3 - Liguria,,liguria,0.0
98,Italy,[8]ITH5 - Emilia Romagna,,emilia romagna,0.0


In [28]:
df_elections[df_elections['country'] == 'Italy']['cleaned_region'].value_counts()

cleaned_region
puglia                                 115
campania                               113
sardegna                               111
veneto                                 110
sicilia                                109
calabria                               108
lombardia                              108
toscana                                107
piemonte                               103
abruzzo                                102
provincia autonoma di bolzano/bozen    101
emilia-romagna                         101
molise                                 100
provincia autonoma di trento           100
liguria                                100
lazio                                  100
friuli-venezia giulia                   99
basilicata                              98
marche                                  96
umbria                                  95
valle d'aosta/vallée d'aoste            52
Name: count, dtype: int64

What if we just do it with minimal cleaning 

In [59]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

In [60]:
df_inequality['cleaned_region'] = df_inequality['cleaned_region'].str.strip()

In [116]:
simple_join = df_inequality.merge(df_elections, left_on=['country_name','cleaned_region', 'year'], right_on=['country','cleaned_region', 'year'], how='left')

In [95]:
simple_join_t = df_inequality.merge(df_elections, left_on=['country_name', 'year'], right_on=['country', 'year'], how='left')

In [101]:
simple_join_t.columns

Index(['file', 'region', 'year', 'avg_gini', 'country_x', 'country_name',
       'cleaned_region_x', 'country_y', 'country_code', 'nutslevel', 'nuts2',
       'regionname', 'type', 'party_abbreviation', 'party_english',
       'party_native', 'partyfacts_id', 'partyvote', 'electorate', 'totalvote',
       'validvote', 'cleaned_region_y'],
      dtype='object')

In [104]:
simple_join_t['country_y'].value_counts()

country_y
France     42365
Germany    41290
Spain      25203
Austria     8627
Italy       3840
Belgium     1323
Name: count, dtype: int64

In [79]:
simple_join.dropna(subset=['party_native'], inplace=True)

In [120]:
simple_join[simple_join['country_y'].isna()][['country_name','country_x','country_y','cleaned_region','year','party_native','avg_gini']]

Unnamed: 0,country_name,country_x,country_y,cleaned_region,year,party_native,avg_gini
0,Austria,at,,nordburgenland,2000,,0.266
1,Austria,at,,südburgenland,2000,,0.246
2,Austria,at,,eisenwurzen,2000,,0.261
3,Austria,at,,süd,2000,,0.254
4,Austria,at,,sankt pölten,2000,,0.256
...,...,...,...,...,...,...,...
2763,Italy,it,,veneto,1998,,0.322
2764,Italy,it,,friuli,1998,,0.310
2765,Italy,it,,liguria,1998,,0.321
2766,Italy,it,,emilia romagna,1998,,0.315


In [128]:
countries_to_keep = df_inequality['country_name'].unique()

In [129]:
df_elections = df_elections[df_elections['country'].isin(countries_to_keep)]

In [158]:
for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    print(f"{country}: {years}")

Austria: [1990 1994 1995 2002 2006 2008 2013 2017 2019]
Belgium: [1991 1995 1999 2003 2007 2010 2014 2019]
France: [1993 1997 2002 2007 2012 2017]
Germany: [1990 1994 1998 2002 2005 2009 2013 2017]
Italy: [1992 1994 1996 2001 2006 2008 2013 2018]
Spain: [1993 1996 2000 2004 2008 2011 2015 2016 2019]


In [164]:
elec_country = []
elec_years = []

for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    elec_country.append(country)
    elec_years.append(years)

election_df = pd.DataFrame({'country': elec_country, 'years': elec_years})

In [130]:
elec_data = []

for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    for year in years:
        elec_data.append({'country': country, 'year': year})

election_df = pd.DataFrame(elec_data)

In [131]:
election_df['flag'] = 1

In [133]:
inequality_with_elections = df_inequality.merge(election_df, left_on=['country_name', 'year'], right_on=['country', 'year'], how='outer')

In [134]:
df['country_name'].fillna(df['country_y'], inplace=True)

In [135]:
inequality_with_elections['flag'].fillna(0, inplace=True)

In [136]:
df = inequality_with_elections[['country_name', 'cleaned_region', 'year', 'avg_gini', 'flag','country_y']]

In [108]:
df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)


In [109]:
df['gini_since'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.expanding().mean().shift(fill_value=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gini_since'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.expanding().mean().shift(fill_value=0))


In [110]:
df['change_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.diff().cumsum().shift(fill_value=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['change_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.diff().cumsum().shift(fill_value=0))


In [None]:
df.groupby(['group_ids', df['event_today_in_group'].shift().cumsum()])['days_since_last_event'].cumsum()

In [137]:
df['change_in_avg_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].diff()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['change_in_avg_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].diff()


In [139]:
df[df['cleaned_region'] == 'lombardia'][['country_name', 'cleaned_region', 'year', 'avg_gini', 'flag','change_in_avg_gini']].sort_values('year')

Unnamed: 0,country_name,cleaned_region,year,avg_gini,flag,change_in_avg_gini
1994,Italy,lombardia,1989,0.3,0.0,-0.177
2013,Italy,lombardia,1991,0.276,0.0,-0.024
2032,Italy,lombardia,1993,0.327,0.0,0.051
2051,Italy,lombardia,1995,0.333,0.0,0.006
2071,Italy,lombardia,1998,0.347,0.0,0.014
1855,Italy,lombardia,2000,0.341,0.0,
1875,Italy,lombardia,2004,0.326,0.0,-0.015
1895,Italy,lombardia,2008,0.336,1.0,
1915,Italy,lombardia,2010,0.33,0.0,0.004
1935,Italy,lombardia,2014,0.34,0.0,0.01


In [115]:
df[(df['country_name'] == 'Spain') & (df['cleaned_region'] == 'andalucía')]

Unnamed: 0,country_name,cleaned_region,year,avg_gini,flag,country_y,gini_since,change_gini
986,Spain,andalucía,2004,0.349,1.0,Spain,0.0,0.0
1004,Spain,andalucía,2005,0.348,0.0,,0.0,0.0
1023,Spain,andalucía,2006,0.345,0.0,,0.348,
1042,Spain,andalucía,2007,0.353,0.0,,0.3465,-0.003
1061,Spain,andalucía,2008,0.345,1.0,Spain,0.349,
1080,Spain,andalucía,2009,0.348,0.0,,0.348667,0.005
1099,Spain,andalucía,2010,0.357,0.0,,0.3485,0.0
1118,Spain,andalucía,2011,0.354,1.0,Spain,0.347,-0.004
1137,Spain,andalucía,2012,0.347,0.0,,0.3502,0.009
1156,Spain,andalucía,2013,0.358,0.0,,0.349667,-0.001


In [113]:
df[df['country_name'] == 'Spain']['cleaned_region'].value_counts()

cleaned_region
canarias                             17
cantabria                            17
la rioja                             17
extremadura                          17
galicia                              17
comunidad valenciana                 17
illes balears                        17
país vasco                           16
cataluña                             16
comunidad de madrid                  16
andalucía                            16
ciudad autónoma de ceuta             16
castilla-la mancha                   16
castilla y león                      16
principado de asturias               16
aragón                               16
región de murcia                     16
comunidad foral de navarra           16
ciudad autónoma de melilla           15
madrid                                8
east                                  8
south                                 8
north-west                            8
central                               8
canary islands           

In [38]:
election_years_df = df[df['flag'] == 1]

In [39]:
election_years_df.reset_index(drop=True, inplace=True)

In [369]:
joined_data = election_years_df.merge(df_elections, left_on=['country_name', 'cleaned_region', 'year'], right_on=['country', 'cleaned_region', 'year'], how='left')

In [370]:
joined_data.dropna(subset=['party_native'], inplace=True)   

In [371]:
joined_data['country_name'].value_counts()

country_name
France     537
Germany    251
Italy      125
Austria     36
Name: count, dtype: int64

In [372]:
election_years_df.groupby('country_name')['year'].unique()

country_name
Austria    [2006, 2008, 2013, 2017, 2019, 1994, 1995, 199...
Belgium     [1995, 2003, 2007, 2010, 2014, 1991, 1999, 2019]
France                  [1997, 2002, 2007, 2012, 2017, 1993]
Germany     [1994, 1998, 2002, 2005, 2009, 2013, 2017, 1990]
Italy       [2008, 1992, 1994, 1996, 2001, 2006, 2013, 2018]
Spain      [2004, 2008, 2011, 2015, 2016, 2019, 1993, 199...
Name: year, dtype: object

In [373]:
df_elections.groupby('country')['year'].unique()

country
Austria    [1990, 1994, 1995, 2002, 2006, 2008, 2013, 201...
Belgium     [1991, 1995, 1999, 2003, 2007, 2010, 2014, 2019]
France                  [1993, 1997, 2002, 2007, 2012, 2017]
Germany     [1990, 1994, 1998, 2002, 2005, 2009, 2013, 2017]
Italy       [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
Spain      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
Name: year, dtype: object

In [256]:
def compare_dfs(country):
    country_elections = df_elections[df_elections['country'] == country]
    country_eydf = election_years_df[election_years_df['country_name'] == country]
    print(country_elections.groupby('cleaned_region')['year'].unique(), country_eydf.groupby('cleaned_region')['year'].unique())

In [27]:
def return_country_dfs(country):
    country_elections = df_elections[df_elections['country'] == country]
    country_eydf = election_years_df[election_years_df['country_name'] == country]
    return country_elections, country_eydf

In [40]:
italy1, italy2 = return_country_dfs('Spain')

In [318]:
italy1 = italy1[['country', 'cleaned_region', 'year', 'party_native']]

In [41]:
italy1.groupby('cleaned_region')['year'].unique()

cleaned_region
andalucía                     [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
aragón                        [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
canarias                      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
cantabria                     [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
castilla y león               [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
castilla-la mancha            [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
cataluña                      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
ciudad autónoma de ceuta      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
ciudad autónoma de melilla    [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
comunidad de madrid           [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
comunidad foral de navarra    [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
comunidad valenciana          [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
extremadura              

In [42]:
italy2.groupby('cleaned_region')['year'].unique()

cleaned_region
[11]galicia                       [2004, 2008, 2011, 2015, 2016, 2019]
[12]principado de asturias        [2004, 2008, 2011, 2015, 2016, 2019]
[13]cantabria                     [2004, 2008, 2011, 2015, 2016, 2019]
[21]país vasco                    [2004, 2008, 2011, 2015, 2016, 2019]
[22]comunidad foral de navarra    [2004, 2008, 2011, 2015, 2016, 2019]
[23]la rioja                      [2004, 2008, 2011, 2015, 2016, 2019]
[24]aragón                        [2004, 2008, 2011, 2015, 2016, 2019]
[30]comunidad de madrid           [2004, 2008, 2011, 2015, 2016, 2019]
[3]madrid                                           [1993, 1996, 2000]
[41]castilla y león               [2004, 2008, 2011, 2015, 2016, 2019]
[43]extremadura                   [2004, 2008, 2011, 2015, 2016, 2019]
[4]central                                          [1993, 1996, 2000]
[51]cataluña                      [2004, 2008, 2011, 2015, 2016, 2019]
[52]comunidad valenciana          [2004, 2008, 2011, 2015, 201

In [322]:
join_italy = italy1.merge(italy2, left_on=['cleaned_region', 'year'], right_on=['cleaned_region', 'year'], how='left')

In [323]:
join_italy[join_italy['year'] == 2008]

Unnamed: 0,country,cleaned_region,year,party_native,country_name,avg_gini,flag,gini_since,change_gini
73,Italy,piemonte,2008,Lista di Pietro -- Italia del Valori,Italy,0.276,1.0,0.0,0.0
74,Italy,piemonte,2008,Il Popolo della Liberta,Italy,0.276,1.0,0.0,0.0
75,Italy,piemonte,2008,Fiamma Tricolore,Italy,0.276,1.0,0.0,0.0
76,Italy,piemonte,2008,Sinistra Italiana,Italy,0.276,1.0,0.0,0.0
77,Italy,piemonte,2008,Lega Nord,Italy,0.276,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2099,Italy,lazio,2008,Movimento per l'Autonomia,Italy,0.343,1.0,0.0,0.0
2100,Italy,lazio,2008,,Italy,0.343,1.0,0.0,0.0
2101,Italy,lazio,2008,Partito Democratico,Italy,0.343,1.0,0.0,0.0
2102,Italy,lazio,2008,Partito Socialista Italiano,Italy,0.343,1.0,0.0,0.0


In [319]:
italy2['cleaned_region'] = italy2['cleaned_region'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  italy2['cleaned_region'] = italy2['cleaned_region'].str.strip()


In [320]:
matches = italy1[italy1['cleaned_region'].isin(italy2['cleaned_region'])]
