In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import pycountry

In [3]:
import re

In [4]:
from fuzzywuzzy import fuzz,process



In [5]:
import os 

In [7]:
def clean_and_prepare(df_inequality,df_elections):
    df_inequality['country'] = df_inequality['file'].str[:2]
    df_inequality['country_name'] = df_inequality['country'].apply(get_country_name)
    
    df_inequality.dropna(subset=['region'], inplace=True)
    df_elections.dropna(subset=['regionname'], inplace=True)

    df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
    df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

    df_inequality['cleaned_region'] = df_inequality['cleaned_region'].str.strip()

    countries_to_keep = df_inequality['country_name'].unique()
    df_elections = df_elections[df_elections['country'].isin(countries_to_keep)]

    return df_inequality, df_elections

In [8]:
def clean_and_join_data(electoral_data_path, inequality_data_path, similarity_threshold=0.7):
    # Step 1: Import electoral data
    df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')

    # Step 2: Import inequality data
    df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["region", "year", "avg_gini"])

    # Step 3: Clean region names for both datasets
    df_inequality['cleaned_region'] = df_inequality['region'].str.lower().str.replace('[^\w\s]', '')
    df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

    # Calculate TF-IDF vectors and cosine similarity
    tfidf_vectorizer = TfidfVectorizer()
    X_inequality = tfidf_vectorizer.fit_transform(df_inequality['cleaned_region'])
    X_elections = tfidf_vectorizer.transform(df_elections['cleaned_region'])
    similarity_matrix = cosine_similarity(X_elections, X_inequality)

    # Initialize lists to store predicted matches and confidence scores
    predicted_matches = []
    confidence_scores = []

    # Step 4: Iterate through electoral data and find matches
    for i in range(len(df_elections)):
        best_match_index = np.argmax(similarity_matrix[i])
        best_similarity_score = similarity_matrix[i][best_match_index]

        if best_similarity_score > similarity_threshold:
            predicted_match = df_inequality.iloc[best_match_index]['region']
        else:
            predicted_match = None

        predicted_matches.append(predicted_match)
        confidence_scores.append(best_similarity_score)

    df_elections['predicted_region'] = predicted_matches
    df_elections['confidence_score'] = confidence_scores

    # Step 5: Merge the two datasets based on predicted regions and years
    joined_data = df_elections.merge(df_inequality, left_on=['predicted_region', 'year'],
                                     right_on=['region', 'year'], how='left')

    return joined_data

In [9]:
def get_country_name(abbreviation):
    try:
        country = pycountry.countries.get(alpha_2=abbreviation)
        return country.name
    except AttributeError:
        return None

In [10]:
def clean_region(region):
    # Check if the region starts with '[' and ends with ']'
    if re.match(r'\[\d+\](.+)', region):
        # Extract the text within square brackets
        cleaned_region = re.match(r'\[\d+\](.+)', region).group(1)
    else:
        # For non-Spanish regions, remove everything before the last hyphen '-'
        cleaned_region = re.sub(r'^.*-', '', region)
    
    return cleaned_region.strip()

In [11]:
def clean_inequality_region(df):
    df.reset_index(drop=True, inplace=True)
    
    df.loc[df['country_name'].isin(['Spain','Austria']), 'cleaned_region'] = df['region'].str.lower().apply(lambda x: re.sub(r'\[\d+\](.+)', r'\1', x))
    df.loc[~df['country_name'].isin(['Spain', 'Austria']), 'cleaned_region'] = df['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
    df['cleaned_region'] = df['cleaned_region'].str.strip()

    return df

In [12]:
def clean_region(region):
    cleaned_region = re.sub(r'^\[\d+\](.+)', r'\1', region)
    cleaned_region = re.sub(r'^.*-', '', cleaned_region)
    
    return cleaned_region.strip()

In [13]:
save_folder_relative = os.path.join('..', '..', 'data', 'raw','national_election','eu_ned_national_nuts2(1).csv')
electoral_data_path = os.path.abspath(save_folder_relative)

In [14]:
df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')


In [15]:
ineq_folder_relative = os.path.join('..', '..', 'data', 'raw','lissy','multination_gini_1.txt')
inequality_data_path = os.path.abspath(ineq_folder_relative)

In [16]:

df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["file","region", "year", "avg_gini"])

In [23]:
df1,df2 = clean_and_prepare(df_inequality,df_elections)

In [17]:
df_inequality['country'] = df_inequality['file'].str[:2]
df_inequality['country_name'] = df_inequality['country'].apply(get_country_name)

In [18]:
df_inequality.dropna(subset=['region'], inplace=True)
df_elections.dropna(subset=['regionname'], inplace=True)

In [141]:
df_inequality = clean_inequality_region(df_inequality)

In [159]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(clean_region)

In [177]:
df_elections['cleaned_region'] = df_elections['regionname'].str.lower().apply(clean_region)

In [160]:
df_inequality[df_inequality['country_name'] == 'Austria']['cleaned_region'].unique()

array(['nordburgenland', 'südburgenland', 'eisenwurzen', 'süd',
       'sankt pölten', 'waldviertel', 'weinviertel',
       'wiener umland/nordteil', 'wiener umland/südteil', 'wien',
       'villach', 'oberkärnten', 'unterkärnten', 'graz', 'liezen',
       'östliche obersteiermark', 'oststeiermark', 'und südsteiermark',
       'westliche obersteiermark', 'innviertel', 'wels', 'mühlviertel',
       'kirchdorf', 'traunviertel', 'lungau', 'pongau',
       'vsalzburg und umgebung', 'außerfern', 'innsbruck', 'osttirol',
       'tiroler oberland', 'tiroler unterland', 'bregenzer wald',
       'bodenseegebiet', 'burgenland', 'niederösterreich', 'kärnten',
       'steiermark', 'oberösterreich', 'salzburg', 'tirol', 'vorarlberg',
       'mittelburgenland', 'südösterreich', 'westösterreich'],
      dtype=object)

In [101]:
df_inequality[(df_inequality['country_name'] == 'Spain') & (df_inequality['avg_gini'].isna())]

Unnamed: 0,file,region,year,avg_gini,country,country_name,cleaned_region


In [75]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))


In [16]:
df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

In [26]:
joined_data = df_elections.merge(df_inequality, left_on=['country','regionname', 'year'],
                                right_on=['country_name','predicted_region', 'year'], how='left')

In [33]:
df_inequality[df_inequality['country_name'] == 'Italy'][['country_name','region', 'predicted_region','cleaned_region', 'confidence_score']]

Unnamed: 0,country_name,region,predicted_region,cleaned_region,confidence_score
864,Italy,[10]ITI2 - Umbria,,umbria,0.0
865,Italy,[11]ITI3 - Marche,,marche,0.0
866,Italy,[12]ITI4 - Lazio,,lazio,0.0
867,Italy,[13]ITF1 - Abruzzo,,abruzzo,0.0
868,Italy,[14]ITF2 - Molise,,molise,0.0
...,...,...,...,...,...
95,Italy,[5]ITH3 - Veneto,,veneto,0.0
96,Italy,[6]ITH4 - Friuli,,friuli,0.0
97,Italy,[7]ITC3 - Liguria,,liguria,0.0
98,Italy,[8]ITH5 - Emilia Romagna,,emilia romagna,0.0


In [28]:
df_elections[df_elections['country'] == 'Italy']['cleaned_region'].value_counts()

cleaned_region
puglia                                 115
campania                               113
sardegna                               111
veneto                                 110
sicilia                                109
calabria                               108
lombardia                              108
toscana                                107
piemonte                               103
abruzzo                                102
provincia autonoma di bolzano/bozen    101
emilia-romagna                         101
molise                                 100
provincia autonoma di trento           100
liguria                                100
lazio                                  100
friuli-venezia giulia                   99
basilicata                              98
marche                                  96
umbria                                  95
valle d'aosta/vallée d'aoste            52
Name: count, dtype: int64

What if we just do it with minimal cleaning 

In [59]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))


In [175]:
df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

In [60]:
df_inequality['cleaned_region'] = df_inequality['cleaned_region'].str.strip()

In [116]:
simple_join = df_inequality.merge(df_elections, left_on=['country_name','cleaned_region', 'year'], right_on=['country','cleaned_region', 'year'], how='left')

In [95]:
simple_join_t = df_inequality.merge(df_elections, left_on=['country_name', 'year'], right_on=['country', 'year'], how='left')

In [101]:
simple_join_t.columns

Index(['file', 'region', 'year', 'avg_gini', 'country_x', 'country_name',
       'cleaned_region_x', 'country_y', 'country_code', 'nutslevel', 'nuts2',
       'regionname', 'type', 'party_abbreviation', 'party_english',
       'party_native', 'partyfacts_id', 'partyvote', 'electorate', 'totalvote',
       'validvote', 'cleaned_region_y'],
      dtype='object')

In [104]:
simple_join_t['country_y'].value_counts()

country_y
France     42365
Germany    41290
Spain      25203
Austria     8627
Italy       3840
Belgium     1323
Name: count, dtype: int64

In [79]:
simple_join.dropna(subset=['party_native'], inplace=True)

In [120]:
simple_join[simple_join['country_y'].isna()][['country_name','country_x','country_y','cleaned_region','year','party_native','avg_gini']]

Unnamed: 0,country_name,country_x,country_y,cleaned_region,year,party_native,avg_gini
0,Austria,at,,nordburgenland,2000,,0.266
1,Austria,at,,südburgenland,2000,,0.246
2,Austria,at,,eisenwurzen,2000,,0.261
3,Austria,at,,süd,2000,,0.254
4,Austria,at,,sankt pölten,2000,,0.256
...,...,...,...,...,...,...,...
2763,Italy,it,,veneto,1998,,0.322
2764,Italy,it,,friuli,1998,,0.310
2765,Italy,it,,liguria,1998,,0.321
2766,Italy,it,,emilia romagna,1998,,0.315


In [161]:
countries_to_keep = df_inequality['country_name'].unique()

In [162]:
df_elections = df_elections[df_elections['country'].isin(countries_to_keep)]

In [158]:
for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    print(f"{country}: {years}")

Austria: [1990 1994 1995 2002 2006 2008 2013 2017 2019]
Belgium: [1991 1995 1999 2003 2007 2010 2014 2019]
France: [1993 1997 2002 2007 2012 2017]
Germany: [1990 1994 1998 2002 2005 2009 2013 2017]
Italy: [1992 1994 1996 2001 2006 2008 2013 2018]
Spain: [1993 1996 2000 2004 2008 2011 2015 2016 2019]


In [164]:
elec_country = []
elec_years = []

for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    elec_country.append(country)
    elec_years.append(years)

election_df = pd.DataFrame({'country': elec_country, 'years': elec_years})

In [163]:
elec_data = []

for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    for year in years:
        elec_data.append({'country': country, 'year': year})

election_df = pd.DataFrame(elec_data)

In [164]:
election_df['flag'] = 1

Begin section with test code to better mark election years

In [74]:
election_df_italy = election_df[election_df['country'] == 'Italy']

In [84]:
election_df_italy

Unnamed: 0,country,year,flag
31,Italy,1992,1
32,Italy,1994,1
33,Italy,1996,1
34,Italy,2001,1
35,Italy,2006,1
36,Italy,2008,1
37,Italy,2013,1
38,Italy,2018,1


In [75]:
inequality_df_italy = df_inequality[df_inequality['country_name'] == 'Italy']

In [80]:
italian_regions = inequality_df_italy[['country_name','cleaned_region']]

In [82]:
italian_regions.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  italian_regions.drop_duplicates(inplace=True)


In [85]:
italian_join = italian_regions.merge(election_df_italy, left_on=['country_name'], right_on=['country'], how='left')

In [86]:
italian_join

Unnamed: 0,country_name,cleaned_region,country,year,flag
0,Italy,umbria,Italy,1992,1
1,Italy,umbria,Italy,1994,1
2,Italy,umbria,Italy,1996,1
3,Italy,umbria,Italy,2001,1
4,Italy,umbria,Italy,2006,1
...,...,...,...,...,...
155,Italy,toscana,Italy,2001,1
156,Italy,toscana,Italy,2006,1
157,Italy,toscana,Italy,2008,1
158,Italy,toscana,Italy,2013,1


In [91]:
italian_inequality_join = inequality_df_italy.merge(italian_join, left_on=['country_name','cleaned_region','year'], right_on=['country_name','cleaned_region','year'], how='outer')

In [92]:
italian_inequality_join

Unnamed: 0,file,region,year,avg_gini,country_x,country_name,cleaned_region,country_y,flag
0,it00h,[10]ITI2 - Umbria,2000,0.262,it,Italy,umbria,,
1,it00h,[11]ITI3 - Marche,2000,0.316,it,Italy,marche,,
2,it00h,[12]ITI4 - Lazio,2000,0.312,it,Italy,lazio,,
3,it00h,[13]ITF1 - Abruzzo,2000,0.375,it,Italy,abruzzo,,
4,it00h,[14]ITF2 - Molise,2000,0.421,it,Italy,molise,,
...,...,...,...,...,...,...,...,...,...
371,,,1996,,,Italy,toscana,Italy,1.0
372,,,2001,,,Italy,toscana,Italy,1.0
373,,,2006,,,Italy,toscana,Italy,1.0
374,,,2013,,,Italy,toscana,Italy,1.0


success! let's try with every country 

In [165]:
regions = df_inequality[['country_name','cleaned_region']]

In [166]:
regions.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regions.drop_duplicates(inplace=True)


In [167]:
region_join = regions.merge(election_df, left_on=['country_name'], right_on=['country'], how='left')

In [168]:
full_region_join = df_inequality.merge(region_join, left_on=['country_name','cleaned_region','year'], right_on=['country_name','cleaned_region','year'], how='outer')

In [99]:
full_region_join[full_region_join['country_name'] == 'Italy']

Unnamed: 0,file,region,year,avg_gini,country_x,country_name,cleaned_region,country_y,flag
1842,it00h,[10]ITI2 - Umbria,2000,0.262,it,Italy,umbria,,0.0
1843,it00h,[11]ITI3 - Marche,2000,0.316,it,Italy,marche,,0.0
1844,it00h,[12]ITI4 - Lazio,2000,0.312,it,Italy,lazio,,0.0
1845,it00h,[13]ITF1 - Abruzzo,2000,0.375,it,Italy,abruzzo,,0.0
1846,it00h,[14]ITF2 - Molise,2000,0.421,it,Italy,molise,,0.0
...,...,...,...,...,...,...,...,...,...
2777,,,1996,,,Italy,toscana,Italy,1.0
2778,,,2001,,,Italy,toscana,Italy,1.0
2779,,,2006,,,Italy,toscana,Italy,1.0
2780,,,2013,,,Italy,toscana,Italy,1.0


In [169]:
full_region_join['flag'].fillna(0, inplace=True)

In [170]:
df = full_region_join[['country_name', 'cleaned_region', 'year', 'avg_gini', 'flag']]

In [171]:
df['interp_gini'] = df.sort_values(by=['cleaned_region', 'year'])['avg_gini'].transform(lambda x: x.interpolate())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interp_gini'] = df.sort_values(by=['cleaned_region', 'year'])['avg_gini'].transform(lambda x: x.interpolate())


In [111]:
df[df['cleaned_region'].isin(['lombardia','toscana'])].sort_values(by=['cleaned_region','year'])

Unnamed: 0,country_name,cleaned_region,year,avg_gini,flag,interp_gini
1994,Italy,lombardia,1989,0.3,0.0,0.3
2013,Italy,lombardia,1991,0.276,0.0,0.276
2733,Italy,lombardia,1992,,1.0,0.3015
2032,Italy,lombardia,1993,0.327,0.0,0.327
2734,Italy,lombardia,1994,,1.0,0.33
2051,Italy,lombardia,1995,0.333,0.0,0.333
2735,Italy,lombardia,1996,,1.0,0.34
2071,Italy,lombardia,1998,0.347,0.0,0.347
1855,Italy,lombardia,2000,0.341,0.0,0.341
2736,Italy,lombardia,2001,,1.0,0.3335


End section with test code 

fix france

In [191]:
french_df = df[df['country_name'] == 'France']

In [192]:
french_elections = df_elections[df_elections['country'] == 'France']

In [20]:
def find_best_match(region, choices):
    match, score, index = process.extractOne(region, choices, scorer=fuzz.ratio)
    return match, score

In [203]:
region_mapping = {}

In [204]:
for region1 in french_df['cleaned_region']:
    match, score = find_best_match(region1, french_elections['cleaned_region'])
    
    if score >= 70:
        region_mapping[region1] = match


In [18]:
def map_regions(df,df2):

    region_mapping = {}

    for region1 in df['cleaned_region']:
        match, score = find_best_match(region1, df2['cleaned_region'])

        if score >= 70:
            region_mapping[region1] = match

    df['predicted_region'] = df['cleaned_region'].map(region_mapping)

    return df
 

In [209]:
df = map_regions(french_df,french_elections)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted_region'] = df['cleaned_region'].map(region_mapping)


In [211]:
df['cleaned_region'].unique()

array(['alsace', 'aquitaine', 'auvergne', 'normandie', 'bourgogne',
       'bretagne', 'centre', 'ardenne', 'comté', 'france', 'roussillon',
       'limousin', 'lorraine', 'pyrénées', 'pas de calais',
       'pays de la loire', 'picardie', 'charentes', 'corse', 'alpes',
       "d'azur"], dtype=object)

In [212]:
df['predicted_region'].unique()

array(['alsace', 'aquitaine', 'auvergne', 'normandie', 'bourgogne',
       'bretagne', nan, 'ardenne', 'comté', 'france', 'roussillon',
       'limousin', 'lorraine', 'pyrénées', 'pas de calais',
       'pays de la loire', 'picardie', 'charentes', 'corse', 'alpes',
       'côte d’azur'], dtype=object)

In [213]:
french_elections['cleaned_region'].unique()

array(['france', 'val de loire', 'bourgogne', 'comté', 'normandie',
       'pas de calais', 'picardie', 'alsace', 'ardenne', 'lorraine',
       'pays de la loire', 'bretagne', 'aquitaine', 'limousin',
       'charentes', 'roussillon', 'pyrénées', 'auvergne', 'alpes',
       'côte d’azur', 'corse', 'guadeloupe', 'martinique', 'guyane',
       'la réunion', 'mayotte'], dtype=object)

In [205]:
print(region_mapping)

{'alsace': 'alsace', 'aquitaine': 'aquitaine', 'auvergne': 'auvergne', 'normandie': 'normandie', 'bourgogne': 'bourgogne', 'bretagne': 'bretagne', 'ardenne': 'ardenne', 'comté': 'comté', 'france': 'france', 'roussillon': 'roussillon', 'limousin': 'limousin', 'lorraine': 'lorraine', 'pyrénées': 'pyrénées', 'pas de calais': 'pas de calais', 'pays de la loire': 'pays de la loire', 'picardie': 'picardie', 'charentes': 'charentes', 'corse': 'corse', 'alpes': 'alpes', "d'azur": 'côte d’azur'}


In [197]:
df['mapped_region'] = df['cleaned_region'].map(region_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mapped_region'] = df['cleaned_region'].map(region_mapping)


end france

In [22]:
inequality_with_elections = df_inequality.merge(election_df, left_on=['country_name', 'year'], right_on=['country', 'year'], how='outer')

In [23]:
inequality_with_elections['flag'].fillna(0, inplace=True)

In [24]:
df = inequality_with_elections[['country_name', 'cleaned_region', 'year', 'avg_gini', 'flag','country_y']]

In [25]:
df['country_name'].fillna(df['country_y'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country_name'].fillna(df['country_y'], inplace=True)


In [50]:
election_df[election_df['country'] == 'Italy']

Unnamed: 0,country,year,flag
31,Italy,1992,1
32,Italy,1994,1
33,Italy,1996,1
34,Italy,2001,1
35,Italy,2006,1
36,Italy,2008,1
37,Italy,2013,1
38,Italy,2018,1


In [58]:
df[df['cleaned_region'] == 'lombardia'][['country_name','cleaned_region', 'year', 'avg_gini', 'flag']]

Unnamed: 0,country_name,cleaned_region,year,avg_gini,flag
1896,Italy,lombardia,2000,0.341,0.0
1917,Italy,lombardia,2004,0.326,0.0
1940,Italy,lombardia,2008,0.336,1.0
1961,Italy,lombardia,2010,0.33,0.0
1984,Italy,lombardia,2014,0.34,0.0
2005,Italy,lombardia,2016,0.347,0.0
2028,Italy,lombardia,2020,0.477,0.0
2048,Italy,lombardia,1995,0.333,0.0
2070,Italy,lombardia,1998,0.347,0.0
2094,Italy,lombardia,1989,0.3,0.0


In [26]:
df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)


In [109]:
df['gini_since'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.expanding().mean().shift(fill_value=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gini_since'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.expanding().mean().shift(fill_value=0))


In [110]:
df['change_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.diff().cumsum().shift(fill_value=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['change_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.diff().cumsum().shift(fill_value=0))


In [None]:
df.groupby(['group_ids', df['event_today_in_group'].shift().cumsum()])['days_since_last_event'].cumsum()

In [27]:
df['change_in_avg_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].diff()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['change_in_avg_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].diff()


In [29]:
df['avg_gini_between_flags'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_gini_between_flags'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())


In [31]:
df[df['cleaned_region'] == 'lombardia'][['country_name', 'cleaned_region', 'year', 'avg_gini', 'flag','change_in_avg_gini','avg_gini_between_flags']].sort_values('year')

Unnamed: 0,country_name,cleaned_region,year,avg_gini,flag,change_in_avg_gini,avg_gini_between_flags
1994,Italy,lombardia,1989,0.3,0.0,,0.3
2013,Italy,lombardia,1991,0.276,0.0,-0.024,0.288
2032,Italy,lombardia,1993,0.327,0.0,0.051,0.301
2051,Italy,lombardia,1995,0.333,0.0,0.006,0.309
2071,Italy,lombardia,1998,0.347,0.0,0.014,0.3166
1855,Italy,lombardia,2000,0.341,0.0,-0.006,0.320667
1875,Italy,lombardia,2004,0.326,0.0,-0.015,0.321429
1895,Italy,lombardia,2008,0.336,1.0,,0.336
1915,Italy,lombardia,2010,0.33,0.0,0.004,0.3225
1935,Italy,lombardia,2014,0.34,0.0,0.01,0.324444


In [115]:
df[(df['country_name'] == 'Spain') & (df['cleaned_region'] == 'andalucía')]

Unnamed: 0,country_name,cleaned_region,year,avg_gini,flag,country_y,gini_since,change_gini
986,Spain,andalucía,2004,0.349,1.0,Spain,0.0,0.0
1004,Spain,andalucía,2005,0.348,0.0,,0.0,0.0
1023,Spain,andalucía,2006,0.345,0.0,,0.348,
1042,Spain,andalucía,2007,0.353,0.0,,0.3465,-0.003
1061,Spain,andalucía,2008,0.345,1.0,Spain,0.349,
1080,Spain,andalucía,2009,0.348,0.0,,0.348667,0.005
1099,Spain,andalucía,2010,0.357,0.0,,0.3485,0.0
1118,Spain,andalucía,2011,0.354,1.0,Spain,0.347,-0.004
1137,Spain,andalucía,2012,0.347,0.0,,0.3502,0.009
1156,Spain,andalucía,2013,0.358,0.0,,0.349667,-0.001


In [113]:
df[df['country_name'] == 'Spain']['cleaned_region'].value_counts()

cleaned_region
canarias                             17
cantabria                            17
la rioja                             17
extremadura                          17
galicia                              17
comunidad valenciana                 17
illes balears                        17
país vasco                           16
cataluña                             16
comunidad de madrid                  16
andalucía                            16
ciudad autónoma de ceuta             16
castilla-la mancha                   16
castilla y león                      16
principado de asturias               16
aragón                               16
región de murcia                     16
comunidad foral de navarra           16
ciudad autónoma de melilla           15
madrid                                8
east                                  8
south                                 8
north-west                            8
central                               8
canary islands           

In [38]:
election_years_df = df[df['flag'] == 1]

In [39]:
election_years_df.reset_index(drop=True, inplace=True)

In [369]:
joined_data = election_years_df.merge(df_elections, left_on=['country_name', 'cleaned_region', 'year'], right_on=['country', 'cleaned_region', 'year'], how='left')

In [370]:
joined_data.dropna(subset=['party_native'], inplace=True)   

In [371]:
joined_data['country_name'].value_counts()

country_name
France     537
Germany    251
Italy      125
Austria     36
Name: count, dtype: int64

In [372]:
election_years_df.groupby('country_name')['year'].unique()

country_name
Austria    [2006, 2008, 2013, 2017, 2019, 1994, 1995, 199...
Belgium     [1995, 2003, 2007, 2010, 2014, 1991, 1999, 2019]
France                  [1997, 2002, 2007, 2012, 2017, 1993]
Germany     [1994, 1998, 2002, 2005, 2009, 2013, 2017, 1990]
Italy       [2008, 1992, 1994, 1996, 2001, 2006, 2013, 2018]
Spain      [2004, 2008, 2011, 2015, 2016, 2019, 1993, 199...
Name: year, dtype: object

In [373]:
df_elections.groupby('country')['year'].unique()

country
Austria    [1990, 1994, 1995, 2002, 2006, 2008, 2013, 201...
Belgium     [1991, 1995, 1999, 2003, 2007, 2010, 2014, 2019]
France                  [1993, 1997, 2002, 2007, 2012, 2017]
Germany     [1990, 1994, 1998, 2002, 2005, 2009, 2013, 2017]
Italy       [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
Spain      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
Name: year, dtype: object

In [256]:
def compare_dfs(country):
    country_elections = df_elections[df_elections['country'] == country]
    country_eydf = election_years_df[election_years_df['country_name'] == country]
    print(country_elections.groupby('cleaned_region')['year'].unique(), country_eydf.groupby('cleaned_region')['year'].unique())

In [27]:
def return_country_dfs(country):
    country_elections = df_elections[df_elections['country'] == country]
    country_eydf = election_years_df[election_years_df['country_name'] == country]
    return country_elections, country_eydf

In [40]:
italy1, italy2 = return_country_dfs('Spain')

In [318]:
italy1 = italy1[['country', 'cleaned_region', 'year', 'party_native']]

In [41]:
italy1.groupby('cleaned_region')['year'].unique()

cleaned_region
andalucía                     [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
aragón                        [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
canarias                      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
cantabria                     [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
castilla y león               [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
castilla-la mancha            [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
cataluña                      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
ciudad autónoma de ceuta      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
ciudad autónoma de melilla    [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
comunidad de madrid           [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
comunidad foral de navarra    [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
comunidad valenciana          [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
extremadura              

In [42]:
italy2.groupby('cleaned_region')['year'].unique()

cleaned_region
[11]galicia                       [2004, 2008, 2011, 2015, 2016, 2019]
[12]principado de asturias        [2004, 2008, 2011, 2015, 2016, 2019]
[13]cantabria                     [2004, 2008, 2011, 2015, 2016, 2019]
[21]país vasco                    [2004, 2008, 2011, 2015, 2016, 2019]
[22]comunidad foral de navarra    [2004, 2008, 2011, 2015, 2016, 2019]
[23]la rioja                      [2004, 2008, 2011, 2015, 2016, 2019]
[24]aragón                        [2004, 2008, 2011, 2015, 2016, 2019]
[30]comunidad de madrid           [2004, 2008, 2011, 2015, 2016, 2019]
[3]madrid                                           [1993, 1996, 2000]
[41]castilla y león               [2004, 2008, 2011, 2015, 2016, 2019]
[43]extremadura                   [2004, 2008, 2011, 2015, 2016, 2019]
[4]central                                          [1993, 1996, 2000]
[51]cataluña                      [2004, 2008, 2011, 2015, 2016, 2019]
[52]comunidad valenciana          [2004, 2008, 2011, 2015, 201

In [322]:
join_italy = italy1.merge(italy2, left_on=['cleaned_region', 'year'], right_on=['cleaned_region', 'year'], how='left')

In [323]:
join_italy[join_italy['year'] == 2008]

Unnamed: 0,country,cleaned_region,year,party_native,country_name,avg_gini,flag,gini_since,change_gini
73,Italy,piemonte,2008,Lista di Pietro -- Italia del Valori,Italy,0.276,1.0,0.0,0.0
74,Italy,piemonte,2008,Il Popolo della Liberta,Italy,0.276,1.0,0.0,0.0
75,Italy,piemonte,2008,Fiamma Tricolore,Italy,0.276,1.0,0.0,0.0
76,Italy,piemonte,2008,Sinistra Italiana,Italy,0.276,1.0,0.0,0.0
77,Italy,piemonte,2008,Lega Nord,Italy,0.276,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2099,Italy,lazio,2008,Movimento per l'Autonomia,Italy,0.343,1.0,0.0,0.0
2100,Italy,lazio,2008,,Italy,0.343,1.0,0.0,0.0
2101,Italy,lazio,2008,Partito Democratico,Italy,0.343,1.0,0.0,0.0
2102,Italy,lazio,2008,Partito Socialista Italiano,Italy,0.343,1.0,0.0,0.0


In [319]:
italy2['cleaned_region'] = italy2['cleaned_region'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  italy2['cleaned_region'] = italy2['cleaned_region'].str.strip()


In [320]:
matches = italy1[italy1['cleaned_region'].isin(italy2['cleaned_region'])]


## Need to figure out the multiple var import, not working atm

In [6]:
from unidecode import unidecode

In [7]:
def clean_region(region):
    cleaned_region = re.sub(r'^\[\d+\](.+)', r'\1', region)
    cleaned_region = re.sub(r'^.*-', '', cleaned_region)
    
    return cleaned_region.strip()

In [8]:
def clean_region(region):
    cleaned_region = re.sub(r'^\[\d+\](.+)', r'\1', region)
    cleaned_region = re.sub(r'^.*-', '', cleaned_region)
    cleaned_region = unidecode(cleaned_region)
    
    return cleaned_region.strip()

In [9]:
def get_country_name(abbreviation):
    try:
        country = pycountry.countries.get(alpha_2=abbreviation)
        return country.name
    except AttributeError:
        return None

In [10]:
electoral_folder_relative = os.path.join('..', '..', 'data', 'raw','national_election','eu_ned_national_nuts2(1).csv')
electoral_data_path = os.path.abspath(electoral_folder_relative)

ineq_folder_relative = os.path.join('..', '..', 'data', 'raw','lissy','multination_gini.txt')
inequality_data_path = os.path.abspath(ineq_folder_relative)

results_folder_relative = os.path.join('..', '..', 'data', 'cleaned','national','joined_electoral_lissy.csv')
results_data_path = os.path.abspath(results_folder_relative)

ineq_im_ed_folder_relative = os.path.join('..', '..', 'data', 'raw', 'lissy', 'multination_im_ed.txt')
ineq_im_ed_data_path = os.path.abspath(ineq_im_ed_folder_relative)

df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')

df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["file","region", "year", "avg_gini"])

df_inequality_im_ed = pd.read_fwf(ineq_im_ed_data_path, skiprows=3, header=None,
                                    names=["file", "region", "year", "im_ratio", "ed_ratio"])

In [11]:
df_inequality['country'] = df_inequality['file'].str[:2]
df_inequality['country_name'] = df_inequality['country'].apply(get_country_name)

df_inequality_im_ed['country'] = df_inequality_im_ed['file'].str[:2]
df_inequality_im_ed['country_name'] = df_inequality_im_ed['country'].apply(get_country_name)

df_inequality.dropna(subset=['region'], inplace=True)
df_elections.dropna(subset=['regionname'], inplace=True)
df_inequality_im_ed.dropna(subset=['region'], inplace=True)

In [12]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(clean_region)
df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')
df_inequality_im_ed['cleaned_region'] = df_inequality_im_ed['region'].str.lower().apply(clean_region)

In [44]:
#spanish regions
df_inequality_im_ed[df_inequality_im_ed['country_name'] == 'Spain']['cleaned_region'].unique()

array(['east', 'west', 'madrid', 'central', 'south', 'canary islands',
       'galicia', 'principado de asturias', 'cantabria', 'pais vasco',
       'comunidad foral de navarra', 'la rioja', 'aragon',
       'comunidad de madrid', 'castilla y leon', 'la mancha',
       'extremadura', 'cataluna', 'comunidad valenciana', 'illes balears',
       'andalucia', 'region de murcia', 'ciudad autonoma de ceuta',
       'canarias', 'ciudad autonoma de melilla', 'principado de asturia',
       'communidad foral de navarra', 'communidad de madrid',
       'castilla la mancha', 'ciudad autonoma de ceuta y melill'],
      dtype=object)

In [42]:
#im_ed in country spain and region cantabria
df_inequality_im_ed[(df_inequality_im_ed['country_name'] == 'Italy') & (df_inequality_im_ed['cleaned_region'] == 'campania')].sort_values('year')

Unnamed: 0,file,region,year,im_ratio,ed_ratio,country,country_name,cleaned_region
10,it89h,[15]ITF3 - Campania,1989,0.741,0.459,it,Italy,campania
29,it91h,[15]ITF3 - Campania,1991,0.971,0.569,it,Italy,campania
48,it93h,[15]ITF3 - Campania,1993,1.16,0.416,it,Italy,campania
67,it95h,[15]ITF3 - Campania,1995,0.92,0.479,it,Italy,campania
86,it98h,[15]ITF3 - Campania,1998,0.833,0.517,it,Italy,campania
870,it00h,[15]ITF3 - Campania,2000,0.968,0.432,it,Italy,campania
890,it04h,[15]ITF3 - Campania,2004,0.807,0.549,it,Italy,campania
910,it08h,[15]ITF3 - Campania,2008,1.45,0.497,it,Italy,campania
930,it10h,[15]ITF3 - Campania,2010,,,it,Italy,campania
950,it14h,[15]ITF3 - Campania,2014,1.37,0.43,it,Italy,campania


In [232]:

df_inequality.groupby(['country_name', 'cleaned_region', 'year']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,country_name,cleaned_region,year,count
0,Austria,außerfern,1994,1
1,Austria,außerfern,1995,1
2,Austria,außerfern,1996,1
3,Austria,außerfern,1997,1
4,Austria,außerfern,1998,1
...,...,...,...,...
2042,Spain,west,1996,1
2043,Spain,west,1997,1
2044,Spain,west,1998,1
2045,Spain,west,1999,1


In [234]:
df_inequality_im_ed.groupby(['country_name', 'cleaned_region', 'year']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,country_name,cleaned_region,year,count
0,Austria,außerfern,1994,1
1,Austria,außerfern,1995,1
2,Austria,außerfern,1996,1
3,Austria,außerfern,1997,1
4,Austria,außerfern,1998,1
...,...,...,...,...
2042,Spain,west,1996,1
2043,Spain,west,1997,1
2044,Spain,west,1998,1
2045,Spain,west,1999,1


In [43]:
#im_ed in spain
df_inequality_im_ed[df_inequality_im_ed['country_name'] == 'Spain']

Unnamed: 0,file,region,year,im_ratio,ed_ratio,country,country_name,cleaned_region
976,es00h,[1]North-East,2000,,,es,Spain,east
977,es00h,[2]North-West,2000,1.380,0.720,es,Spain,west
978,es00h,[3]Madrid,2000,,,es,Spain,madrid
979,es00h,[4]Central,2000,,,es,Spain,central
980,es00h,[5]East,2000,,,es,Spain,east
...,...,...,...,...,...,...,...,...
356,es99h,[3]Madrid,1999,,,es,Spain,madrid
357,es99h,[4]Central,1999,,,es,Spain,central
358,es99h,[5]East,1999,,,es,Spain,east
359,es99h,[6]South,1999,,,es,Spain,south


In [235]:
df_inequality1 = pd.merge(df_inequality, df_inequality_im_ed, on=["country_name","file", 
                                                                     "cleaned_region", "year"], how="left")

In [236]:
# check unique regions
df_inequality1.groupby(['country_name', 'cleaned_region', 'year']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,country_name,cleaned_region,year,count
0,Austria,außerfern,1994,1
1,Austria,außerfern,1995,1
2,Austria,außerfern,1996,1
3,Austria,außerfern,1997,1
4,Austria,außerfern,1998,1
...,...,...,...,...
2042,Spain,west,1996,1
2043,Spain,west,1997,1
2044,Spain,west,1998,1
2045,Spain,west,1999,1


Find missing inequality data in Spanish and French regions 

In [13]:
df_inequality = pd.merge(df_inequality, df_inequality_im_ed, on=["country_name","file", 
                                                                     "cleaned_region", "year"], how="left")
    
df_inequality = df_inequality[df_inequality['country_name'] != 'Austria']

In [15]:
def return_election_data(df_elections):
    elec_data = []

    for country in df_elections['country'].unique():
        country_df = df_elections[df_elections['country'] == country]
        years = country_df['year'].unique()
        for year in years:
            elec_data.append({'country': country, 'year': year})

    election_df = pd.DataFrame(elec_data)
    election_df['flag'] = 1

    return election_df

In [16]:
election_df = return_election_data(df_elections)

In [22]:
regions = df_inequality[['country_name', 'cleaned_region']]
regions.drop_duplicates(inplace=True)
region_join = regions.merge(election_df, left_on=['country_name'], right_on=['country'], how='left')
inequality_with_elections = df_inequality.merge(region_join, left_on=['country_name', 'cleaned_region', 'year'],
                                                    right_on=['country_name', 'cleaned_region', 'year'], how='outer')

inequality_with_elections['flag'].fillna(0, inplace=True)

df = inequality_with_elections[['country_name', 'cleaned_region', 'year', 'avg_gini', 'im_ratio', 'ed_ratio',
                                    'flag', 'country_y']]
    
df['interp_gini'] = df.sort_values(by=['cleaned_region', 'year'])['avg_gini'].transform(lambda x: x.interpolate())
df['interp_im'] = df.sort_values(by=['cleaned_region', 'year'])['im_ratio'].transform(lambda x: x.interpolate())
df['interp_ed'] = df.sort_values(by=['cleaned_region', 'year'])['ed_ratio'].transform(lambda x: x.interpolate())
    
df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)
df['delta_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_gini'].diff()
df['avg_gini_period'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_gini'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())

df['delta_im'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_im'].diff()
df['avg_im_period'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_im'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())

df['delta_ed'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_ed'].diff()
df['avg_ed_period'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_ed'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())

election_years_df = df[df['flag'] == 1].copy()
election_years_df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regions.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interp_gini'] = df.sort_values(by=['cleaned_region', 'year'])['avg_gini'].transform(lambda x: x.interpolate())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interp_im'] = df.sort_values(by=['cleaned_region', 'year'])['im_ratio'].transform(lambda x: x.interpolate

In [25]:
#find numbers for basilicata
election_years_df[(election_years_df['country_name'] == 'Italy') & (election_years_df['cleaned_region'] == 'basilicata')]

Unnamed: 0,country_name,cleaned_region,year,avg_gini,im_ratio,ed_ratio,flag,country_y,interp_gini,interp_im,interp_ed,delta_gini,avg_gini_period,delta_im,avg_im_period,delta_ed,avg_ed_period
301,Italy,basilicata,1992,,,,1.0,,0.2965,0.9468,0.3375,,0.2965,,0.9468,,0.3375
302,Italy,basilicata,1994,,,,1.0,,0.3325,0.5236,0.356,0.036,0.3145,-0.4232,0.7352,0.0185,0.34675
303,Italy,basilicata,1996,,,,1.0,,0.3415,0.362,0.5105,0.009,0.3235,-0.1616,0.6108,0.1545,0.401333
304,Italy,basilicata,2001,,,,1.0,,0.3465,1.29,0.459,0.005,0.32925,0.928,0.7806,-0.0515,0.41575
305,Italy,basilicata,2006,,,,1.0,,0.3475,1.24,0.424,0.001,0.3329,-0.05,0.87248,-0.035,0.4174
306,Italy,basilicata,2008,0.335,1.17,0.458,1.0,it,0.335,1.17,0.458,-0.0125,0.33325,-0.07,0.922067,0.034,0.424167
307,Italy,basilicata,2013,,,,1.0,,0.326,0.881,0.6135,-0.009,0.332214,-0.289,0.9162,0.1555,0.451214
308,Italy,basilicata,2018,,,,1.0,,0.484,1.82,0.4395,0.158,0.351187,0.939,1.029175,-0.174,0.44975


work through the big join in multi_join

In [1]:
import pandas as pd
import numpy as np
import re
import os 
import argparse

from fuzzywuzzy import fuzz,process
from tqdm import tqdm
import pycountry
import warnings 
from unidecode import unidecode

In [8]:
electoral_folder_relative = os.path.join('..', '..', 'data', 'raw','national_election','eu_ned_national_nuts2(1).csv')
electoral_data_path = os.path.abspath(electoral_folder_relative)

ineq_folder_relative = os.path.join('..', '..', 'data', 'raw','lissy','multination_gini.txt')
inequality_data_path = os.path.abspath(ineq_folder_relative)

results_folder_relative = os.path.join('..', '..', 'data', 'cleaned','national','joined_electoral_lissy.csv')
results_data_path = os.path.abspath(results_folder_relative)

ineq_im_ed_folder_relative = os.path.join('..', '..', 'data', 'raw', 'lissy', 'multination_im_ed.txt')
ineq_im_ed_data_path = os.path.abspath(ineq_im_ed_folder_relative)

unemployment_data_path = os.path.abspath(os.path.join('..', '..', 'data', 'raw', 'lissy', 'unemployment.txt'))
df_unemployment = pd.read_fwf(unemployment_data_path, skiprows=3, header=None,
                              names=["file", "region", "year", "unemployment_rate"])

occupation_data_path = os.path.abspath(os.path.join('..', '..', 'data', 'raw', 'lissy', 'occupation.txt'))
df_occupation = pd.read_fwf(occupation_data_path, skiprows=3, header=None,
                            names=["file", "region", "year", "occupation_rate"])

df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')

df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["file","region", "year", "avg_gini"])

df_inequality_im_ed = pd.read_fwf(ineq_im_ed_data_path, skiprows=3, header=None,
                                    names=["file", "region", "year", "im_ratio", "ed_ratio"])

df_unemployment = pd.read_fwf(unemployment_data_path, skiprows=3, header=None,
                                    names=["file", "region", "year",'unemployment','immig'])
df_occupation = pd.read_fwf(occupation_data_path, skiprows=3, header=None,
                                    names=["file", "region", "year",'wage_ratio'])


In [45]:
def clean_inequality_region(df):
    df.reset_index(drop=True, inplace=True)
    
    df.loc[df['country_name'] == 'Spain', 'cleaned_region'] = df['region'].str.lower().apply(lambda x: re.sub(r'\[\d+\](.+)', r'\1', x))
    df.loc[df['country_name'] != 'Spain', 'cleaned_region'] = df['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
    df['cleaned_region'] = df['cleaned_region'].str.strip()

    return df

def clean_region(region):
    cleaned_region = re.sub(r'^\[\d+\](.+)', r'\1', region)
    cleaned_region = re.sub(r'^.*-', '', cleaned_region)
    cleaned_region = unidecode(cleaned_region)
    
    return cleaned_region.strip()

def get_country_name(abbreviation):
    try:
        country = pycountry.countries.get(alpha_2=abbreviation)
        return country.name
    except AttributeError:
        return None
    

def clean_im_ed_region(df):
    df.reset_index(drop=True, inplace=True)

    df['cleaned_region'] = df['region'].str.lower().apply(lambda x: re.sub(r'\[\d+\](.+)', r'\1', x))
    df['cleaned_region'] = df['cleaned_region'].str.strip()

    return df


def map_regions(df, df2):
    region_mapping = {}
    
    for region1 in tqdm(df['cleaned_region'], desc="Mapping regions"):
        match, score = find_best_match(region1, df2['cleaned_region'])

        if score >= 70:
            region_mapping[region1] = match
        else:
            region_mapping[region1] = region1  

    df['predicted_region'] = df['cleaned_region'].map(region_mapping)

    return df

def find_best_match(region, choices):
    match, score, index = process.extractOne(region, choices, scorer=fuzz.ratio)
    return match, score


In [61]:
manual_region_mapping = {
    # France
    'france': 'ile-de-france',
    'alpes': "provence-alpes-côte d’azur",
    "d'azur": "provence-alpes-côte d’azur",
    # Spain
    'madrid': 'comunidad de madrid',
    'la mancha': 'castilla-la mancha',
    # Italy
    'friuli': 'friuli-venezia giulia',
    "valle d'aosta": "valle d'aosta/vallée d'aoste",

}

In [30]:
def clean_and_prepare(df_inequality, df_elections, df_inequality_im_ed, df_unemployment, df_occupation):
    df_inequality['country'] = df_inequality['file'].str[:2]
    df_inequality['country_name'] = df_inequality['country'].apply(get_country_name)

    df_inequality_im_ed['country'] = df_inequality_im_ed['file'].str[:2]
    df_inequality_im_ed['country_name'] = df_inequality_im_ed['country'].apply(get_country_name)

    df_unemployment['country'] = df_unemployment['file'].str[:2]
    df_unemployment['country_name'] = df_unemployment['country'].apply(get_country_name)

    df_occupation['country'] = df_occupation['file'].str[:2]
    df_occupation['country_name'] = df_occupation['country'].apply(get_country_name)

    df_inequality.dropna(subset=['region'], inplace=True)
    df_elections.dropna(subset=['regionname'], inplace=True)
    df_inequality_im_ed.dropna(subset=['region'], inplace=True)
    df_unemployment.dropna(subset=['region'], inplace=True)
    df_occupation.dropna(subset=['region'], inplace=True)

    df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(clean_region)
    df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')
    df_inequality_im_ed['cleaned_region'] = df_inequality_im_ed['region'].str.lower().apply(clean_region)
    df_unemployment['cleaned_region'] = df_unemployment['region'].str.lower().apply(clean_region)
    df_occupation['cleaned_region'] = df_occupation['region'].str.lower().apply(clean_region)

    df_inequality = df_inequality.drop(['country','region'], axis=1)
    df_inequality_im_ed = df_inequality_im_ed.drop(['country','region'], axis=1)
    df_unemployment = df_unemployment.drop(['country','region'], axis=1)
    df_occupation = df_occupation.drop(['country','region'], axis=1)

    df_inequality = pd.merge(df_inequality, df_inequality_im_ed, on=["country_name","file", 
                                                                     "cleaned_region", "year"], how="left")
    df_inequality = pd.merge(df_inequality, df_unemployment, on=["country_name","file",
                                                                        "cleaned_region", "year"], how="left")
    df_inequality = pd.merge(df_inequality, df_occupation, on=["country_name","file",
                                                                        "cleaned_region", "year"], how="left")
    
    df_inequality = df_inequality[df_inequality['country_name'] != 'Austria']
    df_inequality = df_inequality[df_inequality['country_name'] != 'Belgium']

    return df_inequality, df_elections



In [32]:
df_inequality,df_elections = clean_and_prepare(df_inequality, df_elections, df_inequality_im_ed, df_unemployment, df_occupation)

In [33]:
def return_election_data(df_elections):
    elec_data = []

    for country in df_elections['country'].unique():
        country_df = df_elections[df_elections['country'] == country]
        years = country_df['year'].unique()
        for year in years:
            elec_data.append({'country': country, 'year': year})

    election_df = pd.DataFrame(elec_data)
    election_df['flag'] = 1

    return election_df


In [36]:
election_df = return_election_data(df_elections)

In [62]:
def make_join_df(df_inequality, election_df):

    regions = df_inequality[['country_name', 'cleaned_region']]
    regions.drop_duplicates(inplace=True)
    region_join = regions.merge(election_df, left_on=['country_name'], right_on=['country'], how='left')
    inequality_with_elections = df_inequality.merge(region_join, left_on=['country_name', 'cleaned_region', 'year'],
                                                    right_on=['country_name', 'cleaned_region', 'year'], how='outer')

    inequality_with_elections['flag'].fillna(0, inplace=True)

    df = inequality_with_elections[['country_name', 'cleaned_region', 'year', 'avg_gini', 'im_ratio', 'ed_ratio',
                                    'flag']]
    
    df['interp_gini'] = df.sort_values(by=['cleaned_region', 'year'])['avg_gini'].transform(lambda x: x.interpolate())
    df['interp_im'] = df.sort_values(by=['cleaned_region', 'year'])['im_ratio'].transform(lambda x: x.interpolate())
    df['interp_ed'] = df.sort_values(by=['cleaned_region', 'year'])['ed_ratio'].transform(lambda x: x.interpolate())
    
    df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)
    df['delta_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_gini'].diff()
    df['avg_gini_period'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_gini'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())

    df['delta_im'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_im'].diff()
    df['avg_im_period'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_im'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())

    df['delta_ed'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_ed'].diff()
    df['avg_ed_period'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['interp_ed'].transform(lambda x: x.rolling(len(x), min_periods=1).mean())

    election_years_df = df[df['flag'] == 1].copy()
    election_years_df.reset_index(drop=True, inplace=True)

    election_years_df = map_regions(election_years_df, df_elections)
    election_years_df['predicted_region'] = election_years_df['predicted_region'].replace(manual_region_mapping)

    joined_data = election_years_df.merge(df_elections, left_on=['country_name', 'predicted_region', 'year'], right_on=['country', 'cleaned_region', 'year'], how='left')

    return election_years_df, df_elections


In [63]:
join1,join2 = make_join_df(df_inequality, election_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regions.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interp_gini'] = df.sort_values(by=['cleaned_region', 'year'])['avg_gini'].transform(lambda x: x.interpolate())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interp_im'] = df.sort_values(by=['cleaned_region', 'year'])['im_ratio'].transform(lambda x: x.interpolate

In [68]:
join1[join1['country_name'] == 'France']['predicted_region'].unique()

array(['provence-alpes-côte d’azur', 'alsace', 'aquitaine', 'ardenne',
       'auvergne', 'bourgogne', 'bretagne', 'centru', 'poitou-charentes',
       'comte', 'corse', 'ile-de-france', 'limousin', 'lorraine',
       'basse-normandie ', 'nord-pas de calais', 'pays de la loire',
       'picardie', 'pyrenees', 'roussillon'], dtype=object)

In [67]:
join2[join2['country'] == 'France']['cleaned_region'].unique()

array(['ile-de-france', 'centre - val de loire', 'bourgogne',
       'franche-comté', 'basse-normandie ', 'haute-normandie ',
       'nord-pas de calais', 'picardie', 'alsace', 'champagne-ardenne',
       'lorraine', 'pays de la loire', 'bretagne', 'aquitaine',
       'limousin', 'poitou-charentes', 'languedoc-roussillon',
       'midi-pyrénées', 'auvergne', 'rhône-alpes',
       'provence-alpes-côte d’azur', 'corse', 'guadeloupe', 'martinique ',
       'guyane', 'la réunion ', 'mayotte'], dtype=object)

In [69]:
missing_regions[missing_regions['country_name'] == 'France']['cleaned_region'].unique()

array(['ardenne', 'comte', 'pyrenees', 'roussillon'], dtype=object)

In [64]:
missing_regions = join1[~join1['predicted_region'].isin(join2['cleaned_region'])]

In [66]:
missing_regions

Unnamed: 0,country_name,cleaned_region,year,avg_gini,im_ratio,ed_ratio,flag,interp_gini,interp_im,interp_ed,delta_gini,avg_gini_period,delta_im,avg_im_period,delta_ed,avg_ed_period,predicted_region
18,France,ardenne,1993,,,,1.0,0.2935,1.350,0.6715,,0.293500,,1.350000,,0.671500,ardenne
19,France,ardenne,1997,0.302,1.15,0.612,1.0,0.3020,1.150,0.6120,0.0085,0.297750,-0.200,1.250000,-0.0595,0.641750,ardenne
20,France,ardenne,2002,0.328,1.19,0.584,1.0,0.3280,1.190,0.5840,0.0260,0.307833,0.040,1.230000,-0.0280,0.622500,ardenne
21,France,ardenne,2007,0.345,1.33,0.620,1.0,0.3450,1.330,0.6200,0.0170,0.317125,0.140,1.255000,0.0360,0.621875,ardenne
22,France,ardenne,2012,0.308,1.38,0.578,1.0,0.3080,1.380,0.5780,-0.0370,0.315300,0.050,1.280000,-0.0420,0.613100,ardenne
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,Italy,trentino,2001,,,,1.0,0.3045,1.280,0.5910,-0.0470,0.322125,0.300,1.023375,0.1500,0.547875,trentino
461,Italy,trentino,2006,,,,1.0,0.3130,1.580,0.6275,0.0085,0.320300,0.300,1.134700,0.0365,0.563800,trentino
462,Italy,trentino,2008,0.306,1.84,0.760,1.0,0.3060,1.840,0.7600,-0.0070,0.317917,0.260,1.252250,0.1325,0.596500,trentino
463,Italy,trentino,2013,,,,1.0,0.2770,1.525,0.6325,-0.0290,0.312071,-0.315,1.291214,-0.1275,0.601643,trentino
