In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import pycountry

In [4]:
import re

In [43]:
from fuzzywuzzy import fuzz



In [10]:
import os 

In [5]:
def clean_and_join_data(electoral_data_path, inequality_data_path, similarity_threshold=0.7):
    # Step 1: Import electoral data
    df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')

    # Step 2: Import inequality data
    df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["region", "year", "avg_gini"])

    # Step 3: Clean region names for both datasets
    df_inequality['cleaned_region'] = df_inequality['region'].str.lower().str.replace('[^\w\s]', '')
    df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

    # Calculate TF-IDF vectors and cosine similarity
    tfidf_vectorizer = TfidfVectorizer()
    X_inequality = tfidf_vectorizer.fit_transform(df_inequality['cleaned_region'])
    X_elections = tfidf_vectorizer.transform(df_elections['cleaned_region'])
    similarity_matrix = cosine_similarity(X_elections, X_inequality)

    # Initialize lists to store predicted matches and confidence scores
    predicted_matches = []
    confidence_scores = []

    # Step 4: Iterate through electoral data and find matches
    for i in range(len(df_elections)):
        best_match_index = np.argmax(similarity_matrix[i])
        best_similarity_score = similarity_matrix[i][best_match_index]

        if best_similarity_score > similarity_threshold:
            predicted_match = df_inequality.iloc[best_match_index]['region']
        else:
            predicted_match = None

        predicted_matches.append(predicted_match)
        confidence_scores.append(best_similarity_score)

    df_elections['predicted_region'] = predicted_matches
    df_elections['confidence_score'] = confidence_scores

    # Step 5: Merge the two datasets based on predicted regions and years
    joined_data = df_elections.merge(df_inequality, left_on=['predicted_region', 'year'],
                                     right_on=['region', 'year'], how='left')

    return joined_data

In [6]:
def get_country_name(abbreviation):
    try:
        country = pycountry.countries.get(alpha_2=abbreviation)
        return country.name
    except AttributeError:
        return None

In [13]:
save_folder_relative = os.path.join('..', '..', 'data', 'raw','national_election','eu_ned_national_nuts2(1).csv')
electoral_data_path = os.path.abspath(save_folder_relative)

In [326]:
df_elections = pd.read_csv(electoral_data_path, delimiter=",", encoding='UTF-8')


In [17]:
ineq_folder_relative = os.path.join('..', '..', 'data', 'raw','lissy','multination_gini.txt')
inequality_data_path = os.path.abspath(ineq_folder_relative)

In [327]:

df_inequality = pd.read_fwf(inequality_data_path, skiprows=3, header=None,
                                names=["file","region", "year", "avg_gini"])

In [328]:
df_inequality['country'] = df_inequality['file'].str[:2]

In [329]:
df_inequality['country_name'] = df_inequality['country'].apply(get_country_name)

In [330]:
df_inequality.dropna(subset=['region'], inplace=True)
df_elections.dropna(subset=['regionname'], inplace=True)

In [179]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))

df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

In [38]:
    # Calculate TF-IDF vectors and cosine similarity
tfidf_vectorizer = TfidfVectorizer()
X_inequality = tfidf_vectorizer.fit_transform(df_inequality['cleaned_region'])
X_elections = tfidf_vectorizer.transform(df_elections['cleaned_region'])
similarity_matrix = cosine_similarity(X_elections, X_inequality)

    # Initialize lists to store predicted matches and confidence scores
predicted_matches = []
confidence_scores = []

In [36]:
similarity_threshold = 0.7

In [39]:
for i in range(len(df_inequality)):
    best_match_index = np.argmax(similarity_matrix[i])
    best_similarity_score = similarity_matrix[i][best_match_index]

    # Print relevant information for debugging
    print(f"Index {i}:")
    print(f"   Original Region (df_inequality): {df_inequality.iloc[i]['cleaned_region']}")
    print(f"   Best Match Index: {best_match_index}")
    print(f"   Best Similarity Score: {best_similarity_score}")

    if best_similarity_score > similarity_threshold:
        predicted_match = df_elections.iloc[best_match_index]['cleaned_region']
        print(f"   Predicted Match (df_elections): {predicted_match}")
    else:
        predicted_match = None
        print("   No Match Found")

    predicted_matches.append(predicted_match)
    confidence_scores.append(best_similarity_score)

df_inequality['predicted_region'] = predicted_matches
df_inequality['confidence_score'] = confidence_scores

Index 0:
   Original Region (df_inequality):  nordburgenland
   Best Match Index: 231
   Best Similarity Score: 1.0
   Predicted Match (df_elections): kärnten
Index 1:
   Original Region (df_inequality):  südburgenland
   Best Match Index: 231
   Best Similarity Score: 1.0
   Predicted Match (df_elections): kärnten
Index 2:
   Original Region (df_inequality): eisenwurzen
   Best Match Index: 231
   Best Similarity Score: 1.0
   Predicted Match (df_elections): kärnten
Index 3:
   Original Region (df_inequality): süd
   Best Match Index: 231
   Best Similarity Score: 1.0
   Predicted Match (df_elections): kärnten
Index 4:
   Original Region (df_inequality):  sankt pölten
   Best Match Index: 231
   Best Similarity Score: 1.0
   Predicted Match (df_elections): kärnten
Index 5:
   Original Region (df_inequality):  waldviertel
   Best Match Index: 231
   Best Similarity Score: 1.0
   Predicted Match (df_elections): kärnten
Index 6:
   Original Region (df_inequality):  weinviertel
   Best Ma

In [40]:
def levenshtein_similarity(s1, s2):
    return fuzz.token_set_ratio(s1, s2) / 100.0

In [41]:
predicted_matches = []
confidence_scores = []

In [44]:
similarity_threshold = 0.7  # Adjust the threshold as needed

for i in range(len(df_inequality)):
    best_match_index = np.argmax([levenshtein_similarity(df_inequality.iloc[i]['cleaned_region'], region) for region in df_elections['cleaned_region']])
    best_similarity_score = levenshtein_similarity(df_inequality.iloc[i]['cleaned_region'], df_elections.iloc[best_match_index]['cleaned_region'])

    # Print relevant information for debugging
    print(f"Index {i}:")
    print(f"   Original Region (df_inequality): {df_inequality.iloc[i]['cleaned_region']}")
    print(f"   Best Match Index: {best_match_index}")
    print(f"   Best Similarity Score: {best_similarity_score}")

    if best_similarity_score > similarity_threshold:
        predicted_match = df_elections.iloc[best_match_index]['cleaned_region']
        print(f"   Predicted Match (df_elections): {predicted_match}")
    else:
        predicted_match = None
        print("   No Match Found")

    predicted_matches.append(predicted_match)
    confidence_scores.append(best_similarity_score)

df_inequality['predicted_region'] = predicted_matches
df_inequality['confidence_score'] = confidence_scores

Index 0:
   Original Region (df_inequality):  nordburgenland
   Best Match Index: 0
   Best Similarity Score: 0.83
   Predicted Match (df_elections): burgenland
Index 1:
   Original Region (df_inequality):  südburgenland
   Best Match Index: 0
   Best Similarity Score: 0.91
   Predicted Match (df_elections): burgenland
Index 2:
   Original Region (df_inequality): eisenwurzen
   Best Match Index: 7826
   Best Similarity Score: 0.44
   No Match Found
Index 3:
   Original Region (df_inequality): süd
   Best Match Index: 2506
   Best Similarity Score: 0.44
   No Match Found
Index 4:
   Original Region (df_inequality):  sankt pölten
   Best Match Index: 15910
   Best Similarity Score: 0.52
   No Match Found
Index 5:
   Original Region (df_inequality):  waldviertel
   Best Match Index: 4938
   Best Similarity Score: 0.53
   No Match Found
Index 6:
   Original Region (df_inequality):  weinviertel
   Best Match Index: 15174
   Best Similarity Score: 0.5
   No Match Found
Index 7:
   Original R

KeyboardInterrupt: 

In [45]:
import multiprocessing

In [None]:
def levenshtein_similarity(s1, s2):
    return fuzz.token_set_ratio(s1, s2) / 100.0


In [46]:
countries = df_inequality['country'].unique()

In [47]:
manager = multiprocessing.Manager()
predicted_matches = manager.list()
confidence_scores = manager.list()

In [48]:
def calculate_distances(country_data):
    df_elections, df_inequality = country_data

    similarity_threshold = 0.7

    local_predicted_matches = []
    local_confidence_scores = []

    for i in range(len(df_inequality)):
        best_match_index = np.argmax([levenshtein_similarity(df_inequality.iloc[i]['cleaned_region'], region) for region in df_elections['cleaned_region']])
        best_similarity_score = levenshtein_similarity(df_inequality.iloc[i]['cleaned_region'], df_elections.iloc[best_match_index]['cleaned_region'])

        if best_similarity_score > similarity_threshold:
            predicted_match = df_elections.iloc[best_match_index]['cleaned_region']
        else:
            predicted_match = None

        local_predicted_matches.append(predicted_match)
        local_confidence_scores.append(best_similarity_score)

    # Append local results to shared lists
    predicted_matches.extend(local_predicted_matches)
    confidence_scores.extend(local_confidence_scores)

In [None]:
countries = [(df_elections_country_data, df_inequality_country_data) for df_elections_country_data, df_inequality_country_data in zip(list_of_df_elections_data, list_of_df_inequality_data)]

In [26]:
joined_data = df_elections.merge(df_inequality, left_on=['country','regionname', 'year'],
                                right_on=['country_name','predicted_region', 'year'], how='left')

In [33]:
df_inequality[df_inequality['country_name'] == 'Italy'][['country_name','region', 'predicted_region','cleaned_region', 'confidence_score']]

Unnamed: 0,country_name,region,predicted_region,cleaned_region,confidence_score
864,Italy,[10]ITI2 - Umbria,,umbria,0.0
865,Italy,[11]ITI3 - Marche,,marche,0.0
866,Italy,[12]ITI4 - Lazio,,lazio,0.0
867,Italy,[13]ITF1 - Abruzzo,,abruzzo,0.0
868,Italy,[14]ITF2 - Molise,,molise,0.0
...,...,...,...,...,...
95,Italy,[5]ITH3 - Veneto,,veneto,0.0
96,Italy,[6]ITH4 - Friuli,,friuli,0.0
97,Italy,[7]ITC3 - Liguria,,liguria,0.0
98,Italy,[8]ITH5 - Emilia Romagna,,emilia romagna,0.0


In [28]:
df_elections[df_elections['country'] == 'Italy']['cleaned_region'].value_counts()

cleaned_region
puglia                                 115
campania                               113
sardegna                               111
veneto                                 110
sicilia                                109
calabria                               108
lombardia                              108
toscana                                107
piemonte                               103
abruzzo                                102
provincia autonoma di bolzano/bozen    101
emilia-romagna                         101
molise                                 100
provincia autonoma di trento           100
liguria                                100
lazio                                  100
friuli-venezia giulia                   99
basilicata                              98
marche                                  96
umbria                                  95
valle d'aosta/vallée d'aoste            52
Name: count, dtype: int64

In [63]:
joined_data.dropna(subset=['avg_gini'], inplace=True)

In [66]:
joined_data.columns

Index(['country_x', 'country_code', 'nutslevel', 'nuts2', 'regionname', 'type',
       'year', 'party_abbreviation', 'party_english', 'party_native',
       'partyfacts_id', 'partyvote', 'electorate', 'totalvote', 'validvote',
       'cleaned_region_x', 'file', 'region', 'avg_gini', 'country_y',
       'country_name', 'cleaned_region_y', 'predicted_region',
       'confidence_score'],
      dtype='object')

In [74]:
joined_data[joined_data['country_name'] == 'France'][['country_name','regionname', 'region', 'cleaned_region_y','cleaned_region_x','predicted_region','confidence_score','year','avg_gini']]


Unnamed: 0,country_name,regionname,region,cleaned_region_y,cleaned_region_x,predicted_region,confidence_score,year,avg_gini


In [75]:
joined_data['country_name'].value_counts()

country_name
Austria    255
Name: count, dtype: int64

What if we just do it with minimal cleaning 

In [331]:
df_inequality['cleaned_region'] = df_inequality['region'].str.lower().apply(lambda x: re.sub(r'^.*-', '', x))
df_elections['cleaned_region'] = df_elections['regionname'].str.lower().str.replace('[^\w\s]', '')

In [332]:
df_inequality['cleaned_region'] = df_inequality['cleaned_region'].str.strip()

In [116]:
simple_join = df_inequality.merge(df_elections, left_on=['country_name','cleaned_region', 'year'], right_on=['country','cleaned_region', 'year'], how='left')

In [95]:
simple_join_t = df_inequality.merge(df_elections, left_on=['country_name', 'year'], right_on=['country', 'year'], how='left')

In [101]:
simple_join_t.columns

Index(['file', 'region', 'year', 'avg_gini', 'country_x', 'country_name',
       'cleaned_region_x', 'country_y', 'country_code', 'nutslevel', 'nuts2',
       'regionname', 'type', 'party_abbreviation', 'party_english',
       'party_native', 'partyfacts_id', 'partyvote', 'electorate', 'totalvote',
       'validvote', 'cleaned_region_y'],
      dtype='object')

In [104]:
simple_join_t['country_y'].value_counts()

country_y
France     42365
Germany    41290
Spain      25203
Austria     8627
Italy       3840
Belgium     1323
Name: count, dtype: int64

In [79]:
simple_join.dropna(subset=['party_native'], inplace=True)

In [120]:
simple_join[simple_join['country_y'].isna()][['country_name','country_x','country_y','cleaned_region','year','party_native','avg_gini']]

Unnamed: 0,country_name,country_x,country_y,cleaned_region,year,party_native,avg_gini
0,Austria,at,,nordburgenland,2000,,0.266
1,Austria,at,,südburgenland,2000,,0.246
2,Austria,at,,eisenwurzen,2000,,0.261
3,Austria,at,,süd,2000,,0.254
4,Austria,at,,sankt pölten,2000,,0.256
...,...,...,...,...,...,...,...
2763,Italy,it,,veneto,1998,,0.322
2764,Italy,it,,friuli,1998,,0.310
2765,Italy,it,,liguria,1998,,0.321
2766,Italy,it,,emilia romagna,1998,,0.315


In [180]:
countries_to_keep = df_inequality['country_name'].unique()

In [351]:
df_elections = df_elections[df_elections['country'].isin(countries_to_keep)]

In [158]:
for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    print(f"{country}: {years}")

Austria: [1990 1994 1995 2002 2006 2008 2013 2017 2019]
Belgium: [1991 1995 1999 2003 2007 2010 2014 2019]
France: [1993 1997 2002 2007 2012 2017]
Germany: [1990 1994 1998 2002 2005 2009 2013 2017]
Italy: [1992 1994 1996 2001 2006 2008 2013 2018]
Spain: [1993 1996 2000 2004 2008 2011 2015 2016 2019]


In [164]:
elec_country = []
elec_years = []

for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    elec_country.append(country)
    elec_years.append(years)

election_df = pd.DataFrame({'country': elec_country, 'years': elec_years})

In [352]:
elec_data = []

for country in df_elections['country'].unique():
    country_df = df_elections[df_elections['country'] == country]
    years = country_df['year'].unique()
    for year in years:
        elec_data.append({'country': country, 'year': year})

election_df = pd.DataFrame(elec_data)

In [353]:
election_df['flag'] = 1

In [354]:
inequality_with_elections = df_inequality.merge(election_df, left_on=['country_name', 'year'], right_on=['country', 'year'], how='outer')

In [355]:
inequality_with_elections['flag'].fillna(0, inplace=True)

In [360]:
df = inequality_with_elections[['country_name', 'cleaned_region', 'year', 'avg_gini', 'flag','country_y']]

In [362]:
df['country_name'].fillna(df['country_y'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['country_name'].fillna(df['country_y'], inplace=True)


In [363]:
df

Unnamed: 0,country_name,cleaned_region,year,avg_gini,flag,country_y
0,Austria,nordburgenland,2000,0.266,0.0,
1,Austria,südburgenland,2000,0.246,0.0,
2,Austria,eisenwurzen,2000,0.261,0.0,
3,Austria,süd,2000,0.254,0.0,
4,Austria,sankt pölten,2000,0.256,0.0,
...,...,...,...,...,...,...
2086,Italy,,1996,,1.0,Italy
2087,Italy,,2001,,1.0,Italy
2088,Italy,,2006,,1.0,Italy
2089,Italy,,2013,,1.0,Italy


In [364]:
df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(['country_name', 'cleaned_region', 'year'], inplace=True)


In [365]:
df['gini_since'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.expanding().mean().shift(fill_value=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gini_since'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.expanding().mean().shift(fill_value=0))


In [366]:
df['change_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.diff().cumsum().shift(fill_value=0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['change_gini'] = df.groupby(['country_name', 'cleaned_region', 'flag'])['avg_gini'].transform(lambda x: x.diff().cumsum().shift(fill_value=0))


In [367]:
election_years_df = df[df['flag'] == 1]

In [368]:
election_years_df.reset_index(drop=True, inplace=True)

In [369]:
joined_data = election_years_df.merge(df_elections, left_on=['country_name', 'cleaned_region', 'year'], right_on=['country', 'cleaned_region', 'year'], how='left')

In [370]:
joined_data.dropna(subset=['party_native'], inplace=True)   

In [371]:
joined_data['country_name'].value_counts()

country_name
France     537
Germany    251
Italy      125
Austria     36
Name: count, dtype: int64

In [372]:
election_years_df.groupby('country_name')['year'].unique()

country_name
Austria    [2006, 2008, 2013, 2017, 2019, 1994, 1995, 199...
Belgium     [1995, 2003, 2007, 2010, 2014, 1991, 1999, 2019]
France                  [1997, 2002, 2007, 2012, 2017, 1993]
Germany     [1994, 1998, 2002, 2005, 2009, 2013, 2017, 1990]
Italy       [2008, 1992, 1994, 1996, 2001, 2006, 2013, 2018]
Spain      [2004, 2008, 2011, 2015, 2016, 2019, 1993, 199...
Name: year, dtype: object

In [373]:
df_elections.groupby('country')['year'].unique()

country
Austria    [1990, 1994, 1995, 2002, 2006, 2008, 2013, 201...
Belgium     [1991, 1995, 1999, 2003, 2007, 2010, 2014, 2019]
France                  [1993, 1997, 2002, 2007, 2012, 2017]
Germany     [1990, 1994, 1998, 2002, 2005, 2009, 2013, 2017]
Italy       [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
Spain      [1993, 1996, 2000, 2004, 2008, 2011, 2015, 201...
Name: year, dtype: object

In [256]:
def compare_dfs(country):
    country_elections = df_elections[df_elections['country'] == country]
    country_eydf = election_years_df[election_years_df['country_name'] == country]
    print(country_elections.groupby('cleaned_region')['year'].unique(), country_eydf.groupby('cleaned_region')['year'].unique())

In [259]:
def return_country_dfs(country):
    country_elections = df_elections[df_elections['country'] == country]
    country_eydf = election_years_df[election_years_df['country_name'] == country]
    return country_elections, country_eydf

In [317]:
italy1, italy2 = return_country_dfs('Italy')

In [318]:
italy1 = italy1[['country', 'cleaned_region', 'year', 'party_native']]

In [283]:
italy1.groupby('cleaned_region')['year'].unique()

cleaned_region
abruzzo                                [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
basilicata                             [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
calabria                               [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
campania                               [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
emilia-romagna                         [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
friuli-venezia giulia                  [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
lazio                                  [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
liguria                                [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
lombardia                              [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
marche                                 [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
molise                                 [1992, 1994, 1996, 2001, 2006, 2008, 2013, 2018]
piemonte         

In [284]:
italy2.groupby('cleaned_region')['year'].unique()

cleaned_region
 abruzzo           [2008]
 basilicata        [2008]
 calabria          [2008]
 campania          [2008]
 emilia romagna    [2008]
 friuli            [2008]
 lazio             [2008]
 liguria           [2008]
 lombardia         [2008]
 marche            [2008]
 molise            [2008]
 piemonte          [2008]
 puglia            [2008]
 sardegna          [2008]
 sicilia           [2008]
 toscana           [2008]
 trentino          [2008]
 umbria            [2008]
 valle d'aosta     [2008]
 veneto            [2008]
Name: year, dtype: object

In [322]:
join_italy = italy1.merge(italy2, left_on=['cleaned_region', 'year'], right_on=['cleaned_region', 'year'], how='left')

In [323]:
join_italy[join_italy['year'] == 2008]

Unnamed: 0,country,cleaned_region,year,party_native,country_name,avg_gini,flag,gini_since,change_gini
73,Italy,piemonte,2008,Lista di Pietro -- Italia del Valori,Italy,0.276,1.0,0.0,0.0
74,Italy,piemonte,2008,Il Popolo della Liberta,Italy,0.276,1.0,0.0,0.0
75,Italy,piemonte,2008,Fiamma Tricolore,Italy,0.276,1.0,0.0,0.0
76,Italy,piemonte,2008,Sinistra Italiana,Italy,0.276,1.0,0.0,0.0
77,Italy,piemonte,2008,Lega Nord,Italy,0.276,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2099,Italy,lazio,2008,Movimento per l'Autonomia,Italy,0.343,1.0,0.0,0.0
2100,Italy,lazio,2008,,Italy,0.343,1.0,0.0,0.0
2101,Italy,lazio,2008,Partito Democratico,Italy,0.343,1.0,0.0,0.0
2102,Italy,lazio,2008,Partito Socialista Italiano,Italy,0.343,1.0,0.0,0.0


In [319]:
italy2['cleaned_region'] = italy2['cleaned_region'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  italy2['cleaned_region'] = italy2['cleaned_region'].str.strip()


In [320]:
matches = italy1[italy1['cleaned_region'].isin(italy2['cleaned_region'])]
