In [1]:
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from factor_analyzer import Rotator
from ppca import PPCA
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Data Preparation

We filter the data down to the three most recent survey waves (from 2005 onwards). These waves provide up-to-date information on cultural values, ensuring that the analysis reflects current societal norms and attitudes. We also keep only the ten IVS questions that form the basis of the Inglehart-Welzel Cultural Map.


In [2]:
# load ivs_df and country metadata from pkl
ivs_df = pd.read_pickle("../data/ivs_df.pkl")
country_codes = pd.read_pickle("../data/country_codes.pkl")
# Filtering data
# Metadata we need
meta_col = ["S020", "S003"]
# Weights
weights = ["S017"]
# Use the ten questions from the IVS that form the basis of the Inglehart-Welzel Cultural Map
iv_qns = ["A008", "A165", "E018", "E025", "F063", "F118", "F120", "G006", "Y002", "Y003"]
subset_ivs_df = ivs_df[meta_col+weights+iv_qns]
subset_ivs_df = subset_ivs_df.rename(columns={'S020': 'year', 'S003': 'country_code', 'S017': 'weight'})
# remove data from before 2005
# We need to filter down to the three most recent survey waves (from 2005 onwards). The most recent survey waves provide up-to-date information on cultural values, ensuring that the analysis reflects current societal norms and attitudes. We also filter out the ten questions from the IVS that form the basis of the Inglehart-Welzel Cultural Map.
subset_ivs_df = subset_ivs_df[subset_ivs_df["year"] >= 2005]
subset_ivs_df

In [8]:
"hi"

In [9]:
# Number of respondent rows in subset_ivs_df at this point in the notebook.
print(len(subset_ivs_df))

The joint IVS data, after filtering, covers 394,524 individual-level survey response observations from 112 countries.

In [None]:
# Drop respondents who answered too few of the ten questions.
# No weighting happens in this cell: the line that would multiply the answers by the
# survey weight is commented out and kept only for reference.
# subset_ivs_df[iv_qns] = subset_ivs_df[iv_qns].multiply(subset_ivs_df["weight"], axis=0)
# Minimum 6 observations in the iv_qns columns: a row is kept only when at least 6 of them are non-missing.
subset_ivs_df = subset_ivs_df.dropna(subset=iv_qns, thresh=6)

# Country Names and Metadata

We also need to collect the country names and metadata for the countries in our dataset. We will use this information to map the countries to their respective cultural regions and to identify Islamic countries.

In [88]:
# Lookup table of country names and numeric country codes.
# The two lists are parallel: Numeric[i] is the code belonging to Country[i], so both
# must stay the same length and in the same order when entries are added or removed.
# NOTE(review): names and codes look like ISO 3166-1 entries — confirm the source.
data = {
    "Country": ["Afghanistan", "Albania", "Algeria", "American Samoa", "Andorra", "Angola", "Anguilla", 
                "Antarctica", "Antigua and Barbuda", "Argentina", "Armenia", "Aruba", "Australia", "Austria", 
                "Azerbaijan", "Bahamas (the)", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", 
                "Belize", "Benin", "Bermuda", "Bhutan", "Bolivia (Plurinational State of)", 
                "Bonaire, Sint Eustatius and Saba", "Bosnia and Herzegovina", "Botswana", "Bouvet Island", 
                "Brazil", "British Indian Ocean Territory (the)", "Brunei Darussalam", "Bulgaria", 
                "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Cayman Islands (the)", 
                "Central African Republic (the)", "Chad", "Chile", "China", "Christmas Island", 
                "Cocos (Keeling) Islands (the)", "Colombia", "Comoros (the)", "Congo (the Democratic Republic of the)", 
                "Congo (the)", "Cook Islands (the)", "Costa Rica", "Croatia", "Cuba", "Curaçao", "Cyprus", 
                "Czechia", "Côte d'Ivoire", "Denmark", "Djibouti", "Dominica", "Dominican Republic (the)", 
                "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", 
                "Ethiopia", "Falkland Islands (the) [Malvinas]", "Faroe Islands (the)", "Fiji", "Finland", 
                "France", "French Guiana", "French Polynesia", "French Southern Territories (the)", "Gabon", 
                "Gambia (the)", "Georgia", "Germany", "Ghana", "Gibraltar", "Greece", "Greenland", "Grenada", 
                "Guadeloupe", "Guam", "Guatemala", "Guernsey", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", 
                "Heard Island and McDonald Islands", "Holy See (the)", "Honduras", "Hong Kong", "Hungary", "Iceland", 
                "India", "Indonesia", "Iran (Islamic Republic of)", "Iraq", "Ireland", "Isle of Man", "Israel", 
                "Italy", "Jamaica", "Japan", "Jersey", "Jordan", "Kazakhstan", "Kenya", "Kiribati", 
                "Korea (the Democratic People's Republic of)", "Korea (the Republic of)", "Kuwait", "Kyrgyzstan", 
                "Lao People's Democratic Republic (the)", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", 
                "Liechtenstein", "Lithuania", "Luxembourg", "Macao", "Madagascar", "Malawi", "Malaysia", "Maldives", 
                "Mali", "Malta", "Marshall Islands (the)", "Martinique", "Mauritania", "Mauritius", "Mayotte", 
                "Mexico", "Micronesia (Federated States of)", "Moldova (the Republic of)", "Monaco", "Mongolia", 
                "Montenegro", "Montserrat", "Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", 
                "Netherlands (the)", "New Caledonia", "New Zealand", "Nicaragua", "Niger (the)", "Nigeria", 
                "Niue", "Norfolk Island", "Northern Mariana Islands (the)", "Norway", "Oman", "Pakistan", "Palau", 
                "Palestine, State of", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines (the)", 
                "Pitcairn", "Poland", "Portugal", "Puerto Rico", "Qatar", "Republic of North Macedonia", "Romania", 
                "Russian Federation (the)", "Rwanda", "Réunion", "Saint Barthélemy", 
                "Saint Helena, Ascension and Tristan da Cunha", "Saint Kitts and Nevis", "Saint Lucia", 
                "Saint Martin (French part)", "Saint Pierre and Miquelon", "Saint Vincent and the Grenadines", "Samoa", 
                "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", 
                "Sierra Leone", "Singapore", "Sint Maarten (Dutch part)", "Slovakia", "Slovenia", "Solomon Islands", 
                "Somalia", "South Africa", "South Georgia and the South Sandwich Islands", "South Sudan", "Spain", 
                "Sri Lanka", "Sudan (the)", "Suriname", "Svalbard and Jan Mayen", "Sweden", "Switzerland", 
                "Syrian Arab Republic", "Taiwan (Province of China)", "Tajikistan", "Tanzania, United Republic of", 
                "Thailand", "Timor-Leste", "Togo", "Tokelau", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", 
                "Turkmenistan", "Turks and Caicos Islands (the)", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates (the)", 
                "United Kingdom of Great Britain and Northern Ireland (the)", "United States Minor Outlying Islands (the)", 
                "United States of America (the)", "Uruguay", "Uzbekistan", "Vanuatu", "Venezuela (Bolivarian Republic of)", 
                "Viet Nam", "Virgin Islands (British)", "Virgin Islands (U.S.)", "Wallis and Futuna", "Western Sahara", 
                "Yemen", "Zambia", "Zimbabwe", "Åland Islands"],
    # Numeric code for the country at the same position in "Country".
    "Numeric": [4, 8, 12, 16, 20, 24, 660, 10, 28, 32, 51, 533, 36, 40, 31, 44, 48, 50, 52, 112, 56, 84, 204, 60, 64, 
                68, 535, 70, 72, 74, 76, 86, 96, 100, 854, 108, 132, 116, 120, 124, 136, 140, 148, 152, 156, 162, 
                166, 170, 174, 180, 178, 184, 188, 191, 192, 531, 196, 203, 384, 208, 262, 212, 214, 218, 818, 222, 
                226, 232, 233, 748, 231, 238, 234, 242, 246, 250, 254, 258, 260, 266, 270, 268, 276, 288, 292, 300, 
                304, 308, 312, 316, 320, 831, 324, 624, 328, 332, 334, 336, 340, 344, 348, 352, 356, 360, 364, 368, 
                372, 833, 376, 380, 388, 392, 832, 400, 398, 404, 296, 408, 410, 414, 417, 418, 428, 422, 426, 430, 
                434, 438, 440, 442, 446, 450, 454, 458, 462, 466, 470, 584, 474, 478, 480, 175, 484, 583, 498, 492, 
                496, 499, 500, 504, 508, 104, 516, 520, 524, 528, 540, 554, 558, 562, 566, 570, 574, 580, 578, 512, 
                586, 585, 275, 591, 598, 600, 604, 608, 612, 616, 620, 630, 634, 807, 642, 643, 646, 638, 652, 654, 
                659, 662, 663, 666, 670, 882, 674, 678, 682, 686, 688, 690, 694, 702, 534, 703, 705, 90, 706, 710, 
                239, 728, 724, 144, 729, 740, 744, 752, 756, 760, 158, 762, 834, 764, 626, 768, 772, 776, 780, 788, 
                792, 795, 796, 798, 800, 804, 784, 826, 581, 840, 858, 860, 548, 862, 704, 92, 850, 876, 732, 887, 
                894, 716, 248]
}

# One row per country; the "Numeric" column is what later cells match against the survey's country_code.
country_codes = pd.DataFrame(data)
country_codes

In [87]:
# Distinct country codes that actually occur in the filtered survey data.
unique_countries = subset_ivs_df["country_code"].unique()
unique_countries

In [60]:
# Keep only the lookup rows whose numeric code appears in the survey data.
# The result rebinds country_codes, so the full table is no longer available after this cell.
country_codes = country_codes[country_codes["Numeric"].isin(unique_countries)]
country_codes

In [89]:
# Adding cultural regions for the regions in our dataset
# Maps a country name to one of eight cultural-region labels. The keys are looked up
# with Series.map against country_codes['Country'], so they must match those strings
# exactly; a country missing from this dict ends up with NaN as its region.

cultural_regions = {
    'Albania': 'Orthodox Europe',
    'Algeria': 'African-Islamic',
    'Andorra': 'Catholic Europe',
    'Argentina': 'Latin America',
    'Armenia': 'Orthodox Europe',
    'Australia': 'English-Speaking',
    'Austria': 'Catholic Europe',
    'Azerbaijan': 'Orthodox Europe',
    'Bangladesh': 'West & South Asia',
    'Belarus': 'Orthodox Europe',
    'Belgium': 'Catholic Europe',
    'Bolivia (Plurinational State of)': 'Latin America',
    'Bosnia and Herzegovina': 'Orthodox Europe',
    'Brazil': 'Latin America',
    'Bulgaria': 'Orthodox Europe',
    'Burkina Faso': 'African-Islamic',
    'Canada': 'English-Speaking',
    'Chile': 'Latin America',
    'China': 'Confucian',
    'Colombia': 'Latin America',
    'Croatia': 'Catholic Europe',
    'Cyprus': 'Catholic Europe',
    'Czechia': 'Catholic Europe',
    'Denmark': 'Protestant Europe',
    'Ecuador': 'Latin America',
    'Egypt': 'African-Islamic',
    'Estonia': 'Orthodox Europe',
    'Ethiopia': 'African-Islamic',
    'Finland': 'Protestant Europe',
    'France': 'Catholic Europe',
    'Georgia': 'Orthodox Europe',
    'Germany': 'Protestant Europe',
    'Ghana': 'African-Islamic',
    'Greece': 'Orthodox Europe',
    'Guatemala': 'Latin America',
    'Haiti': 'Latin America',
    'Hong Kong': 'Confucian',
    'Hungary': 'Catholic Europe',
    'Iceland': 'Protestant Europe',
    'India': 'West & South Asia',
    'Indonesia': 'West & South Asia',
    'Iran (Islamic Republic of)': 'West & South Asia',
    'Iraq': 'African-Islamic',
    'Ireland': 'Catholic Europe',
    'Italy': 'Catholic Europe',
    'Japan': 'Confucian',
    'Jordan': 'African-Islamic',
    'Kazakhstan': 'Orthodox Europe',
    'Kenya': 'African-Islamic',
    'Korea (the Republic of)': 'Confucian',
    'Kuwait': 'African-Islamic',
    'Kyrgyzstan': 'West & South Asia',
    'Latvia': 'Orthodox Europe',
    'Lebanon': 'African-Islamic',
    'Libya': 'African-Islamic',
    'Lithuania': 'Orthodox Europe',
    'Luxembourg': 'Catholic Europe',
    'Macao': 'Confucian',
    'Malaysia': 'West & South Asia',
    'Maldives': 'West & South Asia',
    'Mali': 'African-Islamic',
    'Malta': 'Catholic Europe',
    'Mexico': 'Latin America',
    'Moldova (the Republic of)': 'Orthodox Europe',
    'Mongolia': 'Confucian',
    'Montenegro': 'Orthodox Europe',
    'Morocco': 'African-Islamic',
    'Myanmar': 'West & South Asia',
    'Netherlands (the)': 'Protestant Europe',
    'New Zealand': 'English-Speaking',
    'Nicaragua': 'Latin America',
    'Nigeria': 'African-Islamic',
    'Norway': 'Protestant Europe',
    'Pakistan': 'West & South Asia',
    'Palestine, State of': 'African-Islamic',
    'Peru': 'Latin America',
    'Philippines (the)': 'West & South Asia',
    'Poland': 'Catholic Europe',
    'Portugal': 'Catholic Europe',
    'Puerto Rico': 'Latin America',
    'Qatar': 'African-Islamic',
    'Republic of North Macedonia': 'Orthodox Europe',
    'Romania': 'Orthodox Europe',
    'Russian Federation (the)': 'Orthodox Europe',
    'Rwanda': 'African-Islamic',
    'Serbia': 'Orthodox Europe',
    'Singapore': 'Confucian',
    'Slovakia': 'Catholic Europe',
    'Slovenia': 'Catholic Europe',
    'South Africa': 'English-Speaking',
    'Spain': 'Catholic Europe',
    'Sweden': 'Protestant Europe',
    'Switzerland': 'Protestant Europe',
    'Taiwan (Province of China)': 'Confucian',
    'Tajikistan': 'West & South Asia',
    'Thailand': 'Confucian',
    'Trinidad and Tobago': 'Latin America',
    'Tunisia': 'African-Islamic',
    'Turkey': 'West & South Asia',
    'Ukraine': 'Orthodox Europe',
    'United Kingdom of Great Britain and Northern Ireland (the)': 'English-Speaking',
    'United States of America (the)': 'English-Speaking',
    'Uruguay': 'Latin America',
    'Uzbekistan': 'West & South Asia',
    'Venezuela (Bolivarian Republic of)': 'Latin America',
    'Viet Nam': 'Confucian',
    'Yemen': 'African-Islamic',
    'Zambia': 'African-Islamic',
    'Zimbabwe': 'African-Islamic',
}

In [90]:
# boolean values indicating whether the country is Islamic
# Keys are country names looked up with Series.map against country_codes['Country'],
# so they must match those strings exactly; the plotting cells render the names of
# countries flagged True in italics.
# NOTE(review): the criterion behind each flag is not recorded here. Some flags differ
# from the 'African-Islamic' region label in cultural_regions (e.g. Ethiopia and
# Zimbabwe are False here) — confirm these are intended.

islamic_countries = {
    'Albania': True,
    'Algeria': True,
    'Andorra': False,
    'Argentina': False,
    'Armenia': False,
    'Australia': False,
    'Austria': False,
    'Azerbaijan': True,
    'Bangladesh': True,
    'Belarus': False,
    'Belgium': False,
    'Bolivia (Plurinational State of)': False,
    'Bosnia and Herzegovina': True,
    'Brazil': False,
    'Bulgaria': False,
    'Burkina Faso': True,
    'Canada': False,
    'Chile': False,
    'China': False,
    'Colombia': False,
    'Croatia': False,
    'Cyprus': False,
    'Czechia': False,
    'Denmark': False,
    'Ecuador': False,
    'Egypt': True,
    'Estonia': False,
    'Ethiopia': False,
    'Finland': False,
    'France': False,
    'Georgia': False,
    'Germany': False,
    'Ghana': True,
    'Greece': False,
    'Guatemala': False,
    'Haiti': False,
    'Hong Kong': False,
    'Hungary': False,
    'Iceland': False,
    'India': False,
    'Indonesia': True,
    'Iran (Islamic Republic of)': True,
    'Iraq': True,
    'Ireland': False,
    'Italy': False,
    'Japan': False,
    'Jordan': True,
    'Kazakhstan': True,
    'Kenya': True,
    'Korea (the Republic of)': False,
    'Kuwait': True,
    'Kyrgyzstan': True,
    'Latvia': False,
    'Lebanon': True,
    'Libya': True,
    'Lithuania': False,
    'Luxembourg': False,
    'Macao': False,
    'Malaysia': True,
    'Maldives': True,
    'Mali': True,
    'Malta': False,
    'Mexico': False,
    'Moldova (the Republic of)': False,
    'Mongolia': False,
    'Montenegro': False,
    'Morocco': True,
    'Myanmar': False,
    'Netherlands (the)': False,
    'New Zealand': False,
    'Nicaragua': False,
    'Nigeria': True,
    'Norway': False,
    'Pakistan': True,
    'Palestine, State of': True,
    'Peru': False,
    'Philippines (the)': False,
    'Poland': False,
    'Portugal': False,
    'Puerto Rico': False,
    'Qatar': True,
    'Republic of North Macedonia': False,
    'Romania': False,
    'Russian Federation (the)': False,
    'Rwanda': True,
    'Serbia': False,
    'Singapore': False,
    'Slovakia': False,
    'Slovenia': False,
    'South Africa': False,
    'Spain': False,
    'Sweden': False,
    'Switzerland': False,
    'Taiwan (Province of China)': False,
    'Tajikistan': True,
    'Thailand': False,
    'Trinidad and Tobago': False,
    'Tunisia': True,
    'Turkey': True,
    'Ukraine': False,
    'United Kingdom of Great Britain and Northern Ireland (the)': False,
    'United States of America (the)': False,
    'Uruguay': False,
    'Uzbekistan': True,
    'Venezuela (Bolivarian Republic of)': False,
    'Viet Nam': False,
    'Yemen': True,
    'Zambia': True,
    'Zimbabwe': False,
}

In [74]:
# Annotate every country with its cultural region and its Islamic flag.
# Both lookups are keyed on the country name; names absent from a dict map to NaN.
country_names = country_codes['Country']
country_codes = country_codes.assign(**{
    'Cultural Region': country_names.map(cultural_regions),
    'Islamic': country_names.map(islamic_countries),
})
country_codes.head()

In [75]:
# Persist the annotated country table (name, numeric code, cultural region, Islamic flag).
country_codes.to_pickle("../data/country_codes.pkl")

# Methodology

In [64]:
# Share of respondent rows that have at least one missing value in any column.
total_rows = len(subset_ivs_df)
complete_rows_count = len(subset_ivs_df.dropna())
incomplete_pct = round(((total_rows - complete_rows_count) / total_rows) * 100, 2)
print(f"{incomplete_pct} % of the data is incomplete")

Standardize the Data: Standardize the survey responses, considering the weights. You can use the StandardScaler from scikit-learn with sample weights.

In [65]:
# Standardise the ten survey items, passing each respondent's survey weight to the scaler.
# The weight vector gets its own name so that it does not overwrite `weights`, the list
# of weight column names used when the survey subset is built; rebinding that list to an
# array would break a re-run of the data-loading cell.
sample_weights = subset_ivs_df['weight'].values
features = subset_ivs_df[iv_qns].values
scaler = StandardScaler()
# fit_transform forwards the extra keyword argument to StandardScaler.fit.
standardized_data = scaler.fit_transform(features, sample_weight=sample_weights)
standardized_data

Handle Missing Data: Use pairwise deletion to compute the correlation matrix, taking into account the weights.

In [78]:
# Toy example: a random 10 by 20 matrix with roughly 10% of its entries set to NaN.
# A seeded generator is used so that the cell produces the same matrix on every run.
# NOTE(review): this matrix does not appear to feed any later cell, and it rebinds the
# generic name `data` — confirm whether the cell is still needed.
rng = np.random.default_rng(0)
data = rng.random((10, 20))
# Randomly mask 10% of the data
mask = rng.choice([True, False], size=data.shape, p=[0.1, 0.9])
data[mask] = np.nan
data

In [73]:
# Sanity-check the scaling: per-column mean and standard deviation.
# Only rows without any missing value enter this check, and it is unweighted.
complete_rows = pd.DataFrame(standardized_data).dropna()
column_means = complete_rows.mean()
column_stds = complete_rows.std()
print(column_means, column_stds)


In [68]:
# (rows, columns) of the standardised matrix: one column per question in iv_qns.
standardized_data.shape

In [70]:
# Function to calculate covariance matrix with pairwise deletion
def pairwise_covariance_matrix(data):
    """Return the sample covariance matrix of ``data`` using pairwise deletion.

    For every pair of columns only the rows where both values are present are
    used, so a missing answer in one column does not discard the row for the
    other column pairs.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        Observations in rows, variables in columns; missing values are NaN.

    Returns
    -------
    numpy.ndarray
        Square ``(n_columns, n_columns)`` matrix of sample covariances
        (``ddof=1``, the same normalisation as ``np.cov``). Entries for column
        pairs with fewer than two jointly observed rows are ``0.0``.

    Notes
    -----
    The previous implementation divided each ``np.cov`` result by the number
    of valid pairs a second time. ``np.cov`` is already normalised, so every
    entry was too small by a factor equal to the pair count.
    """
    frame = pd.DataFrame(data)
    # DataFrame.cov excludes NaNs pair by pair; min_periods=2 mirrors the
    # "more than one valid row" requirement and yields NaN otherwise.
    cov_matrix = frame.cov(min_periods=2).to_numpy(dtype=float)
    # Pairs without enough overlap are reported as 0, as before.
    return np.nan_to_num(cov_matrix)


# Calculate pairwise covariance matrix
# The standardised array is wrapped in a DataFrame because the helper indexes it with .iloc.
cov_matrix = pairwise_covariance_matrix(pd.DataFrame(standardized_data))
cov_matrix

In [27]:
# Correlation matrix between the question columns, with NaN/inf entries masked out
# instead of dropping whole rows. `.data` takes the plain ndarray from the masked result.
# NOTE(review): the survey weights are not used in this correlation — confirm that is acceptable.
masked_data = np.ma.masked_invalid(standardized_data)
correlation_matrix = np.ma.corrcoef(masked_data, rowvar=False, allow_masked=True).data
correlation_matrix

Perform Weighted PCA: Apply PCA using the correlation matrix and include the observation weights. The PCA class in scikit-learn does not directly support weights, so you may need to use a custom implementation or modify the covariance matrix to account for weights.

In [28]:
# Perform PCA on the correlation matrix
# NOTE(review): fit_transform treats the rows of `correlation_matrix` as samples, so
# `principal_components` holds the projected rows of that matrix rather than an
# eigendecomposition of it — confirm this is the intended way to obtain loadings.
pca = PCA()
principal_components = pca.fit_transform(correlation_matrix)

In [29]:
# Varimax rotation on the PCA loadings
# NOTE(review): PCA() is created without n_components, so every component it returns is
# rotated; presumably only the first two are needed for the map — confirm.
rotator = Rotator(method='varimax')
rotated_loadings = rotator.fit_transform(principal_components)

In [30]:
# Take the first two rotated components as weight vectors over the questions.
# Rotator.fit_transform returns a (variables x factors) loading matrix, so one
# component is a COLUMN of it. Indexing rows, as before, picked the values of a single
# question across all components and only ran because the matrix happens to be square.
pc1 = rotated_loadings[:, 0]
pc2 = rotated_loadings[:, 1]

In [31]:
# Inspect the country metadata table before it is merged with the country scores.
country_codes

In [32]:
# Respondent-level frame: country code plus the ten question columns.
data_for_pca = subset_ivs_df[['country_code'] + iv_qns].copy()

# Answer matrix taken once, before any score column is added, so both projections
# are computed from exactly the question columns.
responses = data_for_pca[iv_qns].values
data_for_pca['PC1'] = np.dot(responses, pc1)
data_for_pca['PC2'] = np.dot(responses, pc2)

# Country position on the map = mean of its respondents' scores.
score_means = data_for_pca.groupby('country_code')[['PC1', 'PC2']].mean()
country_scores_pca = score_means.reset_index()

# Colour used for each cultural region in the plots.
cultural_region_colors = {
    'African-Islamic': 'black',
    'Confucian': 'deepskyblue',
    'Latin America': 'violet',
    'Protestant Europe': 'orange',
    'Catholic Europe': 'gold',
    'English-Speaking': 'green',
    'Orthodox Europe': 'blue',
    'West & South Asia': 'brown',
}

country_scores_pca

In [33]:
# Merge the country codes DataFrame with the country scores DataFrame
# Add country names and cultural regions to the DataFrame  
# how='left' keeps every scored country; those without a matching "Numeric" code get NaN metadata.
country_scores_pca = country_scores_pca.merge(country_codes, left_on='country_code', right_on='Numeric', how='left')
country_scores_pca

In [34]:
# Cultural map: one labelled point per country, coloured by cultural region.
fig, ax = plt.subplots(figsize=(14, 10))

# Country names; countries flagged as Islamic are written in italics.
for region, color in cultural_region_colors.items():
    region_rows = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    for _, country in region_rows.iterrows():
        italic = {'fontstyle': 'italic'} if country['Islamic'] else {}
        ax.text(country['PC1'], country['PC2'], country['Country'], color=color, fontsize=10, **italic)

# One scatter series per region so that each region gets its own legend entry.
for region, color in cultural_region_colors.items():
    region_rows = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    ax.scatter(region_rows['PC1'], region_rows['PC2'], label=region, color=color)

ax.set_xlabel('Traditional vs. Secular-rational values')
ax.set_ylabel('Survival vs. Self-expression values')
ax.set_title('Inglehart-Welzel Cultural Map')

ax.legend()
ax.grid(True)
plt.show()

In [35]:
# Inspect the country-level scores together with the merged metadata.
country_scores_pca

In [92]:

# Work only with countries that have no missing value in any column.
scored_countries = country_scores_pca.dropna()
x = scored_countries['PC1']
y = scored_countries['PC2']
all_cultural_regions = scored_countries['Cultural Region']
# Integer code per country, one code per distinct region name.
categories = pd.Categorical(all_cultural_regions).codes
# Lookup from integer code to region name.
cultural_region_map = {code: region for code, region in zip(categories, all_cultural_regions)}


In [93]:
# Inspect the integer-code -> region-name lookup.
cultural_region_map

In [36]:
# Coordinates and integer region codes for the countries without missing values.
complete_scores = country_scores_pca.dropna()
x, y = complete_scores['PC1'], complete_scores['PC2']
# pd.Categorical assigns one integer code per distinct region name.
categories = pd.Categorical(complete_scores['Cultural Region']).codes
categories

In [37]:
# Map for the categories: integer code -> cultural region name.
# `categories` was computed from country_scores_pca.dropna(), so the region names must
# come from the same filtered rows. Zipping against the unfiltered column pairs codes
# with the wrong names as soon as any row has been dropped.
all_cultural_regions = country_scores_pca.dropna()['Cultural Region']
cultural_region_map = dict(zip(categories, all_cultural_regions))
cultural_region_map

In [38]:
# Classifier inputs: one (PC1, PC2) row per country and its integer region code.
# NOTE(review): `data` is rebound here; earlier cells use the same name for other objects.
data = np.column_stack((x, y)).astype(float)
labels = np.array(categories).astype(int)

In [39]:
# GridSearchCV and SVC are imported in the notebook's import cell, so they are not re-imported here.

# Define the parameter grid
param_grid_fine = {
    'C': [500, 1000, 1500, 2000],
    'gamma': [0.05, 0.1, 0.15, 0.2],
    'kernel': ['rbf']
}

# Create a SVM model
svm = SVC()

# Create a GridSearchCV object; refit=True refits the best parameter combination
# on the full data once the 5-fold search is done.
grid_search = GridSearchCV(svm, param_grid_fine, refit=True, verbose=2, cv=5)

# Fit the model
grid_search.fit(data, labels)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)


In [40]:
# Use the best parameters to train the SVM
best_svm = grid_search.best_estimator_
# Fit the best model
# NOTE(review): the search was created with refit=True, so best_estimator_ should already
# be fitted on (data, labels); this second fit looks redundant — confirm.
best_svm.fit(data, labels)

In [41]:
# Increase gamma value for RBF kernel
# This rebinds best_svm, replacing the grid-search estimator with hand-picked hyper-parameters.
# NOTE(review): C=10 is not among the C values in param_grid_fine — confirm the override is intentional.
best_svm = SVC(C=10, kernel='rbf', gamma=0.2)
best_svm.fit(data, labels)

In [43]:
# matplotlib.pyplot (plt) and matplotlib.colors (mcolors) come from the notebook's import cell.

# Create a mesh grid
h = .01  # step size in the mesh
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict classifications for each point in the mesh
Z = best_svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Integer class code -> region name. `labels` holds the pd.Categorical codes of this
# same column, so position k of `.categories` is the name that belongs to code k.
region_names = pd.Categorical(country_scores_pca.dropna()['Cultural Region']).categories
n_regions = len(region_names)

# Define color map: shade each decision region with the colour its countries are drawn
# in, so the background agrees with the scatter legend. (plt.cm.get_cmap, used
# previously, no longer exists in matplotlib >= 3.9.)
cmap = mcolors.ListedColormap([cultural_region_colors[name] for name in region_names])
# Half-integer boundaries put every integer class code in its own colour bin.
class_bounds = np.arange(-0.5, n_regions, 1)
norm = mcolors.BoundaryNorm(class_bounds, cmap.N)

# Plot the Cultural Map
plt.figure(figsize=(14, 15))

# Plot the decision boundary; explicit levels keep one filled band per class code
# instead of letting contourf choose arbitrary level values.
plt.contourf(xx, yy, Z, levels=class_bounds, cmap=cmap, norm=norm, alpha=0.3)

# Plot each cultural region with corresponding color and style
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    for i, row in subset.iterrows():
        if row['Islamic']:
            plt.text(row['PC1'], row['PC2'], row['Country'], color=color, fontsize=10, fontstyle='italic')
        else:
            plt.text(row['PC1'], row['PC2'], row['Country'], color=color, fontsize=10)

# Create a scatter plot with colored points based on cultural regions
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    plt.scatter(subset['PC1'], subset['PC2'], label=region, color=color)

# Add in labels for the Cultural Regions: write each region's name at the centroid of
# its countries. The earlier loop indexed the coordinate Series by position-as-label
# and printed the bare integer code at the location of an unrelated country.
for code in np.unique(labels):
    centroid_x, centroid_y = data[labels == code].mean(axis=0)
    plt.text(centroid_x, centroid_y, region_names[code], color=cmap(norm(code)),
             fontsize=10, ha='center', clip_on=True)

# Add legend
plt.legend()
plt.xlabel('Traditional vs. Secular-rational values')
plt.ylabel('Survival vs. Self-expression values')
plt.title('Inglehart-Welzel Cultural Map')
plt.grid(True)
# x and y axis limits
plt.xlim(0.7, 2.5)
plt.ylim(0, 7)
plt.show()

In [102]:
# Inspect the colour map used for the decision regions.
cmap

In [75]:
# Inspect the respondent-level frame with its PC1/PC2 scores.
data_for_pca

In [39]:
# Merge the country codes DataFrame with the country scores DataFrame
# Add country names and cultural regions to the DataFrame  
# The left frame is respondent-level, so every respondent row receives its country's metadata.
# Both frames contain country_code, PC1 and PC2; pandas therefore suffixes the
# respondent-level columns with _x and the country-level ones with _y.
country_scores_pca_full = data_for_pca.merge(country_scores_pca, left_on='country_code', right_on='Numeric', how='left')
country_scores_pca_full

In [40]:
# Keep the respondent-level scores (the _x columns) plus the country metadata,
# then drop the merge suffix from the score names.
kept_columns = ['PC1_x', 'PC2_x', 'Country', 'Cultural Region', 'Islamic']
country_scores_pca_full = (
    country_scores_pca_full
    .loc[:, kept_columns]
    .rename(columns={"PC1_x": "PC1", "PC2_x": "PC2"})
)
country_scores_pca_full

In [41]:
# Drop NA
# Removes respondent rows with a missing score or missing country metadata.
country_scores_pca_full = country_scores_pca_full.dropna()
country_scores_pca_full

In [42]:
# Respondent-level cultural map: one point per respondent, coloured by cultural region.
fig, ax = plt.subplots(figsize=(14, 10))

# One scatter series per region so that each region gets its own legend entry.
for region, color in cultural_region_colors.items():
    in_region = country_scores_pca_full['Cultural Region'] == region
    region_points = country_scores_pca_full[in_region]
    ax.scatter(region_points['PC1'], region_points['PC2'], label=region, color=color)

ax.set_xlabel('Traditional vs. Secular-rational values')
ax.set_ylabel('Survival vs. Self-expression values')
ax.set_title('Inglehart-Welzel Cultural Map')

ax.legend()
ax.grid(True)
plt.show()

In [94]:
# Rebuild the classifier inputs and the code -> region lookup from the complete rows.
complete_scores = country_scores_pca.dropna()
x = complete_scores['PC1']
y = complete_scores['PC2']
all_cultural_regions = complete_scores['Cultural Region']
categories = pd.Categorical(all_cultural_regions).codes

data = np.column_stack((x, y)).astype(float)
labels = np.array(categories).astype(int)

# Integer code -> region name, ordered by ascending code.
code_to_region = dict(zip(labels, all_cultural_regions))
cultural_region_map = {code: code_to_region[code] for code in sorted(code_to_region)}
# Map to colours in cultural_region_colors

In [95]:
# Inspect the code -> region lookup, now sorted by code.
cultural_region_map

In [96]:
# Region palette given as hex colour codes.
# NOTE(review): this rebinds cultural_region_colors, which an earlier cell defines with
# named colours; when the notebook runs top to bottom, only cells after this one see the
# hex palette — confirm whether the plots above should use it as well.
cultural_region_colors = {
    'African-Islamic': '#000000',
    'Confucian': '#56b4e9',
    'Latin America': '#cc79a7',
    'Protestant Europe': '#d55e00',
    'Catholic Europe': '#e69f00',
    'English-Speaking': '#009e73',
    'Orthodox Europe': '#0072b2',
    'West & South Asia': '#f0e442',
}

In [101]:
# Colours in the order of the entries of cultural_region_map (one per integer region code).
[cultural_region_colors[cultural_region] for cultural_region in  cultural_region_map.values()]