In [7]:
import pandas as pd
import numpy as np
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

In [8]:
# Load the pre-computed per-country score table (later cells read its
# PC1_rescaled, PC2_rescaled and Cultural Region columns).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
country_scores_pca = pd.read_pickle('../data/country_scores_pca.pkl')

In [9]:
# Plot colour for every cultural region, keyed by the region name used in the
# 'Cultural Region' column. Insertion order matters: plotting loops iterate
# this mapping, so it fixes the legend order.
# The eight hex codes match the Okabe-Ito colour-blind-safe palette.
cultural_region_colors = dict([
    ('African-Islamic', '#000000'),
    ('Confucian', '#56b4e9'),
    ('Latin America', '#cc79a7'),
    ('Protestant Europe', '#d55e00'),
    ('Catholic Europe', '#e69f00'),
    ('English-Speaking', '#009e73'),
    ('Orthodox Europe', '#0072b2'),
    ('West & South Asia', '#f0e442'),
])

In [10]:
# Keep only complete rows (dropna() with no subset drops a row if ANY column is
# missing) and the three columns needed for the classifier.
# The explicit .copy() makes `data` an independent frame, so adding columns to
# it later does not raise SettingWithCopyWarning.
data = country_scores_pca.dropna()[["PC1_rescaled", "PC2_rescaled", "Cultural Region"]].copy()
data

In [11]:
# Encode each cultural region as an integer class label (the category code).
# assign() returns a new frame instead of writing into `data` in place, which
# avoids SettingWithCopyWarning on the column-subset frame and keeps the cell
# safe to re-run.
data = data.assign(label=pd.Categorical(data['Cultural Region']).codes)
data

In [12]:
# One row per (label, Cultural Region) pair, ordered by label.
tups = (
    data[['label', 'Cultural Region']]
    .drop_duplicates()
    .sort_values(by='label')
)
# Attach each region's plot colour, then renumber the rows 0..n-1.
tups = (
    tups
    .assign(color=tups['Cultural Region'].map(cultural_region_colors))
    .reset_index(drop=True)
)
tups

In [13]:
# Feature columns (kept as pandas Series) and integer class labels (ndarray).
x, y = data['PC1_rescaled'], data['PC2_rescaled']
labels = data['label'].to_numpy().astype(int)

In [14]:
# Pair x and y column-wise into an (n_samples, 2) float matrix.
train_data = np.stack([x, y], axis=1).astype(float)

In [15]:
# Discrete colormap whose i-th entry is the colour in row i of `tups`
# (rows are ordered by label).
region_colour_list = tups['color'].to_list()
cmap = mcolors.ListedColormap(region_colour_list)
cmap

In [17]:
# Display the current `data` frame for inspection.
data

In [18]:
# Display the integer class-label array for inspection.
labels

In [None]:
def label_map(label):
    """Return the cultural-region name for an integer class label.

    The lookup is built from the notebook-level ``tups`` frame, which holds
    one row per (label, Cultural Region) pair.

    NOTE(review): the previous body read a global ``cultural_region_map`` that
    is not defined in any visible cell; a label -> region-name mapping is
    assumed to be what was intended -- confirm.

    Parameters
    ----------
    label : int
        Class label as stored in the ``label`` column.

    Returns
    -------
    The matching value from the ``Cultural Region`` column.

    Raises
    ------
    KeyError
        If ``label`` does not occur in ``tups``.
    """
    # Built on each call so the function always reflects the current `tups`.
    region_by_label = dict(zip(tups['label'], tups['Cultural Region']))
    return region_by_label[label]

In [17]:
# Dense ("full") coordinate grid over [-5, 5] x [-5, 5], 101 samples per axis.
# NOTE: this rebinds the notebook-level names x and y.
x = np.linspace(-5, 5, num=101)
y = np.linspace(-5, 5, num=101)
xx, yy = np.meshgrid(x, y)
# Distance of every grid point from the origin.
zz = np.sqrt(np.square(xx) + np.square(yy))
xx.shape, yy.shape, zz.shape

In [18]:
# Sparse coordinate arrays: meshgrid returns a (1, nx) row and an (ny, 1)
# column, and broadcasting in the arithmetic below expands them to the full
# (ny, nx) grid.
xs, ys = np.meshgrid(x, y, sparse=True)
zs = np.sqrt(xs**2 + ys**2)
# Check the expected shapes explicitly rather than leaving them as discarded
# expressions; this also catches a stale x / y left over from another cell.
assert (xs.shape, ys.shape, zs.shape) == ((1, 101), (101, 1), (101, 101))
# The sparse grid must give exactly the same surface as the dense one.
np.array_equal(zz, zs)

In [19]:
# Filled-contour plot of the zs surface using the region colormap.
h = plt.contourf(x, y, zs, cmap=cmap)
# Equal data scaling on both axes.
plt.axis('scaled')
# Colour scale for the contour set drawn above.
plt.colorbar(h)
plt.show()

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from factor_analyzer import Rotator
from ppca import PPCA
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import matplotlib.colors as mcolors
from scipy.ndimage import center_of_mass

# Read the survey frame and the country metadata table from pickle.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
ivs_df = pd.read_pickle("../data/ivs_df.pkl")
country_codes = pd.read_pickle("../data/country_codes.pkl")

############################################
######## Data Preparation ##################
############################################

# Column groups: survey metadata, the weight column, and the ten question
# columns that feed the component analysis.
meta_col = ["S020", "S003"]
weights = ["S017"]
iv_qns = ["A008", "A165", "E018", "E025", "F063", "F118", "F120", "G006", "Y002", "Y003"]

# Select those columns, give the metadata readable names, and keep only rows
# whose year is 2005 or later.
subset_ivs_df = (
    ivs_df[meta_col + weights + iv_qns]
    .rename(columns={'S020': 'year', 'S003': 'country_code', 'S017': 'weight'})
    .loc[lambda frame: frame["year"] >= 2005]
)

############################################
######## Data Pre-Processing ###############
############################################

# Keep rows that have at least 6 non-missing values among the question columns.
subset_ivs_df = subset_ivs_df.dropna(subset=iv_qns, thresh=6)

############################################
################# PPCA #####################
############################################

# Fit a 2-component probabilistic PCA on the question columns and project the
# fitted data onto those components.
ppca = PPCA()
ppca.fit(subset_ivs_df[iv_qns].to_numpy(), d=2, min_obs=1, verbose=True)
principal_components = ppca.transform()

# NOTE(review): the varimax-rotated scores are computed here but nothing in
# this cell reads them -- the frame below is built from the UNROTATED
# principal_components. Confirm that this is intended.
rotator = Rotator(method='varimax')
rotated_components = rotator.fit_transform(principal_components)

# Component scores, their linearly rescaled versions, and each row's country
# code. The country codes are attached by position, so this relies on
# transform() returning one row per row of subset_ivs_df.
ppca_df = pd.DataFrame(principal_components, columns=["PC1", "PC2"]).assign(
    PC1_rescaled=lambda scores: 1.81 * scores['PC1'] + 0.38,
    PC2_rescaled=lambda scores: 1.61 * scores['PC2'] - 0.01,
    country_code=subset_ivs_df["country_code"].values,
)
# Left-join the country metadata on the numeric country code.
ppca_df = ppca_df.merge(country_codes, left_on='country_code', right_on='Numeric', how='left')
# Drop rows that lack either rescaled score.
valid_data = ppca_df.dropna(subset=['PC1_rescaled', 'PC2_rescaled'])

############################################
############# Mean Points ##################
############################################

# Average the two rescaled scores within each country.
country_mean_scores = (
    valid_data
    .groupby('country_code')[['PC1_rescaled', 'PC2_rescaled']]
    .mean()
    .reset_index()
)
# Attach country metadata and discard countries with no metadata match.
country_scores_pca = (
    country_mean_scores
    .merge(country_codes, left_on='country_code', right_on='Numeric', how='left')
    .dropna(subset=['Numeric'])
)


############################################
############# Visualization ################
############################################

# Plot colour per cultural region; iteration order fixes the legend order.
cultural_region_colors = {
    'African-Islamic': '#000000',
    'Confucian': '#56b4e9',
    'Latin America': '#cc79a7',
    'Protestant Europe': '#d55e00',
    'Catholic Europe': '#e69f00',
    'English-Speaking': '#009e73',
    'Orthodox Europe': '#0072b2',
    'West & South Asia': '#f0e442',
}

plt.figure(figsize=(14, 10))

# One pass per region: country name labels, the scatter of country means, and
# a bold region label at the region's centroid.
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]

    for _, row in subset.iterrows():
        # Country names are italicised when the row's 'Islamic' flag is truthy.
        plt.text(
            row['PC1_rescaled'], row['PC2_rescaled'], row['Country'],
            color=color, fontsize=10,
            fontstyle='italic' if row['Islamic'] else 'normal',
        )

    # Always drawn (even for an empty region) so every region is in the legend.
    plt.scatter(subset['PC1_rescaled'], subset['PC2_rescaled'], label=region, color=color)

    # A region with no countries has a NaN centroid; skip its label instead of
    # placing text at a non-finite position.
    if not subset.empty:
        region_center = subset[['PC1_rescaled', 'PC2_rescaled']].mean()
        plt.text(region_center['PC1_rescaled'], region_center['PC2_rescaled'], region,
                 color=color, fontsize=12, weight='bold')

plt.xlabel('Survival vs. Self-Expression Values')
plt.ylabel('Traditional vs. Secular Values')
plt.title('Inglehart-Welzel Cultural Map')
plt.legend()
plt.grid(True)

plt.show()

############################################
######## DB Visualization Prep #############
############################################

# Complete rows only (dropna() with no subset drops a row if ANY column is
# missing), restricted to the classifier inputs. The .copy() makes vis_data an
# independent frame so the column assignment below does not raise
# SettingWithCopyWarning.
vis_data = country_scores_pca.dropna()[["PC1_rescaled", "PC2_rescaled", "Cultural Region"]].copy()
# Integer class label = category code of the region name.
vis_data['label'] = pd.Categorical(vis_data['Cultural Region']).codes

# One row per (label, region) pair, ordered by label, with the region colour.
tups = (
    vis_data[['label', 'Cultural Region']]
    .drop_duplicates()
    .sort_values(by='label')
    .reset_index(drop=True)
)
tups['color'] = tups['Cultural Region'].map(cultural_region_colors)

# Fail early with a clear message instead of building a colormap with NaNs.
missing_color = tups.loc[tups['color'].isna(), 'Cultural Region'].tolist()
if missing_color:
    raise ValueError(f"No colour configured for cultural region(s): {missing_color}")

# Discrete colormap: entry i is the colour of class label i.
cmap = mcolors.ListedColormap(tups['color'].to_list())


In [42]:
############################################
########## Visualization (SVC) #############
############################################

# Training set: the two rescaled scores as an (n_samples, 2) float matrix and
# the integer region labels.
x = vis_data['PC1_rescaled']
y = vis_data['PC2_rescaled']
train_data = np.column_stack((x, y)).astype(float)
labels = np.array(vis_data['label']).astype(int)

# Hyper-parameter grid for the RBF-kernel SVM.
param_grid_fine = {
    'C': [500, 1000, 1500, 2000],
    'gamma': [0.05, 0.1, 0.15, 0.2],
    'kernel': ['rbf']
}

svm = SVC()
grid_search = GridSearchCV(svm, param_grid_fine, refit=True, verbose=2, cv=5)
grid_search.fit(train_data, labels)
print("Best parameters found: ", grid_search.best_params_)
# With refit=True the best estimator has already been refit on the whole
# training set, so no additional fit() call is needed.
best_svm = grid_search.best_estimator_

# Evaluation grid: step h, padded by 1 unit beyond the data on every side.
h = .01
x_min, x_max = train_data[:, 0].min() - 1, train_data[:, 0].max() + 1
y_min, y_max = train_data[:, 1].min() - 1, train_data[:, 1].max() + 1
grid_x = np.arange(x_min, x_max, h)
grid_y = np.arange(y_min, y_max, h)

# Dense grid: predict() needs one explicit (x, y) row per grid point.
xx, yy = np.meshgrid(grid_x, grid_y)
Z = best_svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# sparse coordinate arrays
xs, ys = np.meshgrid(grid_x, grid_y, sparse=True)
# xs is (1, nx) and ys is (ny, 1); stacking their ravels pairs an nx-vector
# with an ny-vector, which raises when nx != ny and otherwise samples only the
# grid diagonal. predict() cannot broadcast, and the sparse arrays describe
# exactly the dense grid above, so reuse its predictions instead of paying
# for a second pass over the grid.
zs = Z.copy()


In [44]:
# Display the (label, Cultural Region, color) lookup table for inspection.
tups

In [31]:
# Rebuild the discrete colormap from the colour column of `tups`, in row order.
cmap = mcolors.ListedColormap(list(tups['color']))
cmap

In [43]:
plt.figure(figsize=(14, 10))

# Z holds integer class labels 0..n-1. Put the band edges at the half-integers
# so that every class gets its own band and band k is drawn with colormap
# entry k. Automatically chosen levels do not line up with the classes.
n_classes = len(tups)
class_levels = np.arange(n_classes + 1) - 0.5
contour = plt.contourf(xx, yy, Z, levels=class_levels, alpha=1, cmap=cmap)

for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    # White marker edge keeps points visible on a background of the same colour.
    plt.scatter(subset['PC1_rescaled'], subset['PC2_rescaled'], label=region, color=color,
                edgecolors='white', linewidths=0.8)

plt.xlabel('Survival vs. Self-Expression Values')
plt.ylabel('Traditional vs. Secular Values')
plt.title('Inglehart-Welzel Cultural Map with SVM Decision Boundary (SVC)')
plt.legend()
plt.grid(True)

# Add region labels
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    if subset.empty:
        # No countries -> NaN centroid; nothing sensible to label.
        continue
    region_center = subset[['PC1_rescaled', 'PC2_rescaled']].mean()
    # Semi-opaque white box so the label stays readable on its own region colour.
    plt.text(region_center['PC1_rescaled'], region_center['PC2_rescaled'], region,
             color=color, fontsize=12, weight='bold',
             bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

# Pass the contour set explicitly: plt.scatter() makes its own collection the
# "current" mappable, so a bare plt.colorbar() would not describe the regions.
cbar = plt.colorbar(contour, ticks=tups['label'].to_list())
# Label each band with its region name instead of a bare class number.
cbar.set_ticklabels(tups['Cultural Region'].to_list())
plt.show()

In [45]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets


def make_meshgrid(x, y, h=.02):
    """Build a rectangular evaluation grid that covers the data with a margin.

    Each axis runs from one unit below the data minimum up to (but excluding)
    one unit above the data maximum, in steps of ``h``.

    Parameters
    ----------
    x: array whose min/max define the horizontal extent
    y: array whose min/max define the vertical extent
    h: grid spacing, optional

    Returns
    -------
    xx, yy : ndarray
        Coordinate matrices as produced by ``np.meshgrid``.
    """
    margin = 1
    horizontal = np.arange(x.min() - margin, x.max() + margin, h)
    vertical = np.arange(y.min() - margin, y.max() + margin, h)
    grid_x, grid_y = np.meshgrid(horizontal, vertical)
    return grid_x, grid_y


def plot_contours(ax, clf, xx, yy, **params):
    """Draw a classifier's predictions over a grid as filled contours.

    Parameters
    ----------
    ax: matplotlib axes object to draw on
    clf: a fitted classifier exposing ``predict``
    xx: meshgrid ndarray of x coordinates
    yy: meshgrid ndarray of y coordinates
    params: extra keyword arguments forwarded to ``ax.contourf``, optional

    Returns
    -------
    Whatever ``ax.contourf`` returns.
    """
    # One (x, y) row per grid point, in the row-major order of the grid.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    predictions = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predictions, **params)


# Demo data: the iris set, restricted to its first two features so the
# decision regions can be drawn in the plane.
# NOTE: this rebinds the notebook-level names y, xx and yy.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Four SVM variants sharing one regularization strength. The features are
# deliberately left unscaled.
C = 1.0  # SVM regularization parameter
models = (
    svm.SVC(kernel='linear', C=C),
    svm.LinearSVC(C=C),
    svm.SVC(kernel='rbf', gamma=0.7, C=C),
    svm.SVC(kernel='poly', degree=3, C=C),
)
# Lazy: each model is fitted when the plotting loop below pulls it.
models = (clf.fit(X, y) for clf in models)

# Panel titles, in the same order as the models.
titles = (
    'SVC with linear kernel',
    'LinearSVC (linear kernel)',
    'SVC with RBF kernel',
    'SVC with polynomial (degree 3) kernel',
)

# 2x2 panel layout with extra spacing between panels.
fig, sub = plt.subplots(nrows=2, ncols=2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

# Shared evaluation grid built from the two plotted features.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

for clf, title, ax in zip(models, titles, sub.flatten()):
    # Decision regions first, then the training points on top.
    plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    # Empty tick tuples hide the axis ticks.
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()