In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from factor_analyzer import Rotator
from ppca import PPCA
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import matplotlib.colors as mcolors
from typing import List
import os
import glob

In [3]:
# ---- Inputs ---------------------------------------------------------------
# Survey responses and the country lookup table, both stored as pickles.
# NOTE: unpickling executes code embedded in the file; only load trusted data.
ivs_df = pd.read_pickle("../data/ivs_df.pkl")
country_codes = pd.read_pickle("../data/country_codes.pkl")

# ---- Column groups selected from the survey frame ---------------------------
# Metadata columns (renamed to year / country_code in the filtering cell).
meta_col = ["S020", "S003"]
# Weight column (renamed to weight in the filtering cell).
weights = ["S017"]
# The ten IVS questions that form the basis of the Inglehart-Welzel Cultural Map.
iv_qns = [
    "A008", "A165", "E018", "E025", "F063",
    "F118", "F120", "G006", "Y002", "Y003",
]

# ---- Plot colours, keyed by cultural region ---------------------------------
# Key order is kept as-is because the plotting loops iterate this dict.
cultural_region_colors = {
    'African-Islamic': '#000000',
    'Confucian': '#56b4e9',
    'Latin America': '#cc79a7',
    'Protestant Europe': '#d55e00',
    'Catholic Europe': '#e69f00',
    'English-Speaking': '#009e73',
    'Orthodox Europe': '#0072b2',
    'West & South Asia': '#f0e442',
    'AI Model': "#bada55",
}

# ---- Model objects and rescaling constants ----------------------------------
ppca = PPCA()
rotator = Rotator(method='varimax')
# (scale, offset) per component; the PPCA cell applies the same numbers as
# 1.81 * PC1 + 0.38 and 1.61 * PC2 - 0.01.
pc_rescale_params = {'PC1': (1.81, 0.38), 'PC2': (1.61, -0.01)}

In [4]:
# Build the working survey frame in one pass:
#   1. keep only the metadata, weight and question columns;
#   2. give the coded metadata/weight columns readable names;
#   3. keep survey years from 2005 onwards (the most recent waves, so the
#      analysis reflects current attitudes);
#   4. keep respondents with at least 6 non-missing answers among the ten
#      question columns.
# The weight column is carried along but is not multiplied into the answers here.
subset_ivs_df = (
    ivs_df[meta_col + weights + iv_qns]
    .rename(columns={'S020': 'year', 'S003': 'country_code', 'S017': 'weight'})
    .loc[lambda frame: frame["year"] >= 2005]
    .dropna(subset=iv_qns, thresh=6)
)
subset_ivs_df

In [5]:
def Y002_transform(ans: (int, int)):
    """Collapse a (first choice, second choice) pair into the Y002 code.

    Returns -5 when either choice is negative, 1 when the two choices are
    1 and 3 (in either order), 3 when they are 2 and 4 (in either order),
    and 2 for every other combination.
    """
    first_choice, second_choice = ans[0], ans[1]

    # A negative code on either choice makes the index undefined.
    if min(first_choice, second_choice) < 0 and (first_choice < 0 or second_choice < 0):
        return -5

    chosen = {first_choice, second_choice}
    if chosen == {1, 3}:
        return 1
    if chosen == {2, 4}:
        return 3
    return 2

def Y003_transform(ans: List[int]):
    """Compute the Y003 score from the list of selected option numbers.

    `ans` lists the options that were picked, e.g. [6, 7, 8, 9, 10].
    Options 1..11 correspond to items q7..q17; a picked option is coded 1
    and an unpicked one 2. The score is (q15 + q17) - (q8 + q14).

    -5 is returned if any of the four items used is negative; with the 1/2
    coding produced here that guard never triggers.
    """
    item_names = [f"q{number}" for number in range(7, 18)]
    # Option k (1-based) belongs to the k-th item name.
    coded = {
        item: (1 if option in ans else 2)
        for option, item in enumerate(item_names, start=1)
    }

    if any(coded[item] < 0 for item in ("q15", "q17", "q8", "q14")):
        return -5

    return coded["q15"] + coded["q17"] - (coded["q8"] + coded["q14"])

In [8]:
# Gather every pickled response file in the collection directory.
# Sorting makes the file order (and therefore which answers get paired into
# the same row below) reproducible; glob alone returns files in arbitrary order.
# NOTE: unpickling executes code embedded in the file; only load trusted data.
path = '../data/collection'
all_files = sorted(glob.glob(os.path.join(path, "*.pkl")))
if not all_files:
    raise ValueError(f"No .pkl files found in {path!r}")
df = pd.concat((pd.read_pickle(f) for f in all_files), ignore_index=True)

# Turn the long (llm, question, response) table into one row per complete
# questionnaire: the k-th row of a model holds its k-th response to each
# question. zip() stops at the question with the fewest responses, so only
# rounds in which every question has an answer are produced.
result = []
for name, group in df.groupby("llm"):
    answers_per_question = [
        group.loc[group["question"] == question, "response"].tolist()
        for question in iv_qns
    ]
    for answers in zip(*answers_per_question):
        result.append({"llm": name, **dict(zip(iv_qns, answers))})

# Explicit columns keep the frame well-formed even when no round is complete.
pivot_df = pd.DataFrame(result, columns=["llm", *iv_qns])
# Discard rounds in which a stored response is itself missing.
pivot_df = pivot_df.dropna()
# Y002 and Y003 are stored as raw choices and are reduced to their index values.
pivot_df['Y002'] = pivot_df['Y002'].map(Y002_transform).astype("float64")
pivot_df['Y003'] = pivot_df['Y003'].map(Y003_transform).astype("float64")
# Add year as 2024
pivot_df["year"] = 2024
# Add weight 1
pivot_df["weight"] = 1
pivot_df

In [7]:
# Scratch inspection: render subset_ivs_df as this cell's output.
subset_ivs_df

In [11]:
# One metadata row per distinct model found in the collected answers.
llm_meta = pd.DataFrame({"llm": pivot_df["llm"].unique()})
# Hand out consecutive numeric codes that start 10 above the largest code
# already present in the country table.
first_llm_code = country_codes["Numeric"].max() + 10
llm_meta["Numeric"] = list(range(first_llm_code, first_llm_code + len(llm_meta)))
# Attach each model's code to its answer rows under the survey's column name.
llm_data = (
    pivot_df.copy()
    .merge(llm_meta, on="llm", how="left")
    .rename(columns={"Numeric": "country_code"})
)
llm_data

In [12]:
# Model names treated as Chinese LLMs. A few names are listed twice; that has
# no effect on the isin() lookup below.
chinese_llms = [
    "wangshenzhi/gemma2-27b-chinese-chat",  # Worked decently well
    "qwen2:7b",
    "llama2-chinese:13b",
    "wangrongsheng/llama3-70b-chinese-chat",  # Refusal rate is high
    "yi:34b",  # just gives "."
    "aquilachat2:34b",  # Gives '。' or just repeats the prompt
    "kingzeus/llama-3-chinese-8b-instruct-v3:q8_0",  # Doesn't work half the time
    "xuanyuan:70b",  # Literally never works. Unintelligible output
    "glm4:9b",  # Just gives "."
    "llama2-chinese:13b",
    "qwen2:7b",
    "wangrongsheng/llama3-70b-chinese-chat",
]

# Shape the model table like the country table: every model sits in the
# "AI Model" region, its name goes into the Country column, and it is flagged
# as a non-Islamic LLM.
llm_meta["Cultural Region"] = "AI Model"
llm_meta = llm_meta.rename(columns={"llm": "Country"})
for flag_column, flag_value in (("Islamic", False), ("llm", True)):
    llm_meta[flag_column] = flag_value
llm_meta["Chinese LLM"] = llm_meta["Country"].isin(chinese_llms)

# Rows already in the country table get both LLM flags set to False.
for flag_column in ("llm", "Chinese LLM"):
    country_codes[flag_column] = False

# Append the model answers to the survey rows and the model metadata to the
# country table. Both targets are rebuilt from themselves, so re-running this
# cell appends the model rows again.
subset_ivs_df = pd.concat([subset_ivs_df, llm_data], ignore_index=True)
country_codes = pd.concat([country_codes, llm_meta], ignore_index=True)

In [13]:
# Scratch inspection: render country_codes as this cell's output.
country_codes

In [15]:
# Number of distinct values in the Numeric column (a missing value, if any,
# counts as one value) - a quick check that the codes do not collide.
country_codes["Numeric"].nunique(dropna=False)

In [14]:
# Scratch inspection: render llm_data as this cell's output.
llm_data

In [15]:
# Scratch inspection: render subset_ivs_df as this cell's output.
subset_ivs_df

In [7]:



def collect_llm_data(path: str = '../data/collection'):
    """Load the collected LLM answers and reshape them into questionnaire rows.

    Every ``*.pkl`` file in ``path`` is read and concatenated. The combined
    frame is expected to carry ``llm``, ``question`` and ``response`` columns.
    For each model, the k-th response to each question in ``iv_qns`` is
    combined into the k-th output row; rounds in which some question has no
    response left are not produced, and rows whose stored response is missing
    are dropped. Y002 and Y003 are then reduced with ``Y002_transform`` and
    ``Y003_transform``.

    Parameters
    ----------
    path : str
        Directory holding the pickled response files. Defaults to the
        notebook's collection directory.

    Returns
    -------
    pandas.DataFrame
        One row per complete questionnaire with the columns ``llm`` followed
        by the questions in ``iv_qns``.

    Raises
    ------
    ValueError
        If ``path`` contains no ``*.pkl`` file.
    """
    # Sorted so the pairing of answers into rows does not depend on the
    # arbitrary order in which glob lists the directory.
    # NOTE: unpickling executes code embedded in the file; only load trusted data.
    all_files = sorted(glob.glob(os.path.join(path, "*.pkl")))
    if not all_files:
        raise ValueError(f"No .pkl files found in {path!r}")
    df = pd.concat((pd.read_pickle(f) for f in all_files), ignore_index=True)

    result = []
    for name, group in df.groupby("llm"):
        # Responses per question, in the order they appear for this model.
        answers_per_question = [
            group.loc[group["question"] == question, "response"].tolist()
            for question in iv_qns
        ]
        # zip() stops at the question with the fewest responses, i.e. it
        # yields exactly the rounds in which every question was answered.
        for answers in zip(*answers_per_question):
            result.append({"llm": name, **dict(zip(iv_qns, answers))})

    # Explicit columns keep the frame well-formed even when no round is complete.
    pivot_df = pd.DataFrame(result, columns=["llm", *iv_qns]).dropna()
    pivot_df['Y002'] = pivot_df['Y002'].map(Y002_transform).astype("float64")
    pivot_df['Y003'] = pivot_df['Y003'].map(Y003_transform).astype("float64")
    return pivot_df

In [8]:
# Build the per-model questionnaire rows from the pickles in the collection directory.
llm_data = collect_llm_data()

In [9]:
# Scratch inspection: render llm_data as this cell's output.
llm_data

In [39]:
# One metadata row per distinct model found in the collected answers.
llm_meta = pd.DataFrame(llm_data["llm"].unique(), columns=["llm"])
# Give every model a numeric code that is NOT already used in country_codes.
# The range has to start above the current maximum: starting at max() itself
# would hand the first model the code of an existing country. The +10 offset
# matches the one used where llm_meta is first built earlier in the notebook.
first_llm_code = country_codes["Numeric"].max() + 10
llm_meta["Numeric"] = list(range(first_llm_code, first_llm_code + len(llm_meta)))
llm_meta["Cultural Region"] = "AI Model"
# The country table stores display names under "Country", so the model name goes there.
llm_meta = llm_meta.rename(columns={"llm": "Country"})
llm_meta["Islamic"] = False
llm_meta["llm"] = True
# Model names treated as Chinese LLMs (each name listed once).
chinese_llms = [
    "wangshenzhi/gemma2-27b-chinese-chat",  # Worked decently well
    "qwen2:7b",
    "llama2-chinese:13b",
    "wangrongsheng/llama3-70b-chinese-chat",  # Refusal rate is high
    "yi:34b",  # just gives "."
    "aquilachat2:34b",  # Gives '。' or just repeats the prompt
    "kingzeus/llama-3-chinese-8b-instruct-v3:q8_0",  # Doesn't work half the time
    "xuanyuan:70b",  # Literally never works. Unintelligible output
    "glm4:9b",  # Just gives "."
]
llm_meta["Chinese LLM"] = llm_meta["Country"].isin(chinese_llms)
llm_meta

In [32]:
# Scratch inspection: render country_codes as this cell's output.
country_codes

In [40]:
# Add the two LLM flag columns to the country table, set to False for every
# row currently in it (this mutates country_codes in place).
country_codes["llm"] = False
country_codes["Chinese LLM"] = False

In [41]:
# Scratch inspection: render country_codes (now with the flag columns) as this cell's output.
country_codes

In [43]:
# Preview only: the stacked country + model metadata is displayed but not
# assigned, so country_codes itself is left unchanged by this cell.
pd.concat([country_codes, llm_meta], ignore_index=True)

In [10]:
# Scratch inspection: render subset_ivs_df as this cell's output.
subset_ivs_df

In [13]:
# Set country_code as 999 for LLMs
llm_data["country_code"] = 999
# concat llm_data and subset_ivs_df
subset_ivs_df = pd.concat([subset_ivs_df, llm_data], ignore_index=True)

In [14]:
############################################
######## Data Pre-Processing ###############
############################################

# Weighting is currently disabled; the line below is kept for reference only.
# subset_ivs_df[iv_qns] = subset_ivs_df[iv_qns].multiply(subset_ivs_df["weight"], axis=0)
# Keep only rows with at least 6 non-missing values among the iv_qns columns.
subset_ivs_df = subset_ivs_df.dropna(subset=iv_qns, thresh=6)

In [18]:
############################################
################# PPCA #####################
############################################

# Imputing missing answers could bias the component estimates, so a
# probabilistic PCA (PPCA) is fitted instead; it is used here because the
# question columns still contain missing values after the threshold filter.
ppca = PPCA()
ppca.fit(subset_ivs_df[iv_qns].to_numpy(), d=2, min_obs=1, verbose=True)
# Two-dimensional scores for the rows the model was fitted on.
principal_components = ppca.transform()

# Apply the varimax rotation to the two components.
rotator = Rotator(method='varimax')
rotated_components = rotator.fit_transform(principal_components)

# Build the score frame from the ROTATED components. Previously the rotation
# was computed and then discarded, so every downstream score was unrotated.
# NOTE(review): this changes all downstream coordinates - confirm that rotated
# scores are what the published map coordinates are meant to be based on.
ppca_df = pd.DataFrame(rotated_components, columns=["PC1", "PC2"])
# Step 5: rescale the component scores (same coefficients as the
# pc_rescale_params constant defined in the configuration cell).
ppca_df['PC1_rescaled'] = 1.81 * ppca_df['PC1'] + 0.38
ppca_df['PC2_rescaled'] = 1.61 * ppca_df['PC2'] - 0.01
# Carry over the identifying columns; .values copies them by position because
# ppca_df has a fresh 0..n-1 index while subset_ivs_df may not.
ppca_df["country_code"] = subset_ivs_df["country_code"].values
ppca_df["llm"] = subset_ivs_df["llm"].values


In [19]:
# Scratch inspection: render ppca_df as this cell's output.
ppca_df

In [20]:
# Attach country metadata to every score row (left join keeps rows whose code
# has no match in the country table).
ppca_df = pd.merge(
    ppca_df,
    country_codes,
    how='left',
    left_on='country_code',
    right_on='Numeric',
)
# Rows carrying the placeholder code 999 are LLM answers: label their region
# as "AI Model" and use the model name as their Country.
is_llm_row = ppca_df['country_code'] == 999
ppca_df.loc[is_llm_row, 'Cultural Region'] = 'AI Model'
ppca_df.loc[is_llm_row, 'Country'] = ppca_df['llm']
# Keep only rows whose rescaled component scores are both defined.
valid_data = ppca_df.dropna(subset=['PC1_rescaled', 'PC2_rescaled'])
# Display the result (nothing is written to disk here).
valid_data

In [21]:
# Model names treated as Chinese LLMs (each name listed once).
chinese_llms = [
    "wangshenzhi/gemma2-27b-chinese-chat",  # Worked decently well
    "qwen2:7b",
    "llama2-chinese:13b",
    "wangrongsheng/llama3-70b-chinese-chat",  # Refusal rate is high
    "yi:34b",  # just gives "."
    "aquilachat2:34b",  # Gives '。' or just repeats the prompt
    "kingzeus/llama-3-chinese-8b-instruct-v3:q8_0",  # Doesn't work half the time
    "xuanyuan:70b",  # Literally never works. Unintelligible output
    "glm4:9b",  # Just gives "."
]
# Flag a row only when its model name is in the list. The previous chained
# assignment (`a = b = True`) first set the whole column to True, so every
# row - survey respondents included - ended up flagged as a Chinese LLM.
# Rows without a model name evaluate to False.
ppca_df["Chinese LLM"] = ppca_df["llm"].isin(chinese_llms)
ppca_df

In [None]:

############################################
############# Mean Points ##################
############################################

# Step 7: average the rescaled scores of all rows sharing a country code.
score_columns = ['PC1_rescaled', 'PC2_rescaled']
country_mean_scores = (
    valid_data
    .groupby('country_code')[score_columns]
    .mean()
    .reset_index()
)
# Bring in country names and cultural regions, then discard codes that have
# no entry in the country table (their Numeric is NaN after the left join).
country_scores_pca = (
    country_mean_scores
    .merge(country_codes, how='left', left_on='country_code', right_on='Numeric')
    .dropna(subset=['Numeric'])
)
# Persist the per-country scores for later use.
country_scores_pca.to_pickle("../data/country_scores_pca.pkl")

In [None]:

############################################
############# Visualization ################
############################################

# Colour per cultural region. This redefinition has no 'AI Model' entry, so
# only the eight regions listed here are drawn by the loops below.
cultural_region_colors = {
    'African-Islamic': '#000000',
    'Confucian': '#56b4e9',
    'Latin America': '#cc79a7',
    'Protestant Europe': '#d55e00',
    'Catholic Europe': '#e69f00',
    'English-Speaking': '#009e73',
    'Orthodox Europe': '#0072b2',
    'West & South Asia': '#f0e442',
}

fig, ax = plt.subplots(figsize=(14, 10))

# First pass: write each country's name at its position, in its region's
# colour; countries flagged as Islamic are set in italics.
for region, color in cultural_region_colors.items():
    members = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    for _, country in members.iterrows():
        text_style = {'color': color, 'fontsize': 10}
        if country['Islamic']:
            text_style['fontstyle'] = 'italic'
        ax.text(country['PC1_rescaled'], country['PC2_rescaled'], country['Country'], **text_style)

# Second pass: one scatter series per region, which also supplies the legend entries.
for region, color in cultural_region_colors.items():
    members = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    ax.scatter(members['PC1_rescaled'], members['PC2_rescaled'], label=region, color=color)

ax.set_xlabel('Survival vs. Self-Expression Values')
ax.set_ylabel('Traditional vs. Secular Values')
ax.set_title('Inglehart-Welzel Cultural Map')

ax.legend()
ax.grid(True)
plt.show()

In [None]:
############################################
######## DB Visualization Prep #############
############################################

# Training table: the two rescaled scores plus the region, restricted to
# countries with no missing value in any column, with an integer class label
# derived from the region name.
vis_data = (
    country_scores_pca
    .dropna()
    .loc[:, ["PC1_rescaled", "PC2_rescaled", "Cultural Region"]]
    .assign(label=lambda frame: pd.Categorical(frame['Cultural Region']).codes)
)

# Lookup table with one row per class, ordered by label, carrying the
# region's plot colour.
tups = (
    vis_data[['label', 'Cultural Region']]
    .drop_duplicates()
    .sort_values(by='label')
    .reset_index(drop=True)
)
tups['color'] = tups['Cultural Region'].map(cultural_region_colors)

# Colour map whose i-th entry is the colour of class label i.
cmap = mcolors.ListedColormap(tups['color'].values)


In [None]:

############################################
########## Visualization (SVC) #############
############################################

# Feature matrix: one row per country, columns (PC1_rescaled, PC2_rescaled).
x = vis_data['PC1_rescaled']
y = vis_data['PC2_rescaled']
train_data = np.column_stack((x, y)).astype(float)

# Integer class label per country (one class per cultural region).
labels = np.array(vis_data['label']).astype(int)

# Hyper-parameter grid searched for the RBF-kernel SVM.
param_grid_fine = {
    'C': [500, 1000, 1500, 2000],
    'gamma': [0.05, 0.1, 0.15, 0.2],
    'kernel': ['rbf']
}

svm = SVC()
# 5-fold cross-validated grid search.
grid_search = GridSearchCV(svm, param_grid_fine, refit=True, verbose=2, cv=5)
grid_search.fit(train_data, labels)
print("Best parameters found: ", grid_search.best_params_)
# With refit=True the best estimator has already been re-trained on the full
# data set, so no additional fit call is needed.
best_svm = grid_search.best_estimator_

# Evaluation grid covering the data range plus a margin of 1 on every side.
h = .01  # step size in the mesh
x_min, x_max = train_data[:, 0].min() - 1, train_data[:, 0].max() + 1
y_min, y_max = train_data[:, 1].min() - 1, train_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predicted class for every grid point, reshaped back onto the grid.
Z = best_svm.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled decision regions. contourf colours the bands BETWEEN consecutive
# levels, so n classes need n + 1 boundaries. Half-integer boundaries
# (-0.5, 0.5, ..., n - 0.5) put exactly one class label in each band, so the
# i-th colour is used for class i. Passing the n labels themselves as levels
# yields only n - 1 bands and misaligns the colours with the regions.
plt.figure(figsize=(14, 10))
region_levels = np.arange(len(tups) + 1) - 0.5
plt.contourf(xx, yy, Z, alpha=0.3, levels=region_levels, colors=tups['color'].to_list())

# Country names at their positions, coloured by region; italic when the
# country is flagged as Islamic.
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    for i, row in subset.iterrows():
        if row['Islamic']:
            plt.text(row['PC1_rescaled'], row['PC2_rescaled'], row['Country'], color=color, fontsize=10, fontstyle='italic')
        else:
            plt.text(row['PC1_rescaled'], row['PC2_rescaled'], row['Country'], color=color, fontsize=10)

# One scatter series per region; these provide the legend entries.
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    plt.scatter(subset['PC1_rescaled'], subset['PC2_rescaled'], label=region, color=color)

plt.xlabel('Survival vs. Self-Expression Values')
plt.ylabel('Traditional vs. Secular Values')
plt.title('Inglehart-Welzel Cultural Map with SVM Decision Boundary (SVC)')

# Add legend
plt.legend()
plt.grid(True)
plt.show()

In [None]:

############################################
########## Visualization (RF) ##############
############################################

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Random forest with a fixed seed so the fitted model is reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Hold out 20% of the countries for the report below. train_data and labels
# come from the SVC cell, which has to be run first.
X_train, X_test, y_train, y_test = train_test_split(train_data, labels, test_size=0.2, random_state=42)

# Fit on the training split and score the held-out split.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

# Predicted class for every point of the mesh built in the SVC cell (xx, yy).
Z = rf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled decision regions. contourf colours the bands BETWEEN consecutive
# levels, so n classes need n + 1 boundaries. Half-integer boundaries
# (-0.5, 0.5, ..., n - 0.5) put exactly one class label in each band, so the
# i-th colour is used for class i. Passing the n labels themselves as levels
# yields only n - 1 bands and misaligns the colours with the regions.
plt.figure(figsize=(14, 10))
region_levels = np.arange(len(tups) + 1) - 0.5
plt.contourf(xx, yy, Z, alpha=0.3, levels=region_levels, colors=tups['color'].to_list())

# Country names at their positions, coloured by region; italic when the
# country is flagged as Islamic.
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    for i, row in subset.iterrows():
        if row['Islamic']:
            plt.text(row['PC1_rescaled'], row['PC2_rescaled'], row['Country'], color=color, fontsize=10, fontstyle='italic')
        else:
            plt.text(row['PC1_rescaled'], row['PC2_rescaled'], row['Country'], color=color, fontsize=10)

# One scatter series per region; these provide the legend entries.
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    plt.scatter(subset['PC1_rescaled'], subset['PC2_rescaled'], label=region, color=color)

plt.xlabel('Survival vs. Self-Expression Values')
plt.ylabel('Traditional vs. Secular Values')
plt.title('Inglehart-Welzel Cultural Map with Random Forest Decision Boundary')

# Add legend
plt.legend()
plt.grid(True)
plt.show()


In [None]:

############################################
########## Visualization (KNN) #############
############################################

from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier: each point takes the class of its single
# closest training country.
knn = KNeighborsClassifier(n_neighbors=1)
# X_train / X_test / y_train / y_test and classification_report come from the
# random-forest cell, which has to be run first.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))
# Predicted class for every point of the mesh built in the SVC cell (xx, yy).
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled decision regions. contourf colours the bands BETWEEN consecutive
# levels, so n classes need n + 1 boundaries. Half-integer boundaries
# (-0.5, 0.5, ..., n - 0.5) put exactly one class label in each band, so the
# i-th colour is used for class i. Passing the n labels themselves as levels
# yields only n - 1 bands and misaligns the colours with the regions.
plt.figure(figsize=(14, 10))
region_levels = np.arange(len(tups) + 1) - 0.5
plt.contourf(xx, yy, Z, alpha=0.3, levels=region_levels, colors=tups['color'].to_list())


# Country names at their positions, coloured by region; italic when the
# country is flagged as Islamic.
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    for i, row in subset.iterrows():
        if row['Islamic']:
            plt.text(row['PC1_rescaled'], row['PC2_rescaled'], row['Country'], color=color, fontsize=10, fontstyle='italic')
        else:
            plt.text(row['PC1_rescaled'], row['PC2_rescaled'], row['Country'], color=color, fontsize=10)

# One scatter series per region; these provide the legend entries.
for region, color in cultural_region_colors.items():
    subset = country_scores_pca[country_scores_pca['Cultural Region'] == region]
    plt.scatter(subset['PC1_rescaled'], subset['PC2_rescaled'], label=region, color=color)

plt.xlabel('Survival vs. Self-Expression Values')
plt.ylabel('Traditional vs. Secular Values')
plt.title('Inglehart-Welzel Cultural Map with k-NN Decision Boundary')

# Add legend
plt.legend()
plt.grid(True)
plt.show()
