### Imports

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Import custom libraries
import util
from util import UtilityFunctions as uf

# Reload the custom library so edits to util.py are picked up without restarting the kernel.
# Needs to be executed every time new functions are added to util.py.
from importlib import reload
reload(util)
# reload() re-executes the module, but names bound earlier with
# `from util import ...` still point at the old objects, so rebind the alias
# to the freshly loaded class.
uf = util.UtilityFunctions

#### Import dataset

In [None]:
# Read the raw joint EVS/WVS wave 7 CSV export into a DataFrame
df = pd.read_csv(
    './../data/raw/wave 7 EVS_WVS_Joint_Csv_v5_0.csv',
    sep=',',
)


### Data explorations

#### Data cleaning
Drop columns that aren't relevant and convert the remaining non-numeric columns into numerical values.

In [None]:
# Keep only the rows with country code 840 (the USA subset). Take an explicit copy so
# the column assignments below write to an independent frame rather than to a slice
# of `df` (avoids SettingWithCopyWarning and ambiguous writes).
usa_df = df[df['cntry'] == 840].copy()

# Drop columns that are not used in the analysis:
# doi_gesis, doi_wvsa, cntry_AN, lnge_iso, version, reg_nuts1, reg_nuts2, X002_02B, V002A_01, V001A_01
columns_to_drop = ['doi_gesis','doi_wvsa', 'cntry_AN', 'lnge_iso', 'version', 'reg_nuts1', 'reg_nuts2', 'X002_02B','V002A_01','V001A_01']
# Keep the returned frame instead of discarding it, matching the other call to
# uf.drop_columns further down in this notebook.
# NOTE(review): assumes drop_columns returns the resulting DataFrame - confirm in util.py.
usa_df = uf.drop_columns(usa_df, columns_to_drop)

# Set study title to 1 if it starts with EVS5, else 0 for WVS7
usa_df['studytit'] = (
    usa_df['studytit']
    .apply(lambda x: 1 if str(x).startswith('EVS5') else 0)
    .astype(int)
)

# Set versn_s to 2022 if it starts with 5, else 2024 (deduced from the data)
usa_df['versn_s'] = (
    usa_df['versn_s']
    .apply(lambda x: 2022 if str(x).startswith('5') else 2024)
    .astype(int)
)

# Report which columns are still non-numeric after the conversions above
uf.print_non_numeric_columns(usa_df)



#### Replace negative values and drop columns with fewer than 2 unique values

Negative values encode "don't know", "refused to answer", "not applicable" or missing, so they are replaced. Columns without at least two distinct values are dropped because they do not capture any variance.

In [None]:
# Coerce every column to numeric once; values that cannot be parsed become NaN
usa_df = usa_df.apply(pd.to_numeric, errors='coerce')

# Count rows with at least one negative value
count_negative_rows = (usa_df < 0).any(axis=1).sum()
print(f"Rows with negative values: {count_negative_rows}")

# We gather that every row has at least one negative value, so we replace them with 0 for now.
# Vectorised equivalent of mapping `0 if x < 0 else x` over every cell; NaN compares
# False against 0 and is therefore left untouched.
# NOTE(review): 0 acts as a stand-in for the negative codes and will influence scaling/PCA.
usa_df = usa_df.mask(usa_df < 0, 0)

# Drop columns with too few unique values, as reported by uf.few_unique_values_columns
# NOTE(review): the exact comparison against `threshold` is defined in util.py - confirm
# it matches the "fewer than 2 unique values" intent.
threshold = 1
columns_to_drop = list(uf.few_unique_values_columns(usa_df, threshold).keys())
usa_df = uf.drop_columns(usa_df, columns_to_drop)

### Data exploration 

#### Select only numeric columns (PCA only works on numeric features) and standardize the data

In [None]:
# Column used later to colour the PCA scatter plot
political_scale_feature = 'E033'

# PCA needs numeric input, so keep only the float and int columns
df_numeric = usa_df.select_dtypes(include=[float, int])
# Optional: exclude the colouring feature from the PCA input
# df_numeric = df_numeric.drop(columns=[political_scale_feature])

# Standardize the features: fit the scaler first, then apply it to the same data
scaler = StandardScaler()
scaler.fit(df_numeric)
df_scaled = scaler.transform(df_numeric)

#### Apply PCA

In [None]:
# Project the standardized data onto its first three principal components
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df_scaled)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3'])

# Add the colouring feature.
# pca_df has a fresh 0..n-1 index, and assigning a Series aligns on index labels. Taking
# the column from the raw, unfiltered `df` would therefore pick up the first n rows of the
# whole dataset rather than the rows the PCA was fitted on. Use the cleaned `usa_df`
# values positionally instead; its row order is the one `df_scaled` was built from.
pca_df[political_scale_feature] = usa_df[political_scale_feature].to_numpy()

# Show explained variance ratio
print(pca.explained_variance_ratio_)

##### Plot PCA results, coloring <span style="color: #1E90FF;">Left leaning</span> blue and <span style="color: #B22222;"> Right leaning</span> red, based on feature E033 (Self positioning in political scale)


In [None]:
# Normalize the E033 values for colour mapping
norm = plt.Normalize(pca_df[political_scale_feature].min(), pca_df[political_scale_feature].max())
cmap = plt.get_cmap('coolwarm')  # Colormap running from blue (low) to red (high)

# Create a scatter plot of the first two principal components.
# Passing the raw values together with `cmap`/`norm` lets matplotlib do the colour
# mapping, which makes the scatter usable as the source for a colorbar.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    x=pca_df['PC1'],
    y=pca_df['PC2'],
    c=pca_df[political_scale_feature],
    cmap=cmap,
    norm=norm,
    alpha=0.7,
)

# Colour scale so the figure can be read on its own
plt.colorbar(scatter, label=f'{political_scale_feature} (self positioning in political scale)')

# Customize plot
plt.title('PCA Plot Colored by Self positioning in political scale (E033)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid()
plt.show()

In [None]:
# Annotated heatmap of the pairwise Pearson correlations between all numeric columns
sns.set(font_scale=1)

corr = usa_df.corr(method='pearson', numeric_only=True)

# Large canvas so the per-cell annotations have room
fig, ax = plt.subplots(figsize=(50, 50))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    cmap='RdBu_r',
    annot=True,
    linewidth=0.5,
    ax=ax,
    annot_kws={"size": 20},
)

In [None]:
# How many of the features most correlated with E033 to keep
NUMBER_OF_FEATURES = 40

# Absolute correlation of every other column with the political scale, strongest first
corr = (
    usa_df
    .drop(columns=political_scale_feature)
    .corrwith(usa_df[political_scale_feature])
    .abs()
    .sort_values(ascending=False)
)
n_features = corr.iloc[:NUMBER_OF_FEATURES]

# Restrict the data to the selected features and correlate them with each other
filter_df = uf.filter_features(usa_df, n_features.index)
corr = filter_df.corr(method='pearson', numeric_only=True)

# Annotated heatmap of the selected features
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(50, 50))
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    cmap='RdBu_r',
    annot=True,
    linewidth=0.5,
    ax=ax,
    annot_kws={"size": 18},
)


1. PCA
    PCA with 2 components; see if clusters appear.
    Color the PCA scatterplot by left or right political leaning.

2. Inspect covariance matrix
    feature selection