In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from costometer.utils import set_font_sizes

# Call the project helper imported from costometer.utils.
# NOTE(review): presumably this sets matplotlib font sizes for the figures in
# this notebook; confirm against the helper's definition.
set_font_sizes()

ModuleNotFoundError: No module named 'costometer'

# Prepare data

In [None]:
# Restrict the analysis to the second questionnaire pilot: the first pilot often
# has more questionnaire items for the psychiatric data.
scores_path = "../../data/processed/quest_second/combined_scores.csv"

# Keep complete rows only. Missing values come from the optional CRT and IQ
# measures and from the 2 participants who didn't see the second part.
combined_scores = pd.read_csv(scores_path).dropna(axis="index")

# Number of participants left after dropping incomplete rows.
len(combined_scores)

In [None]:
# Columns excluded from the feature matrix used for PCA / K-Means below.
# Despite the name, this also holds "age"; every other column is treated as a
# questionnaire/subscale score.
nonnumeric_cols = [
    "gender",
    "pid",
    "age",
    "colorblind",
]

In [None]:
# Feature matrix: every column that is not listed in nonnumeric_cols.
# Index.difference returns the column labels in sorted order.
feature_cols = combined_scores.columns.difference(nonnumeric_cols)
feature_frame = combined_scores[feature_cols]

# NOTE(review): sklearn's Normalizer rescales each *row* (participant) to unit
# norm; it does not standardise the individual questionnaire columns. Confirm
# that row-wise scaling is what is wanted before PCA / K-Means.
normalizer = Normalizer()
first_X = normalizer.fit_transform(feature_frame)

# PCA
## Find best number of components

In [None]:
# Cumulative share of variance explained by the first k principal components,
# for k = 1 .. number of questionnaires/subscales (the maximum number of
# components considered here).
#
# Principal components are nested: the first k components of a full fit are the
# same as the components of a k-component fit. One fit that keeps every
# component therefore yields the whole curve; refitting a separate model for
# each k repeats the same decomposition and differs only by floating-point
# rounding.
n_features = len(combined_scores.columns.difference(nonnumeric_cols))
pca = PCA(n_components=n_features)
pca.fit(first_X)

# Entry k-1 is the summed explained_variance_ratio_ of the first k components.
explained_variance = np.cumsum(pca.explained_variance_ratio_).tolist()

In [None]:
# Scree plot for the PCA sweep: summed explained-variance ratio against the
# number of retained components.
n_features = len(combined_scores.columns.difference(nonnumeric_cols))

# plt.subplots returns the Figure and its Axes separately, so `ax` really is an
# Axes (plt.figure on its own returns the Figure).
fig, ax = plt.subplots(figsize=(12, 10))
ax.plot(range(1, n_features + 1), explained_variance, "o-")

# Dotted guide line at 12 components, the count used for the PCA fitted below.
ax.axvline(12, color="#8c8a8a", linestyle=":")

ax.set_title("Scree plot")
# Each point is the sum of explained_variance_ratio_ over the first k
# components, i.e. a cumulative ratio rather than a per-component variance.
ax.set_ylabel("Cumulative explained variance ratio")
ax.set_xlabel("Number Components");

## Look at components

In [None]:
# Final PCA with 12 components, the count marked on the scree plot above.
# The count is defined once so the model and the row labels cannot drift apart.
n_factors = 12
pca = PCA(n_components=n_factors)
pca.fit(first_X)

# Loadings table: one row per component ("factor1" .. "factor12"), one column
# per questionnaire score. columns.difference() yields the same column order
# that was used to build first_X, so the labels line up with pca.components_.
res = pd.DataFrame(
    pca.components_,
    columns=combined_scores.columns.difference(nonnumeric_cols),
    index=[f"factor{i}" for i in range(1, n_factors + 1)],
)


In [None]:
# Reorder the score columns by their loading on the first component (ascending).
res.sort_values(by="factor1", axis="columns")

In [None]:
# Reorder the score columns by their loading on the second component (ascending).
res.sort_values(by="factor2", axis="columns")

In [None]:
# Reorder the score columns by their loading on the third component (ascending).
res.sort_values(by="factor3", axis="columns")

# K-Means Clustering (more sparse)
## Find best number of clusters

In [None]:
# Sweep the number of clusters from 1 up to the number of questionnaire/subscale
# columns and record the inertia of each fitted model.
max_clusters = len(combined_scores.columns.difference(nonnumeric_cols))

inertia = []
for n_clusters in range(1, max_clusters + 1):
    # Fixed random_state so the sweep gives the same result on every run.
    kmeans = KMeans(n_clusters=n_clusters, random_state=91).fit(first_X)
    inertia.append(kmeans.inertia_)

In [None]:
# Inertia of the K-Means sweep against the number of clusters.
max_clusters = len(combined_scores.columns.difference(nonnumeric_cols))

# plt.subplots returns the Figure and its Axes separately, so `ax` really is an
# Axes (plt.figure on its own returns the Figure).
fig, ax = plt.subplots(figsize=(12, 10))
ax.plot(range(1, max_clusters + 1), inertia, "o-")

# Dotted guide line at 12 clusters, the count used for the model fitted below.
ax.axvline(12, color="#8c8a8a", linestyle=":")

ax.set_title("Scree plot")
ax.set_ylabel("Inertia (within-cluster sum-of-squares)")
# The x-axis is the n_clusters value of each K-Means fit, not a PCA component
# count, so label it accordingly.
ax.set_xlabel("Number of clusters");

## Look at clusters

In [None]:
# Final K-Means model with 12 clusters (seeded for repeatable results).
kmeans = KMeans(n_clusters=12, random_state=91).fit(first_X)

# Cluster centres as a table: one row per cluster (labelled 0..11 by the default
# index), one column per questionnaire score in the order used for first_X.
score_columns = combined_scores.columns.difference(nonnumeric_cols)
res = pd.DataFrame(data=kmeans.cluster_centers_, columns=score_columns)

In [None]:
# Display the full table of cluster centres (rows: clusters, columns: scores).
res

In [None]:
# Reorder the score columns by their value in the centre of cluster 0 (ascending).
res.sort_values(by=0, axis="columns")

In [None]:
# Reorder the score columns by their value in the centre of cluster 1 (ascending).
res.sort_values(by=1, axis="columns")

In [None]:
# Reorder the score columns by their value in the centre of cluster 2 (ascending).
res.sort_values(by=2, axis="columns")