# Analiza Wielowymiarowa - zajecia 6 - Analiza czynnikowa

In [None]:
from multidim.utils import resolve_stata, load_stata

# Resolve the local Stata installation for version 18, edition "se", using the
# course helper. NOTE(review): the two returned values are presumably the
# install path and the edition string -- confirm against multidim.utils.
STATA_PATH, STATA_TYPE = resolve_stata(version = 18, stype = "se")
# Display both values so they can be checked by eye before Stata is loaded.
STATA_PATH, STATA_TYPE

In [None]:
# Initialise Stata with the values resolved in the previous cell.
# NOTE(review): the %%stata / %%mata cells below presumably require this call
# to have run first -- confirm against multidim.utils.load_stata.
load_stata(STATA_PATH, STATA_TYPE)

In [None]:
# Załadowanie bibliotek
import pandas as pd
import numpy as np

#https://scikit-learn.org/stable/modules/decomposition.html#fa
#https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FactorAnalysis.html
from sklearn import decomposition
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
#https://factor-analyzer.readthedocs.io/en/latest/factor_analyzer.html
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD 
from scipy.linalg import svd

from multidim.funs import corr_mat


## Analiza Czynnikowa

### Przyklad 1 -- zaczerpniety z materialow pani Natalii Nehrebeckiej

In [None]:
%%stata
/*Ponizej przedstawimy analize czynnikowa metoda najwiekszej wiarogodnosci. Zaleca
sie na wstepie analize skladowych glownych, aby ustalic przyblizona liczbe czynnikow*/

/*Dane wejsciowe do analizy czynnikowej moga miec postac macierzy kowariancji lub korelacji.
Posluzymy sie danymi pochodzacymi z badania przeprowadzonego na 123 osobach cierpiacych
z powodu silnych napadow bolu. Poproszono ich o ocene, na skali od 1 do 6
(1 - calkowicie sie zgadzam, 6 - nie zgadzam sie), 9 stwierdzen dotyczacych bolu.

Ponizej lista zmiennych:
1. To, czy bede cierpial z powodu bolu w przyszlosci zalezy od lekarza.
2. To, czy bede cierpial z powodu bolu, zalezy zwykle od tego, czy cos zrobilem lub nie
   zrobilem.
3. To, czy bede cierpial z powodu bolu, zalezy od tego, co zrobi dla mnie lekarz.
4. Nie moge poradzic sobie z bolem, dopoki nie skorzystam z pomocy medycznej.
5. Jesli czuje bol, to jest to spowodowane tym, iz nie wykonywalem odpowiednich cwiczen lub
   nieprawidlowo sie odzywialem.
6. Bol jest wynikiem zaniedbania.
7. Jestem calkowicie odpowiedzialny za moj bol.
8. Pozbycie sie bolu jest kontrolowane przez doktora.
9. Ludzie, ktorzy nigdy nie cierpia z powodu bolu, sa szczesciarzami.*/

In [None]:
%%stata -qui
// 9x9 correlation matrix of the pain statements p1-p9, entered row by row.
// The block comments that straddle each line break act as line continuations,
// so Stata reads the whole definition as a single command.
matrix C = ( 1.0000, -0.0385, 0.6066, 0.4507, 0.0320, -0.2877, -0.2974, 0.4526, 0.2952 \/*
*/ -0.0385, 1.0000, -0.0693, -0.1167, 0.4881, 0.4271, 0.3045, -0.3090, -0.1704 \/*
*/ 0.6066, -0.0693, 1.000, 0.5916, 0.0317, -0.1336, -0.2404, 0.5886, 0.3165 \ /*
*/0.4507, -0.1167, 0.5916, 1.0000, -0.0802,  -0.2073, -0.1850, 0.6286, 0.3680 \ /*
*/0.0320, 0.4881, 0.0317, -0.0802, 1.0000, 0.4731, 0.4138, -0.1397, -0.2367 \ /*
*/-0.2877, 0.4271, -0.1336, -0.2073, 0.4731, 1.0000, 0.6346, -0.1329, -0.1541 \ /*
*/-0.2974, 0.3045, -0.2404, -0.1850, 0.4138, 0.6346, 1.0000, -0.2599, -0.2893 \ /*
*/0.4526, -0.3090, 0.5886, 0.6286, -0.1397, -0.1329, -0.2599, 1.0000, 0.4047 \ /*
*/0.2952, -0.1704, 0.3165, 0.3680, -0.2367, -0.1541, -0.2893, 0.4047, 1.0000 )

Test Ilorazu Wiarogodności (ang. LR test) https://www.jstor.org/stable/2287400

In [None]:
%%stata
/*We do not have to state anywhere that the input data is a correlation
matrix. We start with 2 factors*/
/*When the input data is a correlation matrix, the number of observations must be specified*/

factormat C, n(123) names(p1 p2 p3 p4 p5 p6 p7 p8 p9) fac(2) ml

In [None]:
# Correlation matrix of the nine pain statements (p1..p9), one row per
# statement; the same numbers as the Stata matrix C defined earlier.
C = np.array(
    [
        [ 1.0000, -0.0385,  0.6066,  0.4507,  0.0320, -0.2877, -0.2974,  0.4526,  0.2952],
        [-0.0385,  1.0000, -0.0693, -0.1167,  0.4881,  0.4271,  0.3045, -0.3090, -0.1704],
        [ 0.6066, -0.0693,  1.000,   0.5916,  0.0317, -0.1336, -0.2404,  0.5886,  0.3165],
        [ 0.4507, -0.1167,  0.5916,  1.0000, -0.0802, -0.2073, -0.1850,  0.6286,  0.3680],
        [ 0.0320,  0.4881,  0.0317, -0.0802,  1.0000,  0.4731,  0.4138, -0.1397, -0.2367],
        [-0.2877,  0.4271, -0.1336, -0.2073,  0.4731,  1.0000,  0.6346, -0.1329, -0.1541],
        [-0.2974,  0.3045, -0.2404, -0.1850,  0.4138,  0.6346,  1.0000, -0.2599, -0.2893],
        [ 0.4526, -0.3090,  0.5886,  0.6286, -0.1397, -0.1329, -0.2599,  1.0000,  0.4047],
        [ 0.2952, -0.1704,  0.3165,  0.3680, -0.2367, -0.1541, -0.2893,  0.4047,  1.0000],
    ]
)

In [None]:
# C is symmetric, so use the symmetric solver: it always returns real
# eigenvalues, sorted in ascending order. If every value is > 0 the matrix is
# positive definite.
np.linalg.eigvalsh(C)

In [None]:
# Unrotated maximum-likelihood factor analysis with two factors; C is passed as
# a correlation matrix (is_corr_matrix=True), not as raw observations.
fa = FactorAnalyzer(
    n_factors=2,
    method="ml",
    rotation=None,
    is_corr_matrix=True,
)
fa.fit(C)

# Uniquenesses and communalities of the variables (these are not eigenvalues).
# Large communalities indicate that the fitted factors reproduce the
# correlation matrix rather accurately.
(fa.get_uniquenesses(), fa.get_communalities())

In [None]:
# Sanity check: uniqueness + communality is expected to equal 1 for every
# variable.
fa.get_uniquenesses() + fa.get_communalities()

In [None]:
# Side by side: the first array returned by get_factor_variance() and the first
# two entries of the second array returned by get_eigenvalues().
# NOTE(review): per the factor_analyzer docs these are the per-factor variance
# (sum of squared loadings) and the common-factor eigenvalues -- confirm for
# the installed version.
fa.get_factor_variance()[0], fa.get_eigenvalues()[1][0:2]

In [None]:
# Row labels p1..p9, one per pain statement.
nams = [f"p{digit}" for digit in "123456789"]

# One row per statement: its loadings on both factors followed by its
# uniqueness.
loadings = pd.DataFrame(
    np.column_stack((fa.loadings_, fa.get_uniquenesses())),
    index=nams,
    columns=["Factor1", "Factor2", "uniquenesses"],
)
loadings

Truncated SVD Directly

In [None]:
# TruncatedSVD uses a randomized solver by default; pin its seed so that the
# decomposition, and everything derived from it, is reproducible across runs.
tsvd = TruncatedSVD(n_components=2, random_state=0)
tsvd.fit(C)
# Scale every component by the square root of its explained variance to get
# loading-like coefficients (variables in rows, components in columns).
loadings_direct_svd = tsvd.components_.T * np.sqrt(tsvd.explained_variance_)

Correlation between FA loadings and direct Truncated SVD loadings

In [None]:
# Cross-correlate the SVD-based loadings with the factor-analysis loadings and
# keep only the diagonal of the result.
# NOTE(review): corr_mat is a course helper; the diagonal presumably pairs
# component k with factor k -- confirm against multidim.funs.
svd_vs_fa = corr_mat(loadings_direct_svd, fa.loadings_)
np.diag(svd_vs_fa)

In [None]:
%%stata

/*Test result -- it turns out that 2 factors are not enough (p-value = 0.0000<0.05)*/
//We try 3 factors

factormat C, n(123) names(p1 p2 p3 p4 p5 p6 p7 p8 p9) fac(3) ml

/*At the 0.05 significance level there is no basis for rejecting H0, which states that the
three-factor model is adequate (sufficient). p-value 0.1055>0.05*/

In [None]:
# Unrotated maximum-likelihood factor analysis of the correlation matrix C
# with three factors.
fa = FactorAnalyzer(
    n_factors=3,
    method="ml",
    rotation=None,
    is_corr_matrix=True,
)
fa.fit(C)

In [None]:
# Variance summary of the fitted model.
# NOTE(review): per the factor_analyzer docs the tuple holds the variance, the
# proportional variance and the cumulative variance of each factor -- confirm.
fa.get_factor_variance()

In [None]:
# Eigenvalues of the fitted model.
# NOTE(review): per the factor_analyzer docs the tuple holds the original
# eigenvalues followed by the common-factor eigenvalues -- confirm.
fa.get_eigenvalues()

In [None]:
# Scree plot of the second array returned by get_eigenvalues() (the
# common-factor eigenvalues per the factor_analyzer docs). The index is shifted
# to start at 1 so the x axis reads as "factor k", and the figure is labelled
# so it can stand on its own.
common_factor_eigs = pd.Series(fa.get_eigenvalues()[1])
common_factor_eigs.index = common_factor_eigs.index + 1
ax = common_factor_eigs.plot(marker="o", title="Scree plot: common-factor eigenvalues")
ax.set(xlabel="Factor number", ylabel="Eigenvalue");

In [None]:
# Row labels p1..p9, one per pain statement.
nams = [f"p{digit}" for digit in "123456789"]

# One row per statement: its loadings on the three factors followed by its
# uniqueness.
loadings = pd.DataFrame(
    np.column_stack((fa.loadings_, fa.get_uniquenesses())),
    index=nams,
    columns=["Factor1", "Factor2", "Factor3", "uniquenesses"],
)
loadings

In [None]:
%%stata
/*We will try to give the factors an interpretation. We rotate the factors*/
rotate, varimax

In [None]:
# Maximum-likelihood factor analysis of the correlation matrix C with three
# factors, followed by a varimax rotation.
fa = FactorAnalyzer(
    n_factors=3,
    method="ml",
    rotation="varimax",
    is_corr_matrix=True,
)
fa.fit(C)

In [None]:
# Uniquenesses and communalities of the fitted model (these are not
# eigenvalues).
fa.get_uniquenesses(),fa.get_communalities()

In [None]:
# Row labels p1..p9, one per pain statement.
nams = [f"p{digit}" for digit in "123456789"]

# One row per statement: its loadings on the three factors followed by its
# uniqueness.
loadings = pd.DataFrame(
    np.column_stack((fa.loadings_, fa.get_uniquenesses())),
    index=nams,
    columns=["Factor1", "Factor2", "Factor3", "uniquenesses"],
)
loadings

In [None]:
# Rotation matrix stored on the fitted model (fa was fitted with a varimax
# rotation).
fa.rotation_matrix_

In [None]:
# Orthogonality check: R.T @ R should be (numerically) the identity matrix for
# an orthogonal rotation such as varimax.
np.matmul(fa.rotation_matrix_.T, fa.rotation_matrix_)

In [None]:
# https://www.tandfonline.com/doi/abs/10.1080/10705510701301891?journalCode=hsem20

In [None]:
%%stata
/*first factor - statements 1, 3, 4 and 8 - all related to doctors; it can be interpreted as "medical control of pain"
  second factor - statements 6 and 7 - pain as a result of one's own actions
  third factor - statements 2 and 5 - again pain as a result of one's own actions.*/

estat smc
/*estimate of the common part -> "communality" (what part of variable Xi is related to the remaining X variables)
estimated as the squared multiple correlation coefficient
of the given variable with the others (i.e. R2 from regressing that variable on the others)*/

estat kmo

/*Kaiser-Meyer-Olkin measure of sampling adequacy.
The method compares the correlations and the partial correlations between the variables.
When the partial correlation is relatively high compared with the ordinary correlation, KMO is small,
which means that an adequate solution in a low-dimensional space cannot be obtained.

Values of the coefficient:
0.00 to 0.49 unacceptable
0.50 to 0.59 very poor
0.60 to 0.69 poor
0.70 to 0.79 moderate
0.80 to 0.89 good
0.90 to 1.00 excellent*/


In [None]:
# calculate_bartlett_sphericity(C)  # disabled: the function expects raw observations, not a correlation matrix

In [None]:
# calculate_kmo(C)  # disabled: the function expects raw observations, not a correlation matrix

### Przyklad 2 -- Indeks kapitalu spolecznego i problemy z analiza czynnikowa

#### Probujemy stworzyc indeks kapitalu spolecznego

Dane oryginalnie pochodzily z badania World Values Survey

In [None]:
from multidim.datasets import load_indeks_spol
# F is used below as a correlation matrix (it is fitted with
# is_corr_matrix = True and passed to Stata's factormat).
F = load_indeks_spol()

In [None]:
%%mata -m F
// Store the Mata matrix F as a Stata matrix with the same name, so that the
// factormat command can use it.
// NOTE(review): the -m F option presumably pushes the Python array F into Mata
// first; confirm against the pystata magic documentation.
st_matrix("F", F)

In [None]:
%%stata
//MAXIMUM LIKELIHOOD METHOD
factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(7) ml
//too few factors

In [None]:
# Names of the 27 survey items, in the same order as in the factormat calls.
nams = """
    imp_family imp_friends imp_politics imp_church
    member_dis political_dis
    trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation
    fair
    conf_church conf_forces conf_press conf_tv conf_labour conf_police
    conf_courts conf_govern conf_parties conf_parl
    religion_freq tradition help local
""".split()

In [None]:
n_factors = 7

# Unrotated maximum-likelihood factor model fitted on the correlation matrix F.
fa = FactorAnalyzer(
    n_factors=n_factors,
    method="ml",
    rotation=None,
    is_corr_matrix=True,
)
fa.fit(F)

# One row per item: its loadings on every factor followed by its uniqueness.
factor_labels = [f"Factor{k}" for k in range(1, n_factors + 1)]
loadings = pd.DataFrame(
    np.column_stack((fa.loadings_, fa.get_uniquenesses())),
    index=nams,
    columns=factor_labels + ["uniquenesses"],
)
loadings

In [None]:
# Scree plot of the second array returned by get_eigenvalues() (the
# common-factor eigenvalues per the factor_analyzer docs). The index is shifted
# to start at 1 so the x axis reads as "factor k", and the figure is labelled
# so it can stand on its own.
common_factor_eigs = pd.Series(fa.get_eigenvalues()[1])
common_factor_eigs.index = common_factor_eigs.index + 1
ax = common_factor_eigs.plot(marker="o", title="Scree plot: common-factor eigenvalues")
ax.set(xlabel="Factor number", ylabel="Eigenvalue");

In [None]:
%%stata 
factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(9) ml
//HEYWOOD CASE -- negative variance estimate reported for the 9-factor model

In [None]:
n_factors = 9

# Unrotated maximum-likelihood factor model fitted on the correlation matrix F.
fa = FactorAnalyzer(
    n_factors=n_factors,
    method="ml",
    rotation=None,
    is_corr_matrix=True,
)
fa.fit(F)

# One row per item: its loadings on every factor followed by its uniqueness.
factor_labels = [f"Factor{k}" for k in range(1, n_factors + 1)]
loadings = pd.DataFrame(
    np.column_stack((fa.loadings_, fa.get_uniquenesses())),
    index=nams,
    columns=factor_labels + ["uniquenesses"],
)
loadings

### Analiza czynnikowa (FA) przy wykorzystaniu Analizy głównych składowych (PCA) (optymalizacja)

In [None]:
#%%stata
#factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(4) pcf
#//za malo
#
#factormat F, n(35312) names(imp_family imp_friends imp_politics imp_church member_dis political_dis trust_family trust_ppers trust_neighbour trust_arel trust_firsttime trust_anation fair conf_church conf_forces conf_press conf_tv conf_labour conf_police conf_courts conf_govern conf_parties conf_parl religion_freq tradition help local) fac(27) pcf
#//tez nie

In [None]:
# n_factors = 10
# fa = FactorAnalyzer(rotation=None, is_corr_matrix = True, n_factors = n_factors, method = 'principal')
# fa.fit(F)
# loadings = pd.DataFrame(np.column_stack((fa.loadings_, fa.get_uniquenesses())))
# loadings.index = nams
# loadings.columns = [ "Factor" + str(i + 1) for i in range(n_factors)] + ["uniquenesses"] 
# loadings

## Przyklad 3 - Scores

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler, scale
from scipy.stats import rankdata

from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

In [None]:
from multidim.datasets import load_seul1988

# Fixed seed: without it the shuffle, and therefore the Subject ids assigned
# below, would differ on every Restart & Run All.
SEED = 42

seul1988 = (
    load_seul1988()
    .sample(frac=1, random_state=SEED)  # shuffle all rows
    .query("wynik >= 6000")             # keep rows with wynik of at least 6000
    .copy()                             # own the data before adding a column
)
# Sequential ids 1..n, assigned after shuffling and filtering.
seul1988["Subject"] = range(1, len(seul1988) + 1)
seul1988_copy = seul1988.copy()

In [None]:

# Keep only the rows without any missing value, then standardise every column.
rows_with_nan = np.isnan(seul1988).any(axis=1)
seul1988 = seul1988[~rows_with_nan]
seul1988_normal = scale(seul1988)

# Model input: all standardised columns except the last two. "Subject" was
# appended last when the data was loaded; the column before it is presumably
# the final result "wynik" -- TODO confirm the column order.
X = seul1988_normal[:, :-2]

# Three-factor model fitted with the LAPACK SVD solver.
fa = FactorAnalysis(n_components=3, tol=0.001, svd_method="lapack")
fa.fit(X)

In [None]:
# Factor scores of every observation under the three-factor model
# (one column per factor).
comps = fa.transform(X)

Interpretacja dla kazdego czynnika ...

In [None]:
# Correlate the factor scores with the original (unscaled) input columns to
# help interpret each factor; the last two columns are left out, matching X.
# NOTE(review): corr_mat is a course helper -- the orientation of its result
# (and hence the need for .T) should be confirmed against multidim.funs.
cc = corr_mat(comps, seul1988.iloc[:, :-2]).T
cc

Jest duzo roznych metod budowania rankingu.

In [None]:
# Ranking variant "first": use the score on the first factor only.
# method="max" gives tied observations the highest rank of their group.
scores1a = comps[:, 0]
order1a = rankdata(scores1a, method="max")

In [None]:
# Ranking variant "sum": add up the scores on all factors for each observation.
scores1b = np.sum(comps, axis=1)
order1b = rankdata(scores1b, method="max")

In [None]:
# Ranking variant "loglike": rank by the per-sample value returned by
# score_samples (the log-likelihood of each sample under the fitted model,
# per the scikit-learn docs).
scores2 = fa.score_samples(X)
order2 = rankdata(scores2, method="max")

In [None]:
# Ranking variant "one": a separate model with a single factor.
# Its scores get their own name so that `comps` (the three-factor scores the
# earlier ranking cells read) is not overwritten; otherwise re-running those
# cells would silently use the one-factor scores.
fa2 = FactorAnalysis(n_components = 1, tol = 0.001, svd_method = "lapack")
fa2.fit(X)
comps_one = fa2.transform(X)
# transform returns one column for the single factor; flatten it to 1-D.
scores1c = comps_one.ravel()
order1c = rankdata(scores1c, "max")

In [None]:
# Put every score / ranking variant next to the subject id and the original
# result ("wynik") ...
ranking_columns = {
    "subject": seul1988["Subject"],
    "wynik": seul1988["wynik"],
    "scores_one": scores1c,
    "rank_one": order1c,
    "scores_first": scores1a,
    "rank_first": order1a,
    "scores_sum": scores1b,
    "rank_sum": order1b,
    "scores_loglike": scores2,
    "rank_loglike": order2,
}
res = pd.DataFrame(ranking_columns)
# ... and rank "wynik" itself the same way, as the reference ranking.
res["rank_wynik"] = rankdata(res["wynik"].to_numpy(), method="max")
res

Correlation between the different rankings. The absolute value (ABS) is needed because the sign (direction, Polish: zwrot) of a factor is arbitrary.

In [None]:
from multidim.funs import corr_mat

rank_cols = ["rank_one", "rank_first", "rank_sum", "rank_loglike", "rank_wynik"]
# One row per ranking, so that np.corrcoef correlates the rankings with each
# other.
rank_matrix = res[rank_cols].to_numpy().T
# Absolute correlations; the last row compares every ranking with rank_wynik.
np.abs(np.corrcoef(rank_matrix))[-1]