In [79]:
# DEVOIR 2
# PARTIE IV: VISUALISATION ET REPRÉSENTATIONS
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE

In [45]:
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv('dataB.csv', index_col=0)

# Raw values as a NumPy array (input to the transformations below).
x = data.values

# One-column frame holding the index labels, built in a single step so the
# name `countries` always refers to a DataFrame.
countries = pd.DataFrame({'COUNTRY': data.index.values})
countries

Unnamed: 0,COUNTRY
0,Afghanistan
1,Albania
2,Algeria
3,Angola
4,Argentina
...,...
152,Venezuela
153,Vietnam
154,Yemen
155,Zambia


In [112]:
def pca_data(x, components):
    """Project ``x`` with PCA and prepend the country labels.

    Parameters
    ----------
    x : array-like
        Data passed to ``PCA.fit_transform``. Its rows are concatenated
        positionally with the module-level ``countries`` frame, so they are
        assumed to be in the same order -- TODO confirm at call sites.
    components : int
        Value forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``countries`` followed by ``PC1`` ... ``PCk``, where
        ``k`` is the number of columns PCA actually produced.
    """
    pca = PCA(n_components = components)
    pc = pca.fit_transform(x)

    # Label columns from the real output width instead of a hard-coded list,
    # so any component count works (previously only 2 and 5 did).
    pc_columns = [f'PC{i}' for i in range(1, pc.shape[1] + 1)]
    pc_df = pd.DataFrame(data = pc, columns=pc_columns)

    pc_df = pd.concat([countries, pc_df],axis = 1)
    return pc_df

In [113]:
def isomap_data(x, components, neighbors):
    """Embed ``x`` with Isomap and prepend the country labels.

    Parameters
    ----------
    x : array-like
        Data passed to ``Isomap.fit_transform``. Its rows are concatenated
        positionally with the module-level ``countries`` frame, so they are
        assumed to be in the same order -- TODO confirm at call sites.
    components : int
        Value forwarded to ``Isomap(n_components=...)``.
    neighbors : int
        Value forwarded to ``Isomap(n_neighbors=...)``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``countries`` followed by ``PC1`` ... ``PCk``. The
        ``PC`` prefix is kept so the frame has the same column names as the
        output of ``pca_data``.
    """
    isomap = Isomap(n_components = components, n_neighbors = neighbors)
    # Local renamed from `isomap_data`, which shadowed this function's own name.
    embedding = isomap.fit_transform(x)

    # Label columns from the real output width; the previous fixed
    # ['PC1', 'PC2'] list failed for any `components` other than 2.
    embedding_columns = [f'PC{i}' for i in range(1, embedding.shape[1] + 1)]
    isomap_df = pd.DataFrame(data = embedding, columns=embedding_columns)
    isomap_df = pd.concat([countries, isomap_df],axis = 1)
    return isomap_df

In [163]:
def tSNE_data(x, components, perplexity=40, n_iter=300, random_state=None):
    """Embed ``x`` with t-SNE and prepend the country labels.

    Parameters
    ----------
    x : array-like
        Data passed to ``TSNE.fit_transform``. Its rows are concatenated
        positionally with the module-level ``countries`` frame, so they are
        assumed to be in the same order -- TODO confirm at call sites.
    components : int
        Value forwarded to ``TSNE(n_components=...)``.
    perplexity : float, default 40
        Value forwarded to ``TSNE(perplexity=...)``.
    n_iter : int, default 300
        Number of optimization iterations requested from TSNE.
    random_state : int or None, default None
        Value forwarded to ``TSNE(random_state=...)``. Pass an integer to make
        a single call reproducible on its own.

    Returns
    -------
    pandas.DataFrame
        The columns of ``countries`` followed by ``PC1`` ... ``PCk``. The
        ``PC`` prefix is kept so the frame has the same column names as the
        output of ``pca_data``.
    """
    import inspect

    # The iteration-count keyword differs between scikit-learn releases
    # (`n_iter` in older ones, `max_iter` in newer ones); use whichever the
    # installed TSNE accepts so the call does not fail on either.
    tsne_params = inspect.signature(TSNE.__init__).parameters
    iter_keyword = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'

    tsne = TSNE(
        n_components=components,
        verbose=1,
        perplexity=perplexity,
        random_state=random_state,
        **{iter_keyword: n_iter},
    )
    embedding = tsne.fit_transform(x)

    # Label columns from the real output width; the previous fixed
    # ['PC1', 'PC2'] list failed for any `components` other than 2.
    embedding_columns = [f'PC{i}' for i in range(1, embedding.shape[1] + 1)]
    tsne_df = pd.DataFrame(data = embedding, columns=embedding_columns)
    tsne_df = pd.concat([countries, tsne_df],axis = 1)
    return tsne_df

In [172]:
# 2D PCA Transformation
# Standardize from the raw frame rather than from the current value of `x`,
# so re-running this cell (or running it out of order) always starts from the
# same input instead of re-scaling whatever `x` currently holds.
x = StandardScaler().fit_transform(data.values)
modified_data1 = pca_data(x, 2)

# One labelled point per country in the plane of the first two components.
fig = px.scatter(modified_data1, x='PC1', y='PC2', text='COUNTRY')
fig.update_traces(textposition='top center')
fig.update_layout(title_text='2D PCA Transformation', title_x=0.5)
fig.show()

# We realized that the structure is not linear, more like geodesic.
# Let's try Isomap this time.

In [173]:
def _plot_countries(embedding_df, title):
    """Show a labelled 2D scatter of ``embedding_df`` and return the figure.

    ``embedding_df`` must have ``PC1``, ``PC2`` and ``COUNTRY`` columns;
    ``title`` is used as the centered figure title.
    """
    fig = px.scatter(embedding_df, x='PC1', y='PC2', text='COUNTRY')
    fig.update_traces(textposition='top center')
    fig.update_layout(title_text=title, title_x=0.5)
    fig.show()
    return fig


# Seed NumPy's global RNG once, before the three t-SNE runs below.
np.random.seed(101)

# t-SNE on the 2-component PCA projection; `.iloc[:, 1:]` drops the leading
# COUNTRY column so only the numeric components are passed on.
modified_data2 = tSNE_data(pca_data(x,2).iloc[:,1:].values, 2)
fig = _plot_countries(modified_data2, '2D t-SNE Transformation with PCA 2 dimensions')

# t-SNE on the 5-component PCA projection.
modified_data3 = tSNE_data(pca_data(x,5).iloc[:,1:].values, 2)
fig = _plot_countries(modified_data3, '2D t-SNE Transformation with PCA 5 dimensions')

# t-SNE directly on `x`, without a PCA step.
modified_data4 = tSNE_data(x, 2)
fig = _plot_countries(modified_data4, '2D t-SNE Transformation with dataB DataFrame')

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 157 samples in 0.001s...
[t-SNE] Computed neighbors for 157 samples in 0.004s...
[t-SNE] Computed conditional probabilities for sample 157 / 157
[t-SNE] Mean sigma: 1.950363
[t-SNE] KL divergence after 250 iterations with early exaggeration: 52.239891
[t-SNE] KL divergence after 500 iterations: 0.182008


[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 157 samples in 0.000s...
[t-SNE] Computed neighbors for 157 samples in 0.005s...
[t-SNE] Computed conditional probabilities for sample 157 / 157
[t-SNE] Mean sigma: 2.647569
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.173691
[t-SNE] KL divergence after 500 iterations: 0.184892


[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 157 samples in 0.001s...
[t-SNE] Computed neighbors for 157 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 157 / 157
[t-SNE] Mean sigma: 3.227584
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.981880
[t-SNE] KL divergence after 500 iterations: 0.234335


In [148]:
# 2D PCA Transformation
# NOTE(review): an earlier cell in this notebook runs the same 2D PCA plot;
# one of the two can probably be removed.
# Standardize from the raw frame rather than from the current value of `x`,
# so re-running this cell (or running it out of order) always starts from the
# same input instead of re-scaling whatever `x` currently holds.
x = StandardScaler().fit_transform(data.values)
modified_data1 = pca_data(x, 2)

# One labelled point per country in the plane of the first two components.
fig = px.scatter(modified_data1, x='PC1', y='PC2', text='COUNTRY')
fig.update_traces(textposition='top center')
fig.update_layout(title_text='2D PCA Transformation', title_x=0.5)
fig.show()

# We realized that the structure is not linear, more like geodesic.
# Let's try Isomap this time.

In [None]:
# TODO: Report the percentage of variance retained by PCA (2 vs 5 dimensions)