In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from itables import init_notebook_mode, show
from collections import Counter
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from collections import defaultdict
from sklearn import metrics
from time import time

## Exercice personnel de clusterisation de textes


* code issu de cet exemple: https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html

### Application à mes données —— commence ici

In [3]:
f = "book_titles.csv"
txt_astro = pd.read_csv(f, sep="|")

In [4]:
txt_astro.head()

Unnamed: 0,pk_idref_books,book_date,simplified_title
0,1,2001,Structure interpretation classical mechanics
1,2,1926,motion distribution stars
2,3,1968,Catalog emission lines astrophysical objects
3,4,1969,Catalog emission lines astrophysical objects
4,5,1976,Applied solar energy introduction


In [5]:
txt_astro.simplified_title[-10:-5]

5521                    rôle Société Astronomique France
5522                                     أجمل تاريخ للكون
5523    Comment pensent savants leçons grands scientif...
5524                                   Hermann Carl Vogel
5525    L'Université l'usine (1956-1984) fondateur L...
Name: simplified_title, dtype: object

### Not in use


In [None]:
def ab(text):
    """Return ``text`` with a fixed set of punctuation/markup characters removed.

    The characters dropped are the backslash, the newline, the single quote and
    ``? * { } [ ] ( ) > # ; : . ! $``. Every other character (commas, digits,
    accented letters, ...) is kept unchanged.
    """
    unwanted = ('\\', '\n', '?', '*', '{', '}', '[', ']', '(', ')',
                '>', '#', ';', ':', '.', '!', '$', '\'')
    return "".join(char for char in text if char not in unwanted)

In [None]:
print(ab('aéslfk? js#aélfk: jaésd$d,'))

In [None]:
def suppr_math(text):
    """Strip 'mathematics', 'mathematical' and 'mathematician' from ``text``.

    The three substrings are removed one after the other, in that order,
    wherever they occur (also inside longer words). Surrounding whitespace is
    left untouched.
    """
    cleaned = text
    for word in ('mathematics', 'mathematical', 'mathematician'):
        # splitting on the word and gluing the pieces back drops every occurrence
        cleaned = "".join(cleaned.split(word))
    return cleaned

In [None]:
txt_astro.head(2)

### The code in use starts here

In [9]:
txt_astro['title'] = txt_astro['simplified_title'].apply(lambda x : x.lower().strip())

In [11]:
txt_astro.title[-10:-5]

5521                    rôle société astronomique france
5522                                     أجمل تاريخ للكون
5523    comment pensent savants leçons grands scientif...
5524                                   hermann carl vogel
5525    l'université l'usine (1956-1984) fondateur l...
Name: title, dtype: object

In [32]:
txt_astro['txt_len'] = txt_astro['title'].apply(lambda x : len(x))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer for the book titles:
#   max_df=0.5 -> ignore terms whose document frequency is above 50% of the titles
#   min_df=5   -> ignore terms that appear in fewer than 5 titles
#   stop_words -> scikit-learn's built-in English stop-word list
# NOTE(review): the title samples displayed earlier in this notebook include
# French and Arabic entries, which an English stop-word list does not cover —
# confirm this is acceptable for the clustering.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)

In [13]:
t0 = time()
X_tfidf = vectorizer.fit_transform(txt_astro.title)

print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

vectorization done in 0.060 s
n_samples: 5531, n_features: 1699


In [None]:
### https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
vectorizer.get_feature_names_out()[:5]

In [None]:
vectorizer.ngram_range

In [None]:
? X_tfidf

In [None]:
# The three arrays of the sparse representation (assuming X_tfidf is the CSR
# matrix returned by TfidfVectorizer): non-zero values, their column indices,
# and the row pointer (row i spans data[indptr[i]:indptr[i + 1]]).
# The original cell listed `data` twice and never showed `indptr`.
X_tfidf.data, X_tfidf.indices, X_tfidf.indptr

In [16]:
print(f"{X_tfidf.nnz / np.prod(X_tfidf.shape):.3f}")

0.003


In [24]:
from sklearn.cluster import KMeans

# Fit k-means (20 clusters) on the TF-IDF matrix once per seed (0..4) and print
# the number of titles assigned to each cluster for every run.
for seed in range(5):
    # n_init=50: each fit runs 50 centroid initialisations and keeps the best one
    kmeans = KMeans(
        n_clusters=20,
        max_iter=500,
        n_init=50,
        random_state=seed,
    ).fit(X_tfidf)
    # labels_ holds one cluster index per sample; count the samples per cluster
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    print(f"Number of elements asigned to each cluster: {cluster_sizes}")
# NOTE: once the loop ends, `kmeans` is the model fitted with the last seed (4);
# that is the model the following cells call predict() on.
print()


Number of elements asigned to each cluster: [ 133   86   43   96  170   67   88  210   87   48  116  167  134   85
   41  136  113  128 3504   79]
Number of elements asigned to each cluster: [3516  183   90   64   81  103   66  161    8  233  117  126  141   30
   88   38  240   54  131   61]
Number of elements asigned to each cluster: [ 104 3631  169  160   53  116  225  125   55   45   52   91  133   89
   84  202   73   20   75   29]
Number of elements asigned to each cluster: [ 101  135  126   58  179   16  198   74 3634  126   60   53  102   85
  126   72   66  178   90   52]
Number of elements asigned to each cluster: [ 156  127  100   89  249   22   42   85 3536   55  133  243   91   75
  171   38   23  112  101   83]



In [25]:
y_tfidf = kmeans.predict(X_tfidf)

In [None]:
y_tfidf[:10]

In [27]:
# Text label for every title: the cluster index shifted to 1-based, with
# single-digit numbers prefixed by '_' (index 8 -> '_9', index 18 -> '19').
lines_classes = [
    str(label + 1) if label >= 9 else '_' + str(label + 1)
    for label in y_tfidf
]
lines_classes[:3]

['_9', '_5', '_9']

In [None]:
len(lines_classes), len(txt_astro)

In [33]:
txt_astro.head()

Unnamed: 0,pk_idref_books,book_date,simplified_title,title,class_search,class,txt_len
0,1,2001,Structure interpretation classical mechanics,structure interpretation classical mechanics,_9,8,44
1,2,1926,motion distribution stars,motion distribution stars,_5,4,25
2,3,1968,Catalog emission lines astrophysical objects,catalog emission lines astrophysical objects,_9,8,44
3,4,1969,Catalog emission lines astrophysical objects,catalog emission lines astrophysical objects,_9,8,44
4,5,1976,Applied solar energy introduction,applied solar energy introduction,19,18,33


In [36]:
# Attach the cluster labels before exporting. In notebook order this cell comes
# before the one that adds them, so a fresh "Restart & Run All" would otherwise
# write the CSV without the 'class_search' and 'class' columns.
txt_astro['class_search'] = lines_classes
txt_astro['class'] = y_tfidf

# NOTE(review): the DataFrame index is written as an extra unnamed column
# (the input file already carries an 'Unnamed: 0' column) — confirm whether
# index=False is wanted here.
f = "book_titles_clusters.csv"
txt_astro.to_csv(f, sep='|')

In [34]:
txt_astro['class_search'] = lines_classes; txt_astro['class'] = y_tfidf 

In [35]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(txt_astro.sort_values(by='txt_len'), classes="display", scrollY="500px", 
     scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0,pk_idref_books,book_date,simplified_title,title,class_search,class,txt_len
Loading... (need help?),,,,,,,


In [None]:
# Split every title into tokens on single spaces.
# The frame displayed earlier has no 'summ_repl' column (that lookup raises
# KeyError), so the lower-cased 'title' column — the text that was vectorized —
# is tokenized instead.
txt_astro['tokens'] = txt_astro['title'].apply(lambda x : x.split(' '))

In [None]:
# One row per cluster: 'class' plus 'tokens', the list of the token lists of
# every title assigned to that cluster.
dfg = (
    txt_astro.groupby('class')['tokens']
    .apply(list)
    .to_frame(name='tokens')
    .reset_index()
)
dfg.head(2)

In [None]:
def most_frequent_tokens(l, min_token_len=4, min_count=4, top_n=20):
    """Return the most frequent tokens of a group of tokenized texts.

    Parameters
    ----------
    l : iterable of iterables of str
        One token list per text.
    min_token_len : int, default 4
        Tokens shorter than this are ignored. The length is measured before
        newline characters are stripped from the token.
    min_count : int, default 4
        Tokens occurring fewer times than this are dropped.
    top_n : int, default 20
        Maximum number of tokens returned.

    Returns
    -------
    dict
        ``{token: count}`` ordered by decreasing count; tokens with equal
        counts keep the order in which they were first seen.
    """
    # Flatten and count in a single pass: the former ``sum(l, [])`` copied the
    # accumulated list on every addition (quadratic in the number of texts).
    counts = Counter(
        tk.replace('\n', '')
        for tokens in l
        for tk in tokens
        if len(tk) >= min_token_len
    )
    # most_common() is sorted by decreasing count and is stable for ties, which
    # makes the top-N selection deterministic.
    frequent = [(tk, n) for tk, n in counts.most_common() if n >= min_count]
    return dict(frequent[:top_n])

In [None]:
dfg['mf_tokens'] = dfg['tokens'].apply(lambda x : most_frequent_tokens(x))

In [None]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(dfg, classes="display", scrollY="400px", 
     scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

### Produce graph

In [None]:
fl = dfg['mf_tokens'][0]; fl

In [None]:
l = dfg['tokens'][1][0]; l[:7]

In [None]:
[i for i in l if i in fl.keys()]

In [None]:
sorted(['William', 'Morgan', 'was', 'an', 'early', 'actuary'])

In [None]:
l =  ['Morgan', 'William', 'actuary', 'an', 'early', 'was']

In [None]:
def tks_graph(l, fl):
    """Build the co-occurrence edge list of frequent tokens for a group of texts.

    Parameters
    ----------
    l : list of list of str
        One token list per text.
    fl : dict
        Frequent tokens; only the keys are used.

    Returns
    -------
    list of [str, str]
        For every text, one ``[a, b]`` pair (``a < b``) per combination of two
        different frequent tokens longer than 3 characters found in that text.
        Pairs of all texts are concatenated, so repeated pairs are kept.
    """
    frequent = list(fl.keys())
    edges = []
    for tokens in l:
        kept = [tk for tk in tokens if len(tk) > 3 and tk in frequent]
        kept.sort()
        # the list is sorted, so a strictly greater partner can only be further right
        edges.extend(
            [first, second]
            for pos, first in enumerate(kept)
            for second in kept[pos + 1:]
            if first < second
        )
    return edges

In [None]:
dfg['tokens_graph'] = dfg.apply(lambda x : tks_graph(x.tokens,x.mf_tokens), axis=1)
dfg.head()

In [None]:
dfg.tokens_graph[:3]

In [None]:
ll = [tuple(e) for e in dfg.loc[0]['tokens_graph']]; # ll

In [None]:
fl = dfg['mf_tokens'][0]; fl

In [None]:
d_eff = dict([(k, {'eff':v}) for k,v in fl.items()]); d_eff

In [None]:
G = nx.Graph()
G.add_edges_from(ll)

In [None]:
G.nodes()

In [None]:
nx.set_node_attributes(G, d_eff)

# inspecter les sommets et leurs attributs
list(G.nodes.data())

In [None]:
nx.get_node_attributes(G, 'eff').values()

In [None]:
[i]

In [None]:
# Draw the co-occurrence graph G; every node is sized by its 'eff' attribute.
fig, ax = plt.subplots(figsize=(12, 9))
pos = nx.kamada_kawai_layout(G)

# Explicit node list so that each size is matched to the node it belongs to,
# even if a node carries no 'eff' attribute.
nodelist = list(G.nodes)
node_sizes = [G.nodes[n].get('eff', 0) for n in nodelist]

nx.draw_networkx_edges(G, pos, alpha=0.1, ax=ax)
nx.draw_networkx_labels(G, pos, horizontalalignment='left', verticalalignment='bottom', ax=ax)
nx.draw_networkx_nodes(G, pos, nodelist=nodelist, node_size=node_sizes, ax=ax)

# tight_layout() only accounts for what is already drawn, so it is called last
# (it was previously called on the still-empty figure).
fig.tight_layout()
plt.show()


In [None]:
# https://engineeringfordatascience.com/posts/matplotlib_subplots/
# https://matplotlib.org/stable/gallery/subplots_axes_and_figures/subplot.html#sphx-glr-gallery-subplots-axes-and-figures-subplot-py

# For every cluster, show
#   1. a bubble chart of its most frequent tokens (x = frequency), and
#   2. the co-occurrence graph of those tokens (node size = frequency).
nrows = len(dfg)

for i in range(nrows):
    row = dfg.iloc[i]
    fl = row['mf_tokens']
    if not fl:
        # no token reached the frequency threshold: there is nothing to plot
        print(f'Cluster {i}: no frequent tokens')
        continue

    ### 1. bubble chart
    val = pd.DataFrame.from_dict(fl, orient='index')
    val.columns = ['eff']
    val['y_val'] = [1]*len(val)

    fig, ax = plt.subplots(figsize=(14, 3))
    sns.scatterplot(data=val, x='eff', y='y_val', alpha=0.5, ax=ax)
    # marker size grows with the square of the frequency
    ax.collections[0].set_sizes(val['eff']**2)

    lowest, highest = val['eff'].min(), val['eff'].max()
    ax.set_xlim(lowest - lowest/10, highest + highest/20)
    ax.set_ylim(0, 2)
    ax.invert_xaxis()

    # Label every bubble with its token. Iterating over items() avoids integer
    # keys on the token-indexed Series (positional fallback is deprecated).
    for token, eff in val['eff'].items():
        ax.text(eff, 1 + 0.2, str(token).replace('{}', ''),
                rotation=45, horizontalalignment='center',
                size=10, color='black')

    ax.set_title(f'Cluster {i}', fontsize=18)
    fig.tight_layout()
    plt.show()

    ### 2. co-occurrence graph
    ll = [tuple(e) for e in row['tokens_graph']]
    d_eff = {k: {'eff': v} for k, v in fl.items()}
    G = nx.Graph()
    G.add_edges_from(ll)
    nx.set_node_attributes(G, d_eff)

    # tokens present in the graph, most frequent first
    d = sorted(nx.get_node_attributes(G, 'eff').items(), key=lambda x: x[1], reverse=True)
    print(d)

    fig, graph_ax = plt.subplots(figsize=(6, 4))
    pos = nx.kamada_kawai_layout(G)
    # explicit node list keeps each size matched to its node
    nodelist = list(G.nodes)
    node_sizes = [G.nodes[n].get('eff', 0) for n in nodelist]
    nx.draw_networkx_edges(G, pos, alpha=0.1, ax=graph_ax)
    nx.draw_networkx_labels(G, pos, horizontalalignment='left', verticalalignment='bottom', ax=graph_ax)
    nx.draw_networkx_nodes(G, pos, nodelist=nodelist, node_size=node_sizes, ax=graph_ax)

    # the title goes on the graph's own axes (it used to be set on the bubble
    # chart axes, which had already been shown)
    graph_ax.set_title(f'Cluster {i}', fontsize=18)
    fig.tight_layout()
    plt.show()


    


### Les 20 mots les plus fréquents par classe

In [None]:
# https://engineeringfordatascience.com/posts/matplotlib_subplots/
# https://matplotlib.org/stable/gallery/subplots_axes_and_figures/subplot.html#sphx-glr-gallery-subplots-axes-and-figures-subplot-py

# One large bubble chart per cluster showing its most frequent tokens; the
# x-axis label spells out the token -> frequency mapping.
nrows = len(dfg)

for i in range(nrows):
    freqs = dfg.iloc[i]['mf_tokens']
    if not freqs:
        # no token reached the frequency threshold: there is nothing to plot
        print(f'Cluster {i}: no frequent tokens')
        continue

    val = pd.DataFrame.from_dict(freqs, orient='index')
    val.columns = ['eff']
    val['y_val'] = [1]*len(val)

    fig, ax = plt.subplots(figsize=(30, 6))
    sns.scatterplot(data=val, x='eff', y='y_val', alpha=0.5, ax=ax)

    ax.set_xlabel(str(val['eff'].to_dict()), fontsize=14, wrap=True)

    # marker size grows with the square of the frequency
    ax.collections[0].set_sizes(val['eff']**2)

    lowest, highest = val['eff'].min(), val['eff'].max()
    ax.set_xlim(lowest - lowest/10, highest + highest/20)
    ax.set_ylim(0, 2)
    ax.invert_xaxis()

    # Label every bubble with its token. Iterating over items() avoids integer
    # keys on the token-indexed Series (positional fallback is deprecated).
    for token, eff in val['eff'].items():
        ax.text(eff, 1 + 0.2, str(token).replace('{}', ''),
                rotation=45, horizontalalignment='center',
                size=16, color='black')

    ax.set_title(f'Cluster {i}', fontsize=18)

    fig.tight_layout()
    plt.show()


    


## Préparation 

In [None]:
# Frequency table of one cluster (row 7), used to prototype the bubble chart.
row = dfg.iloc[7]
# dfg has two shapes in this notebook: with a 'mf_tokens' column (the first
# value of the row is then the class id, not the frequencies), or with a single
# column that holds the frequency dict. Support both.
freqs = row['mf_tokens'] if 'mf_tokens' in row.index else row.values[0]

val = pd.DataFrame.from_dict(freqs, orient='index')
val.columns = ['eff']
val['y_val'] = [1]*len(val)


In [None]:
# Prototype of the bubble chart for the first cluster (row 0).
row = dfg.iloc[0]
# dfg has two shapes in this notebook: with a 'mf_tokens' column (the first
# value of the row is then the class id, not the frequencies), or with a single
# column that holds the frequency dict. Support both.
freqs = row['mf_tokens'] if 'mf_tokens' in row.index else row.values[0]

val = pd.DataFrame.from_dict(freqs, orient='index')
val.columns = ['eff']
val['y_val'] = [1]*len(val)

fig, ax = plt.subplots(figsize=(20, 4))
sns.scatterplot(data=val, x='eff', y='y_val', alpha=0.5, ax=ax)
ax.set_xlabel(str(val['eff'].to_dict()))
# marker size grows with the square of the frequency
ax.collections[0].set_sizes(val['eff']**2)

lowest, highest = val['eff'].min(), val['eff'].max()
ax.set_xlim(lowest - lowest/10, highest + highest/20)
ax.set_ylim(0, 2)
ax.invert_xaxis()

# Label every bubble with its token. Iterating over items() avoids integer keys
# on the token-indexed Series (positional fallback is deprecated).
for token, eff in val['eff'].items():
    ax.text(eff, 1 + 0.2, str(token).replace('{}', ''),
            rotation=45, horizontalalignment='center',
            size=12, color='black')


In [None]:
dfg = pd.DataFrame(txt_astro.groupby('class')['tokens'].apply(list))
dfg.columns = ['tokens']

In [None]:
# dfg.head(8), type(dfg)

In [None]:
dfg['tokens'] = dfg['tokens'].apply(lambda x : most_frequent_tokens(x))

In [None]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(dfg, classes="display", scrollY="400px", 
     scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

In [None]:
# Build every pair [i, v] with i < v from the items of `dct`, then preview the
# first five pairs.
# NOTE(review): `dct` is not assigned by any cell visible in this notebook (it
# only appears as a local inside prepare_graph) — on a fresh kernel this cell
# raises NameError unless `dct` is defined elsewhere; confirm its source.
l_dct= []
for i in dct:
    for v in dct:
        if i < v:
            l_dct.append([i,v])
l_dct[:5]            

In [None]:
G = nx.Graph()
G.add_edges_from(l_dct)

In [None]:
pos = nx.spring_layout(G)
fig = plt.figure(figsize = (30,20))
fig.tight_layout()
nx.draw_networkx(G, pos, node_size=10)

In [None]:
### 
# https://pyvis.readthedocs.io/en/latest/tutorial.html#using-pyvis-within-jupyter-notebook
nt = Network( notebook=True) #, filter_menu=True)
# populates the nodes and edges data structures
nt.from_nx(G)
nt.show('pictures/nx.html' )

In [None]:
def prepare_graph(val):
    dct = {val}
    