In [1]:
%load_ext autoreload
%autoreload 2

### Preprocessing the Reuters corpus

In [4]:
# Dump the raw text of NLTK's Reuters corpus into a local file named "reuters".
# That file name is what the VRT conversion step further down is given as input.
from nltk.corpus import reuters

# A context manager guarantees the handle is flushed and closed even if
# reuters.raw() or the write itself raises part-way through.
with open("reuters", "w") as fichier:
    fichier.write(reuters.raw())

In [7]:
import sinr.text.preprocess as ppcs

In [90]:
# If required, download and install the spaCy model used for preprocessing
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
[K     |████████████████████████████████| 400.7 MB 21 kB/s  eta 0:00:012     |██████████████████▌             | 231.3 MB 4.3 MB/s eta 0:00:40
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


#### Creating the VRT file that can be used to experiment with different kinds of preprocessing

In [19]:

# Describe the raw "reuters" text file as an English news corpus and convert it
# to a VRT-style file. The second argument "." is the output location: the log
# below reports the result being written as reuters.vrt in the notebook directory.
# n_jobs=8 is presumably the number of parallel workers — confirm in sinr.text.preprocess.
vrt_maker = ppcs.VRTMaker(ppcs.Corpus(ppcs.Corpus.REGISTER_NEWS, ppcs.Corpus.LANGUAGE_EN, "reuters"), ".", n_jobs=8)
vrt_maker.do_txt_to_vrt()

2023-01-23 11:25:55,993 - do_txt_to_vrt - INFO - 191926lines to preprocess
100%|█████████████████████████████████| 191926/191926 [01:26<00:00, 2211.48it/s]
2023-01-23 11:27:22,794 - do_txt_to_vrt - INFO - VRT-style file written in /home/dugue/Depotgit/sinr/notebooks/reuters.vrt


#### Using the vrt file to create a lemmatized corpus with a min_freq filter, and keeping named entities (en)

In [50]:
# Read the VRT file back as lemmatized text with a frequency threshold of 30.
# Gotcha: `en=True` stands for "named entities" (see the heading above), not "English".
# NOTE(review): min_freq=30 presumably drops tokens seen fewer than 30 times — confirm.
sentences = ppcs.extract_text("reuters.vrt", lemmatize=True, min_freq=30, en=True)

100%|█████████████████████████████| 1936485/1936485 [00:12<00:00, 158872.53it/s]


#### Building a cooccurrence matrix from the extracted corpus

In [51]:
from sinr.text.cooccurrence import Cooccurrence
from sinr.text.pmi import pmi_filter
# Fit the cooccurrence object on the extracted sentences.
# window=20 is presumably the context-window size in tokens — confirm.
c = Cooccurrence()
c.fit(sentences, window=20)
# Overwrite the fitted matrix with the output of pmi_filter
# (presumably a PMI-based pruning of weak cooccurrences — confirm in sinr.text.pmi).
c.matrix = pmi_filter(c.matrix)
# Persist the result; the SINr cell reads "matrix.pk" back with load_from_cooc_pkl.
c.save("matrix.pk")

### Creating a SINr object from the cooccurrence matrix pickle, and detecting communities on the graph extracted from this matrix

In [52]:
import sinr.graph_embeddings as ge
# Build the graph from the saved cooccurrence file "matrix.pk".
# NOTE(review): this variable has the same name as the `sinr` package. The imports
# in this notebook all use `as`/`from` forms, so nothing clashes, but a plain
# `import sinr` executed later would rebind the name to the module.
sinr = ge.SINr.load_from_cooc_pkl("matrix.pk")
# Detect communities on that graph; the summary table below is printed by this call.
# gamma=10 is presumably the resolution parameter of the community detection — confirm.
communities = sinr.detect_communities(gamma=10)

2023-01-23 11:34:08,480 - load_from_cooc_pkl - INFO - Building Graph.
2023-01-23 11:34:08,481 - load_pkl_text - INFO - Loading cooccurrence matrix and dictionary.
2023-01-23 11:34:08,482 - load_pkl_text - INFO - Finished loading data.
2023-01-23 11:34:08,546 - load_from_cooc_pkl - INFO - Finished building graph.
2023-01-23 11:34:08,547 - detect_communities - INFO - Detecting communities.
2023-01-23 11:34:08,578 - detect_communities - INFO - Finished detecting communities.


Communities detected in 0.02770 [s]
solution properties:
-------------------  ------------
# communities           392
min community size        1
max community size       23
avg. community size       5.50765
imbalance                 3.83333
edge cut             721430
edge cut (portion)        4.70674
modularity                0.11869
-------------------  ------------


### Extracting the embeddings using the SINr approach

In [53]:
# Compute the embeddings from the detected communities (the log reports an "NFM" step).
# Nothing is displayed as a cell result; the embeddings are presumably stored on
# the `sinr` object, which is what the model builder below receives — confirm.
sinr.extract_embeddings(communities)

2023-01-23 11:34:14,932 - extract_embeddings - INFO - Extracting embeddings.
2023-01-23 11:34:14,932 - extract_embeddings - INFO - Applying NFM.
2023-01-23 11:34:14,933 - get_nfm_embeddings - INFO - Starting NFM
2023-01-23 11:34:15,353 - extract_embeddings - INFO - NFM successfully applied.
2023-01-23 11:34:15,353 - extract_embeddings - INFO - Finished extracting embeddings.


### An InterpretableWordsModelBuilder is used to extract the model to play with

In [74]:
# Build the queryable model from the fitted SINr object.
# "reuters_preprocessed" is presumably the model's name and n_neighbors=15 the
# neighbourhood size used by most_similar — confirm against the builder's signature.
sinr_vectors = ge.InterpretableWordsModelBuilder(sinr, "reuters_preprocessed", n_jobs=8, n_neighbors=15).build()

#### Which words are most similar to "grow" in our corpus?

In [75]:
# Returns a dict holding the query word and a list of (word, similarity) pairs.
# Gotcha: as displayed below, the keys carry a trailing space ('object ', 'neighbors ').
# 14 neighbours are listed although n_neighbors=15 was requested — presumably the
# query word itself is counted; confirm.
sinr_vectors.most_similar("grow")

{'object ': 'grow',
 'neighbors ': [('expand', 0.68),
  ('domestic', 0.67),
  ('rapidly', 0.62),
  ('improve', 0.61),
  ('reduce', 0.6),
  ('this_year', 0.6),
  ('economy', 0.59),
  ('demand', 0.58),
  ('external', 0.58),
  ('turn', 0.58),
  ('huge', 0.57),
  ('consumption', 0.57),
  ("West_Germany's", 0.56),
  ('fast', 0.54)]}

#### Which community does "grow" belong to?

In [76]:
# Returns the dimension (community) index of "grow" together with its
# descriptors, shown below as (value, word) pairs.
sinr_vectors.get_dimension_descriptors("grow")

{'dimension': 359, 'descriptors': [(0.08, 'fast'), (0.03, 'grow'), (0.03, 'internal'), (0.02, 'increasingly')]}

### Which dimensions/communities are most useful to describe "grow", and which words belong to the corresponding communities?

In [77]:
# topk_dim=3 yields three dimensions for "grow"; topk_val=5 yields five
# (value, word) descriptors for each of them, as shown in the output below.
sinr_vectors.get_obj_descriptors("grow", topk_dim=3,topk_val=5)

[{'dimension': 246,
  'value': 0.06423034330011075,
  'descriptors': [(0.25, 'stimulate'),
   (0.16, 'domestic'),
   (0.15, 'demand'),
   (0.09, 'boost'),
   (0.08, 'economy')]},
 {'dimension': 90,
  'value': 0.06090808416389812,
  'descriptors': [(0.48, 'tariffs'),
   (0.31, 'widen'),
   (0.31, 'merchandise'),
   (0.27, 'narrow'),
   (0.25, 'friction')]},
 {'dimension': 194,
  'value': 0.044296788482835,
  'descriptors': [(0.3, 'gross'),
   (0.18, 'national'),
   (0.12, 'product'),
   (0.1, 'refined'),
   (0.1, 'wood')]}]

#### Which words have high values on the dimension corresponding to the community "grow" belongs to?

In [92]:
# Returns ten (value, word) pairs for the dimension of "grow". Per the output
# below, the list includes words that are not among that dimension's descriptors.
sinr_vectors.get_dimension_stereotypes("grow", topk=10)

{'dimension': 359, 'stereotypes': [(0.08, 'fast'), (0.04, 'door'), (0.03, 'M-1'), (0.03, 'courier'), (0.03, 'dependence'), (0.03, 'grow'), (0.03, 'internal'), (0.03, 'ambassador'), (0.03, 'anger'), (0.03, 'restaurant')]}

It can be compared to the words that constitute the community

In [93]:
# Same query as before with topk=10: only four descriptors come back,
# presumably because the community has just four members — confirm.
sinr_vectors.get_dimension_descriptors("grow", topk=10)

{'dimension': 359, 'descriptors': [(0.08, 'fast'), (0.03, 'grow'), (0.03, 'internal'), (0.02, 'increasingly')]}

### Which 3 words have the highest values on each of the 5 dimensions most useful to describe "grow"?

In [94]:
# topk_dim=5 yields five dimensions for "grow"; topk_val=3 yields three
# (value, word) stereotypes for each, as shown in the output below.
sinr_vectors.get_obj_stereotypes("grow", topk_dim=5, topk_val=3)

[{'dimension': 246,
  'value': 0.06423034330011075,
  'stereotypes': [(0.25, 'stimulate'), (0.16, 'domestic'), (0.15, 'demand')]},
 {'dimension': 90,
  'value': 0.06090808416389812,
  'stereotypes': [(0.48, 'tariffs'), (0.31, 'merchandise'), (0.31, 'widen')]},
 {'dimension': 194,
  'value': 0.044296788482835,
  'stereotypes': [(0.3, 'gross'), (0.18, 'national'), (0.12, 'product')]},
 {'dimension': 277,
  'value': 0.042081949058693245,
  'stereotypes': [(0.09, 'closing'),
   (0.08, 'first_quarter'),
   (0.06, 'widely')]},
 {'dimension': 359,
  'value': 0.028792912513842746,
  'stereotypes': [(0.08, 'fast'), (0.04, 'door'), (0.03, 'M-1')]}]

It can be compared with the words that constitute the communities representing these dimensions

In [81]:
# Returns, for each of the five dimensions, both the 'stereotypes' and the
# 'descriptors' lists side by side so they can be compared directly.
sinr_vectors.get_obj_stereotypes_and_descriptors("grow", topk_dim=5, topk_val=3)

[{'dimension': 246,
  'value': 0.06423034330011075,
  'stereotypes': [(0.25, 'stimulate'), (0.16, 'domestic'), (0.15, 'demand')],
  'descriptors': [(0.25, 'stimulate'), (0.16, 'domestic'), (0.15, 'demand')]},
 {'dimension': 90,
  'value': 0.06090808416389812,
  'stereotypes': [(0.48, 'tariffs'), (0.31, 'merchandise'), (0.31, 'widen')],
  'descriptors': [(0.48, 'tariffs'), (0.31, 'widen'), (0.31, 'merchandise')]},
 {'dimension': 194,
  'value': 0.044296788482835,
  'stereotypes': [(0.3, 'gross'), (0.18, 'national'), (0.12, 'product')],
  'descriptors': [(0.3, 'gross'), (0.18, 'national'), (0.12, 'product')]},
 {'dimension': 277,
  'value': 0.042081949058693245,
  'stereotypes': [(0.09, 'closing'),
   (0.08, 'first_quarter'),
   (0.06, 'widely')],
  'descriptors': [(0.09, 'closing'), (0.06, 'widely'), (0.06, 'soon')]},
 {'dimension': 359,
  'value': 0.028792912513842746,
  'stereotypes': [(0.08, 'fast'), (0.04, 'door'), (0.03, 'M-1')],
  'descriptors': [(0.08, 'fast'), (0.03, 'grow'), (0