In [1]:
# Development convenience: with autoreload mode 2, edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Building a co-occurrence matrix from textual data, using the Reuters corpus

In [2]:
# One-off corpus download (uncomment on first run). The cell reads the Reuters
# corpus, so that is the resource to fetch:
# import nltk
# nltk.download('reuters')
# NOTE(review): reuters.sents() presumably also needs NLTK's sentence-tokenizer
# data ('punkt') - confirm on a fresh environment.
from sinr.text.cooccurrence import Cooccurrence
from sinr.text.pmi import pmi_filter
from nltk.corpus import brown  # NOTE(review): not used in the cells visible here
from nltk.corpus import reuters
# Fit the co-occurrence counts on the Reuters sentences.
# NOTE(review): window=100 presumably covers whole sentences rather than a
# narrow context - confirm against Cooccurrence.fit.
c = Cooccurrence()
c.fit(reuters.sents(), window=100)
# Overwrite the fitted matrix with its PMI-filtered version, then save the
# object to "matrix.pk", which the graph-building cell loads.
c.matrix = pmi_filter(c.matrix)
c.save("matrix.pk")

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


### Creating a SINr object from the cooccurrence matrix pickle, and detecting communities on the graph extracted from this matrix

In [3]:
import sinr.graph_embeddings as ge
# Build the SINr graph from the co-occurrence pickle saved as "matrix.pk".
sinr = ge.SINr.load_from_cooc_pkl("matrix.pk")
# NOTE(review): gamma is presumably the resolution of the community detection
# (larger -> more, smaller communities) - confirm in the SINr documentation.
# The recorded run with gamma=50 reports 4454 communities.
communities = sinr.detect_communities(gamma=50)

2023-01-08 16:40:22,004 - load_from_cooc_pkl - INFO - Building Graph.
2023-01-08 16:40:22,004 - load_pkl_text - INFO - Loading cooccurrence matrix and dictionary.
2023-01-08 16:40:22,023 - load_pkl_text - INFO - Finished loading data.
2023-01-08 16:40:23,457 - load_from_cooc_pkl - INFO - Finished building graph.
2023-01-08 16:40:23,461 - detect_communities - INFO - Detecting communities.
  warn("networkit.Timer is deprecated, will be removed in future updates.")
2023-01-08 16:40:24,213 - detect_communities - INFO - Finished detecting communities.


Communities detected in 0.67973 [s]
solution properties:
-------------------  -------------
# communities        4454
min community size      1
max community size    138
avg. community size     9.33969
imbalance              13.8
edge cut                2.4586e+07
edge cut (portion)      6.90061
modularity              0.0264611
-------------------  -------------


### Extracting the embeddings using the SINr approach

In [4]:
# Extract the embeddings from the detected communities (the log shows this runs
# the NFM step). The return value is not captured: later cells pass `sinr`
# itself to the model builders.
sinr.extract_embeddings(communities)

2023-01-08 16:40:24,233 - extract_embeddings - INFO - Extracting embeddings.
2023-01-08 16:40:24,234 - extract_embeddings - INFO - Applying NFM.
2023-01-08 16:40:24,234 - get_nfm_embeddings - INFO - Starting NFM
  weighted_membership = membership_matrix.multiply(np.reciprocal(community_weights).astype('float')) # 1/community_weight for each column of the membership matrix
2023-01-08 16:40:36,656 - extract_embeddings - INFO - NFM successfully applied.
2023-01-08 16:40:36,656 - extract_embeddings - INFO - Finished extracting embeddings.


### Using the ModelBuilder to get SINrVectors, a model that one can play with. Here we decide to use the node recall embedding. No additional information (vocabulary, communities) is attached, so objects are referred to by integer ids.

In [5]:
# Model carrying only the node-recall embeddings (with_embeddings_nr); neither
# vocabulary nor communities are attached. "reuters" is the model's name.
# NOTE(review): n_neighbors presumably sizes the neighbour search used by
# most_similar - confirm.
sinr_vectors = ge.ModelBuilder(sinr, "reuters", n_jobs=8, n_neighbors=5).with_embeddings_nr().build()

In [6]:
# No vocabulary is attached to this model, so objects are addressed by integer
# id; the neighbours also come back as integer ids.
sinr_vectors.most_similar(1)

{'object ': 1,
 'neighbors ': [(41261, 0.6920489140338091),
  (26133, 0.6775287038469705),
  (40033, 0.6722449348486441),
  (27788, 0.6705612193241263)]}

### Here, the ModelBuilder is used to keep the interpretability of the approach (using with_communities), and it also allows using words instead of integers (with_vocabulary)

In [7]:
# Rebuild the model with the vocabulary (query by word instead of integer id)
# and the communities (keeps the dimensions interpretable) attached.
# Rebinds `sinr_vectors`, replacing the embeddings-only model.
sinr_vectors = ge.ModelBuilder(sinr, "reuters", n_jobs=8, n_neighbors=15).with_embeddings_nr().with_vocabulary().with_communities().build()

In [8]:
# Nearest neighbours of a word, returned as (word, score) pairs in decreasing
# score order.
sinr_vectors.most_similar("trading")

{'object ': 'trading',
 'neighbors ': [('traded', 0.6048531603246535),
  ('closed', 0.5736167211531841),
  ('futures', 0.5428829157227548),
  ('Friday', 0.5148830969733745),
  ('Tuesday', 0.5102848661049046),
  ('Stock', 0.4864170645885969),
  ('put', 0.4834804475753083),
  ('Thursday', 0.45556018105821106),
  ('trades', 0.4537719078191018),
  ('contracts', 0.44670604146181847),
  ('opening', 0.43885236715964204),
  ('London', 0.4355552892309089),
  ('Wednesday', 0.43333758062672867),
  ('firmer', 0.43260926214252793)]}

### An InterpretableWordsModelBuilder can also be used for shorter code: there is no "with_..." method to call, as those steps are done by default

In [30]:
# Shortcut builder: no explicit with_...() calls are needed. The recorded
# most_similar("trading") output is the same as with the explicit
# with_embeddings_nr().with_vocabulary().with_communities() chain.
sinr_vectors = ge.InterpretableWordsModelBuilder(sinr, "reuters", n_jobs=8, n_neighbors=15).build()

In [31]:
# Same query on the shortcut-built model, to compare with the explicit builder.
sinr_vectors.most_similar("trading")

{'object ': 'trading',
 'neighbors ': [('traded', 0.6048531603246535),
  ('closed', 0.5736167211531841),
  ('futures', 0.5428829157227548),
  ('Friday', 0.5148830969733745),
  ('Tuesday', 0.5102848661049046),
  ('Stock', 0.4864170645885969),
  ('put', 0.4834804475753083),
  ('Thursday', 0.45556018105821106),
  ('trades', 0.4537719078191018),
  ('contracts', 0.44670604146181847),
  ('opening', 0.43885236715964204),
  ('London', 0.4355552892309089),
  ('Wednesday', 0.43333758062672867),
  ('firmer', 0.43260926214252793)]}

### What is the community which "liberal" belongs to?

In [41]:
# Without topk, this returns the plain list of words in the community that
# contains "liberal".
sinr_vectors.get_dimension_descriptors("liberal")

['alliance',
 'Democrats',
 'legislators',
 'signal',
 'sought',
 'Republicans',
 'defeat',
 'winning',
 'SPARKS',
 'revolt',
 'oppose',
 'minute',
 'warmly',
 'liberal',
 'conservative',
 'TARGETING']

In [40]:
# With topk, the result changes shape: the 5 highest-scoring members of the
# community, as (score, word) pairs in decreasing score order.
sinr_vectors.get_dimension_descriptors("liberal", topk=5)

[(0.1644736842105263, 'revolt'),
 (0.13157894736842107, 'alliance'),
 (0.11979166666666667, 'defeat'),
 (0.10697674418604651, 'legislators'),
 (0.1033434650455927, 'Republicans')]

### What are the dimensions useful to describe "liberal", and what are the words that belong to the corresponding communities?

In [39]:
# topk_dim: how many of the word's strongest dimensions to report (3 here);
# topk_val: how many (score, word) descriptors to list per dimension (5 here).
sinr_vectors.get_obj_descriptors("liberal", topk_dim=3,topk_val=5)

[{'dimension': 2863,
  'value': 0.09036144578313254,
  'descriptors': [(0.1644736842105263, 'revolt'),
   (0.13157894736842107, 'alliance'),
   (0.11979166666666667, 'defeat'),
   (0.10697674418604651, 'legislators'),
   (0.1033434650455927, 'Republicans')]},
 {'dimension': 2606,
  'value': 0.045180722891566265,
  'descriptors': [(0.3125, 'Studebaker'),
   (0.3125, 'Chevy'),
   (0.3125, 'Dealerships'),
   (0.23529411764705882, 'amber'),
   (0.23529411764705882, 'coloured')]},
 {'dimension': 2586,
  'value': 0.04216867469879518,
  'descriptors': [(0.4148936170212766, 'Formosa'),
   (0.40259740259740256, 'Misiones'),
   (0.3773584905660377, 'Chaco'),
   (0.36813186813186816, 'Rios'),
   (0.36792452830188677, 'Corrientes')]}]

### What are the words that have high values on the dimension corresponding to the community "wine" belongs to?

In [42]:
# The 10 words with the highest values on the dimension of the community that
# contains "wine"; these need not be members of that community.
sinr_vectors.get_dimension_stereotypes("wine", topk=10)

['Beam',
 'Distilling',
 'SPIRITS',
 'DISTILLERS',
 'DR',
 'Vineyards',
 'MCGINNESS',
 'whiskey',
 'Grandad',
 'gin']

It can be compared to the words that constitute the community

In [43]:
# Members of the community containing "wine", to compare with the stereotypes
# of the same dimension.
sinr_vectors.get_dimension_descriptors("wine")

['Distillers',
 'DR',
 'Beam',
 'DISTILLERS',
 'Distilling',
 'Enron',
 'wine',
 'AMB',
 'SPIRITS',
 'spirits',
 'netted',
 '545',
 'Vineyards']

### What are the 3 words that have high values on the 5 dimensions that are useful to describe "liberal" ?

In [46]:
# For the 5 strongest dimensions of "liberal", list the 3 words with the
# highest values on each dimension.
sinr_vectors.get_obj_stereotypes("liberal", topk_dim=5, topk_val=3)

[{'dimension': 2863,
  'value': 0.09036144578313254,
  'stereotypes': ['revolt', 'alliance', 'defeat']},
 {'dimension': 2606,
  'value': 0.045180722891566265,
  'stereotypes': ['Dealerships', 'Chevy', 'Studebaker']},
 {'dimension': 2586,
  'value': 0.04216867469879518,
  'stereotypes': ['Formosa', 'Misiones', 'Chaco']},
 {'dimension': 386,
  'value': 0.027108433734939756,
  'stereotypes': ['Kippur', 'Yom', 'pullout']},
 {'dimension': 4447,
  'value': 0.018072289156626505,
  'stereotypes': ['Rumours', 'acknowledging', 'responsibly']}]

It can be compared with the words that constitute the communities representing these dimensions

In [48]:
# Same selection of dimensions, returning for each one both the stereotypes
# and the (score, word) descriptors of the underlying community.
sinr_vectors.get_obj_stereotypes_and_descriptors("liberal", topk_dim=5, topk_val=3)

[{'dimension': 2863,
  'value': 0.09036144578313254,
  'stereotypes': ['revolt', 'alliance', 'defeat'],
  'descriptors': [(0.1644736842105263, 'revolt'),
   (0.13157894736842107, 'alliance'),
   (0.11979166666666667, 'defeat')]},
 {'dimension': 2606,
  'value': 0.045180722891566265,
  'stereotypes': ['Dealerships', 'Chevy', 'Studebaker'],
  'descriptors': [(0.3125, 'Studebaker'),
   (0.3125, 'Chevy'),
   (0.3125, 'Dealerships')]},
 {'dimension': 2586,
  'value': 0.04216867469879518,
  'stereotypes': ['Formosa', 'Misiones', 'Chaco'],
  'descriptors': [(0.4148936170212766, 'Formosa'),
   (0.40259740259740256, 'Misiones'),
   (0.3773584905660377, 'Chaco')]},
 {'dimension': 386,
  'value': 0.027108433734939756,
  'stereotypes': ['Kippur', 'Yom', 'pullout'],
  'descriptors': [(0.15789473684210525, 'Yom'),
   (0.15789473684210525, 'Kippur'),
   (0.14414414414414412, 'pullout')]},
 {'dimension': 4447,
  'value': 0.018072289156626505,
  'stereotypes': ['Rumours', 'acknowledging', 'responsibly'

### Saving and loading SINrVectors objects

In [49]:
# Persist the model to disk.
# NOTE(review): presumably pickled under the model name ("reuters") given to
# the builder - confirm where the file is written.
sinr_vectors.save()

In [50]:
# Create a new SINrVectors under the same name as the saved model: the name is
# what load() uses to find the pickle, so it must match.
sinr_vectors_new = ge.SINrVectors("reuters")
sinr_vectors_new.load()

In [51]:
# Round-trip check: the reloaded model should give the same answer as the
# model that was saved.
sinr_vectors_new.get_obj_stereotypes_and_descriptors("liberal", topk_dim=5, topk_val=3)

[{'dimension': 2863,
  'value': 0.09036144578313254,
  'stereotypes': ['revolt', 'alliance', 'defeat'],
  'descriptors': [(0.1644736842105263, 'revolt'),
   (0.13157894736842107, 'alliance'),
   (0.11979166666666667, 'defeat')]},
 {'dimension': 2606,
  'value': 0.045180722891566265,
  'stereotypes': ['Dealerships', 'Chevy', 'Studebaker'],
  'descriptors': [(0.3125, 'Studebaker'),
   (0.3125, 'Chevy'),
   (0.3125, 'Dealerships')]},
 {'dimension': 2586,
  'value': 0.04216867469879518,
  'stereotypes': ['Formosa', 'Misiones', 'Chaco'],
  'descriptors': [(0.4148936170212766, 'Formosa'),
   (0.40259740259740256, 'Misiones'),
   (0.3773584905660377, 'Chaco')]},
 {'dimension': 386,
  'value': 0.027108433734939756,
  'stereotypes': ['Kippur', 'Yom', 'pullout'],
  'descriptors': [(0.15789473684210525, 'Yom'),
   (0.15789473684210525, 'Kippur'),
   (0.14414414414414412, 'pullout')]},
 {'dimension': 4447,
  'value': 0.018072289156626505,
  'stereotypes': ['Rumours', 'acknowledging', 'responsibly'