In [16]:
import json
import numpy as np
from pprint import pprint
import requests
import nltk.data
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

In [17]:
class JSONObject:
    """Attribute-style view over a decoded JSON object.

    Meant to be passed as ``object_hook`` to the ``json`` decoder: each decoded
    mapping becomes the instance's attribute dictionary, so ``{"k": 1}`` is
    read back as ``instance.k``.
    """

    def __init__(self, d: dict) -> None:
        # Adopt the mapping itself (no copy) as the attribute namespace.
        self.__dict__ = d

# Load the article dump. Every JSON object is decoded into a JSONObject so its
# fields can be read as attributes (e.g. `obj[0].body_fr` further down).
# The encoding is pinned to UTF-8 so accented French text is decoded the same
# way whatever the platform's locale default is.
with open("articles_Accuracy.json", "r", encoding="utf-8") as read_file:
    data = read_file.read()
    obj = json.loads(data, object_hook=JSONObject)

In [18]:
from nltk.corpus import stopwords
# Start from NLTK's French stop-word list, then extend it with hand-picked
# function words grouped by category.
stopworddic = set(stopwords.words('french'))

# Determiners, possessives and related short function words.
# NOTE(review): 'notres' / 'votres' are not standard French forms (the plural
# possessives are 'nos' / 'vos') — confirm whether they are intentional.
article = [
    'a', 'au', 'aux', 'un', 'une', 'le', 'la', 'les', 'de', 'des',
    'ce', 'cet', 'cette', 'ces',
    'son', 'sa', 'ses', 'leur', 'leurs',
    'mon', 'ma', 'mes', 'ton', 'ta', 'tes',
    'notre', 'notres', 'votre', 'votres',
]

# Personal and adverbial pronouns.
pronom = [
    'je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles',
    'on', 'y', 'en', 'se',
]

# Coordinating conjunctions.
coordination = ['mais', 'ou', 'et', 'donc', 'or', 'ni', 'car']

# The verb "aller" and some of its conjugated forms.
v = ['aller', 'vais', 'vas', 'va', 'allez', 'allions', 'vont']

# set.update accepts any iterables, so the lists can be merged in directly.
stopworddic.update(article, pronom, coordination, v)

In [19]:
lemmatizer = FrenchLefffLemmatizer()


def lemma(word):
    """Lemmatize ``word``, preferring a noun reading, then a verb reading.

    The word is first looked up with the 'all' mode and the second element of
    every returned entry is collected as its tag (assumes entries are
    (lemma, tag) pairs — TODO confirm against the lemmatizer's documentation).
    If the tag 'nc' is present the word is lemmatized as 'n'; otherwise, if
    'v' is present, it is lemmatized as 'v'; otherwise the lemmatizer's
    default mode is used.

    Returns whatever ``lemmatizer.lemmatize`` returns for the selected mode.
    """
    found_tags = {entry[1] for entry in lemmatizer.lemmatize(word, 'all')}

    # Ordered by priority: (tag reported by the 'all' lookup, pos to request).
    for tag, pos in (('nc', 'n'), ('v', 'v')):
        if tag in found_tags:
            return lemmatizer.lemmatize(word, pos)

    return lemmatizer.lemmatize(word)

In [20]:
import re
# Walk the cleaning pipeline on the first article only, keeping every
# intermediate list so the effect of each step can be inspected.
rawtokens = nltk.word_tokenize(obj[0].body_fr)

# Keep only tokens containing at least one (possibly accented) letter, lowercased.
metokens = [w.lower() for w in rawtokens if re.search('[a-zA-ZàâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+', w)]

# Strip elided prefixes ("l'", "d'", "qu'", ...). Both the straight apostrophe
# and the typographic one (U+2019) are handled. The loop variable is
# deliberately not named `str`: in a notebook that would shadow the builtin
# for every cell executed afterwards.
latokens = []
for token in metokens:
    token = re.sub(r"\w+['\u2019]", '', token)
    # A bare elision such as "l'" leaves nothing behind; drop it instead of
    # passing an empty token on to the following steps.
    if token:
        latokens.append(token)

# Remove stop words.
tokens = [w for w in latokens if w not in stopworddic]

In [21]:
# Run the full cleaning pipeline on every article:
# tokenize -> keep word-like tokens -> strip elisions -> drop stop words -> lemmatize.

# Compiled once, outside the loop.
letter_re = re.compile('[a-zA-ZàâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+')
# Elided prefix ("l'", "d'", "qu'", ...), with a straight or typographic (U+2019) apostrophe.
elision_re = re.compile(r"\w+['\u2019]")

tokenized_data = []
for doc in obj:
    # Skip articles whose French body is empty or at most two characters long.
    if len(doc.body_fr) > 2:
        text = nltk.word_tokenize(doc.body_fr)
        # Keep only tokens containing at least one letter, lowercased.
        text = [w.lower() for w in text if letter_re.search(w)]
        latokens = []
        # The loop variable is deliberately not named `str`: in a notebook that
        # would shadow the builtin for every cell executed afterwards.
        for token in text:
            token = elision_re.sub('', token)
            # A bare elision such as "l'" leaves nothing behind; drop it.
            if token:
                latokens.append(token)
        tokens = [w for w in latokens if w not in stopworddic]
        text = [lemma(w) for w in tokens]
        tokenized_data.append(text)

In [22]:
# Spot-check: show the lemmatized tokens of the first processed article.
print(tokenized_data[0])

['groupe', 'défense', 'vouloir', 'accompagner', 'jeune', 'pousse', 'français', 'haut', 'potentiel', 'ici', 'banque', 'compléter', 'offre', 'travailler', 'insérer', 'écosystème', 'développement', 'entreprise', 'société', 'général', 'mettre', 'quête', 'nouveau', 'pépite', 'banque', 'défense', 'servir', 'déjà', 'start-up', 'dont', 'juger', 'particulièrement', 'prometteuses', 'annoncer', 'mardi', 'souhaiter', 'séduire', 'supplémentaires', 'horizon', 'ensemble', 'territoire', 'jargon', 'banque', 'pépite', 'entreprise', 'déjà', 'réussir', 'première', 'levée', 'fond', 'moins', 'euro', 'accompagner', 'incubateur', 'phare', 'fond', 'capital-risque', 'créer', 'idéalement', 'entrepreneur', 'coup', 'essai', 'pouvoir', 'agir', 'entreprise', 'déjà', 'suivre', 'banque', 'doit', 'encore', 'grandir', 'comme', 'recrutement', 'externe', 'parvenir', 'banque', 'peaufiner', 'arsenal', 'signer', 'lundi', 'prochain', 'partenariat', 'bpifrance', 'vue', 'notamment', 'faire', 'dialoguer', 'conseiller', 'réseau',

In [23]:
from gensim import models,corpora
# Vocabulary: maps every distinct token of the corpus to an integer id.
dictionary = corpora.Dictionary(tokenized_data)

# One bag-of-words vector per document.
corpus = list(map(dictionary.doc2bow, tokenized_data))

# TF-IDF model fitted on the bag-of-words corpus, and the re-weighted corpus view.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [24]:
# Number of topics requested from both models.
NUM_TOPICS = 10
# LDA training is stochastic: pin the seed so that a fresh
# "Restart Kernel -> Run All" reproduces the same topics.
LDA_SEED = 42

lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=LDA_SEED,
)

# NOTE(review): `corpus_tfidf` is computed earlier but never used; LSI is
# commonly fitted on the TF-IDF corpus — confirm that raw counts are intended.
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [25]:
def print_model_topics(title, model, num_words=10):
    """Print the highest-weighted words of every topic of ``model``.

    Args:
        title: Header line printed before the topics.
        model: A fitted topic model exposing ``num_topics`` and
            ``print_topic(topic_id, num_words)`` (as the LDA and LSI models
            trained above do).
        num_words: How many words to show per topic.
    """
    print(title)
    # Iterate over the model's own topic count so this stays in sync with the
    # value used at training time instead of repeating a literal.
    for idx in range(model.num_topics):
        print("Topic #%s:" % idx, model.print_topic(idx, num_words))
    print("=" * 20)


print_model_topics("LDA Model:", lda_model)
print_model_topics("LSI Model:", lsi_model)

LDA Model:
Topic #0: 0.007*"entreprise" + 0.005*"service" + 0.005*"nouveau" + 0.005*"plus" + 0.005*"pouvoir" + 0.004*"client" + 0.004*"france" + 0.004*"permettre" + 0.004*"projet" + 0.003*"donnée"
Topic #1: 0.012*"plus" + 0.007*"pouvoir" + 0.006*"entreprise" + 0.005*"faire" + 0.005*"aussi" + 0.004*"comme" + 0.004*"être" + 0.004*"grand" + 0.004*"start-up" + 0.004*"an"
Topic #2: 0.011*"the" + 0.008*"plus" + 0.007*"of" + 0.006*"and" + 0.005*"to" + 0.005*"luxembourg" + 0.004*"in" + 0.003*"pouvoir" + 0.003*"an" + 0.003*"euro"
Topic #3: 0.009*"projet" + 0.008*"entreprise" + 0.006*"plus" + 0.006*"espace" + 0.006*"pouvoir" + 0.005*"euro" + 0.004*"nouveau" + 0.004*"aussi" + 0.003*"travail" + 0.003*"start-up"
Topic #4: 0.007*"plus" + 0.006*"entreprise" + 0.004*"pouvoir" + 0.003*"https" + 0.003*"start-up" + 0.003*"comme" + 0.003*"e=" + 0.003*"grand" + 0.003*"c=sfymryylsugfxnyao2svzg" + 0.003*"d=dwmgaq"
Topic #5: 0.007*"france" + 0.007*"plus" + 0.006*"entreprise" + 0.005*"pouvoir" + 0.004*"pari" +