In [1]:
import json
import numpy as np
from pprint import pprint
import requests
import nltk.data
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

In [2]:
class JSONObject:
    """Attribute-style wrapper around a decoded JSON object.

    Used as the ``object_hook`` when decoding, so that every JSON object
    becomes an instance whose keys are readable as attributes
    (``item.key`` instead of ``item["key"]``).
    """

    def __init__(self, d):
        """Adopt the mapping ``d`` as this instance's attribute namespace.

        The mapping is used directly (not copied), so the instance and ``d``
        share the same underlying dict.
        """
        self.__dict__ = d

# Load the article records. The encoding is pinned to UTF-8 so that accented
# French characters decode identically on every platform; without it, open()
# falls back to the locale's default encoding.
# Later cells index and iterate `obj` and read `.body_fr` on each element, so
# the file is expected to hold a JSON array of objects -- TODO confirm schema.
with open("articles_Accuracy.json", "r", encoding="utf-8") as read_file:
    data = read_file.read()
    # Every JSON object is turned into a JSONObject for attribute-style access.
    obj = json.loads(data, object_hook=JSONObject)

In [3]:
from nltk.corpus import stopwords

# Start from NLTK's French stop-word list ...
stopworddic = set(stopwords.words('french'))

# ... and extend it with hand-picked function words, grouped by role.
# NOTE(review): 'notres' / 'votres' are not standard French forms (the plural
# possessives are 'nos' / 'vos'); kept unchanged here -- confirm intent.
article = [
    'a', 'au', 'aux', 'un', 'une', 'le', 'la', 'les', 'de', 'des',
    'ce', 'cet', 'cette', 'ces', 'son', 'sa', 'ses', 'leur', 'leurs',
    'mon', 'ma', 'mes', 'ton', 'ta', 'tes',
    'notre', 'notres', 'votre', 'votres',
]
pronom = [
    'je', 'tu', 'il', 'elle', 'nous', 'vous', 'ils', 'elles',
    'on', 'y', 'en', 'se',
]
coordination = ['mais', 'ou', 'et', 'donc', 'or', 'ni', 'car']
# Some forms of the verb "aller".
v = ['aller', 'vais', 'vas', 'va', 'allez', 'allions', 'vont']

# set.update accepts several iterables at once; duplicates are absorbed.
stopworddic.update(article, pronom, coordination, v)

In [4]:
lemmatizer = FrenchLefffLemmatizer()


def lemma(word):
    """Lemmatize ``word``, preferring a noun reading, then a verb reading.

    All analyses of ``word`` are requested from the lemmatizer (``'all'``),
    and the second item of each result is treated as its part-of-speech tag:

    * if the tag ``'nc'`` is present, the word is lemmatized with pos ``'n'``;
    * otherwise, if ``'v'`` is present, it is lemmatized with pos ``'v'``;
    * otherwise the lemmatizer's default is used.

    NOTE(review): assumes the ``'all'`` query yields indexable (lemma, tag)
    entries and that ``'nc'`` is the common-noun tag -- confirm against the
    FrenchLefffLemmatizer documentation.
    """
    analyses = lemmatizer.lemmatize(word, 'all')
    pos_tags = [entry[1] for entry in analyses]

    if 'nc' in pos_tags:
        preferred_pos = 'n'
    elif 'v' in pos_tags:
        preferred_pos = 'v'
    else:
        # Neither a noun nor a verb reading: use the lemmatizer's default.
        return lemmatizer.lemmatize(word)

    return lemmatizer.lemmatize(word, preferred_pos)

In [5]:
import re

# Cleaning steps applied to the first article only.
rawtokens = nltk.word_tokenize(obj[0].body_fr)

# Keep tokens that contain at least one (possibly accented) letter, lower-cased.
metokens = [w.lower() for w in rawtokens if re.search('[a-zA-ZàâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+',w)]

# Strip every "<word characters>'" prefix, e.g. "l'offre" -> "offre".
# re.sub leaves a token untouched when nothing matches, so no prior search is
# needed. A comprehension is used so that no loop variable leaks into the
# notebook namespace (the previous loop variable was named `str`, which
# shadowed the builtin for every later cell).
# NOTE(review): the pattern also truncates words with an internal apostrophe
# ("aujourd'hui" -> "hui") and can yield empty strings -- confirm acceptable.
latokens = [re.sub(r'\w+\'', '', token) for token in metokens]

# Drop stop words.
tokens = [w for w in latokens if w not in stopworddic]

In [6]:
# Extra words to discard (a mix of French and English terms).
worduselessdic = {
    'plus', 'the', 'of', 'aussi', 'être', 'faire',
    'comme', 'pouvoir', 'and', 'to', 'tout', 'in',
}

In [7]:
# Build one cleaned, lemmatized token list per article.
tokenized_data = []
for doc in obj:
    # Skip articles whose French body has 2 characters or fewer.
    if len(doc.body_fr) > 2:
        text = nltk.word_tokenize(doc.body_fr)
        # Keep tokens containing at least one (possibly accented) letter, lower-cased.
        text = [w.lower() for w in text if re.search('[a-zA-ZàâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+',w)]
        # Strip every "<word characters>'" prefix, e.g. "l'offre" -> "offre".
        # re.sub is a no-op when nothing matches, and the comprehension keeps
        # the loop variable local (the previous loop variable was named `str`,
        # which shadowed the builtin for every later cell).
        latokens = [re.sub(r'\w+\'', '', token) for token in text]
        # Remove stop words, lemmatize, then remove the extra noise words.
        tokens = [w for w in latokens if w not in stopworddic]
        text = [lemma(w) for w in tokens]
        text = [w for w in text if w not in worduselessdic]
        tokenized_data.append(text)

In [8]:
# Spot-check: show the token list built for the first kept article.
first_doc_tokens = tokenized_data[0]
print(first_doc_tokens)

['groupe', 'défense', 'vouloir', 'accompagner', 'jeune', 'pousse', 'français', 'haut', 'potentiel', 'ici', 'banque', 'compléter', 'offre', 'travailler', 'insérer', 'écosystème', 'développement', 'entreprise', 'société', 'général', 'mettre', 'quête', 'nouveau', 'pépite', 'banque', 'défense', 'servir', 'déjà', 'start-up', 'dont', 'juger', 'particulièrement', 'prometteuses', 'annoncer', 'mardi', 'souhaiter', 'séduire', 'supplémentaires', 'horizon', 'ensemble', 'territoire', 'jargon', 'banque', 'pépite', 'entreprise', 'déjà', 'réussir', 'première', 'levée', 'fond', 'moins', 'euro', 'accompagner', 'incubateur', 'phare', 'fond', 'capital-risque', 'créer', 'idéalement', 'entrepreneur', 'coup', 'essai', 'agir', 'entreprise', 'déjà', 'suivre', 'banque', 'doit', 'encore', 'grandir', 'recrutement', 'externe', 'parvenir', 'banque', 'peaufiner', 'arsenal', 'signer', 'lundi', 'prochain', 'partenariat', 'bpifrance', 'vue', 'notamment', 'dialoguer', 'conseiller', 'réseau', 'bancaire', 'chargé', 'affai

In [9]:
from gensim import corpora, models

# Vocabulary built from all tokenized documents.
dictionary = corpora.Dictionary(tokenized_data)

# Bag-of-words representation: one doc2bow vector per document.
corpus = list(map(dictionary.doc2bow, tokenized_data))

# TF-IDF model fitted on the bag-of-words corpus, and the re-weighted corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]



In [10]:
# Train both topic models with 10 topics on the bag-of-words corpus.
# LDA training is randomized; a fixed random_state makes the topics
# reproducible across kernel restarts.
lda_model = models.LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)
# NOTE(review): LSI is fitted on raw counts (`corpus`), not on the TF-IDF
# weighted corpus -- confirm this is intended.
lsi_model = models.LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [11]:
# For each model, print its 10 topics, each described by its 10 top-weighted
# words, followed by a separator line.
for title, model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(title)
    for idx in range(10):
        print("Topic #%s:" % idx, model.print_topic(idx, 10))
    print("=" * 20)

LDA Model:
Topic #0: 0.006*"entreprise" + 0.004*"nouveau" + 0.004*"projet" + 0.004*"deux" + 0.004*"espace" + 0.003*"an" + 0.003*"euro" + 0.003*"site" + 0.003*"grand" + 0.003*"bâtiment"
Topic #1: 0.006*"entreprise" + 0.005*"projet" + 0.004*"nouveau" + 0.004*"euro" + 0.003*"commune" + 0.003*"ville" + 0.003*"an" + 0.003*"dernier" + 0.003*"pari" + 0.003*"développement"
Topic #2: 0.005*"entreprise" + 0.005*"start-up" + 0.004*"nouveau" + 0.004*"an" + 0.004*"groupe" + 0.004*"service" + 0.004*"projet" + 0.004*"pari" + 0.003*"permettre" + 0.003*"client"
Topic #3: 0.006*"entreprise" + 0.005*"an" + 0.004*"france" + 0.004*"projet" + 0.004*"école" + 0.004*"jeune" + 0.004*"fait" + 0.003*"start-up" + 0.003*"nouveau" + 0.003*"deux"
Topic #4: 0.006*"luxembourg" + 0.005*"entreprise" + 0.003*"heure" + 0.003*"contact" + 0.003*"européen" + 0.003*"ministre" + 0.003*"is" + 0.003*"for" + 0.003*"nouveau" + 0.003*"ainsi"
Topic #5: 0.009*"entreprise" + 0.007*"heure" + 0.007*"bordeaux" + 0.004*"an" + 0.004*"nouve