In [1]:
### Running LDA analysis on summarized (condensed) text generated from articles.

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(1729)
import nltk
import pandas as pd
from nltk.corpus import stopwords

In [23]:
ORIGINAL_TEXT = ['\nFrench startup BlaBlaCar has announced that the company’s revenue grew by 71 percent in 2019 compared to 2018',
 'Ouibus is now called BlaBlaBus',
 'The big difference between 2019 and 2018 is that BlaBlaCar diversified its activity by offering bus rides as well as bus ticketing in some markets.\nBlaBlaCar is still mostly known for its long-distance ride-sharing marketplace',
 'On the other side of the marketplace, if you plan on driving across the country, you can list your ride on the platform to find passengers so that you don’t have to pay for gas and highway tolls by yourself.\nIn November 2018, the company acquired Ouibus to become a marketplace for road travel, whether it’s by bus or by car',
 'If you’re going from one city to another, you can find a car with an empty seat and book a ride in that car']

In [24]:
def pre_process_text(ORIGINAL_TEXT):
    """Polishes text"""
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    STOPWORDS.add("like")
    STOPWORDS.add("said")
    STOPWORDS.add("forward")
    STOPWORDS.add("time")
    frp = []
    for i, c in enumerate(ORIGINAL_TEXT):
        reg = c.lower()
        reg = ' '.join(reg)
        reg = ' '.join([word for word in c.split() if word not in STOPWORDS])
        reg = re.sub('[^a-zA-Z]', ' ', reg)
        reg = re.sub(r'\s+', ' ', reg)
        frp.append(reg)
    return frp

In [25]:
POLISHED_TEXT = pre_process_text(ORIGINAL_TEXT)

In [26]:
POLISHED_TEXT

['French startup BlaBlaCar announced company s revenue grew percent compared ',
 'Ouibus called BlaBlaBus',
 'The big difference BlaBlaCar diversified activity offering bus rides well bus ticketing markets BlaBlaCar still mostly known long distance ride sharing marketplace',
 'On side marketplace plan driving across country list ride platform find passengers don t pay gas highway tolls yourself In November company acquired Ouibus become marketplace road travel whether it s bus car',
 'If you re going one city another find car empty seat book ride car']

In [27]:
#Incorporating stemming instead of lemmatization because of performance and speed.

In [28]:
stemmer = SnowballStemmer('english')

In [29]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

In [30]:
lemmantized = list(pd.Series(POLISHED_TEXT).map(preprocess))

In [31]:
dictionary = gensim.corpora.Dictionary(lemmantized)
dictionary.filter_tokens()

In [32]:
def generate_frequencies(sentence):
    """
    Generates Frequencies and Occurances of words from a sentence:
    1. Parameters : sentence (lemmantized version of the sentence, in list type).
    2. Returns : 
        A. TDM (pandas Series object)
        B. DataFrame (pandas TDM representation)
    """
    occurance = {}
    frequency = {}
    for i, word in enumerate(sentence):
        if word not in frequency.keys():
            frequency[word] = 1
        else:
            frequency[word] += 1
    max_word_frequency = max(frequency.values())
    for word in frequency.keys():
        occurance[word] = frequency[word] / max_word_frequency
    df = pd.DataFrame(data=[list(frequency.keys()), list(frequency.values()), list(occurance.values())]).T
    df.columns = ['Word', 'Occurance', 'Frequency']
    return df, frequency, occurance

In [33]:
def generate_tf_idf(paragraph):
    tf_idf = []
    for i, lem in enumerate(paragraph):
        pd_df, frequency_words, occurance_words = generate_frequencies(lem)
        temp = []
        for occur, freq in zip(frequency_words.values(), occurance_words.values()):
            temp.append(freq * np.log10(len(ORIGINAL_TEXT)/occur))
        tf_idf.append(temp)
    return tf_idf

In [34]:
### TD-IDF generation

In [35]:
tf_idf = generate_tf_idf(lemmantized)

In [36]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
lemmy_BOW = [dictionary.doc2bow(text) for text in lemmantized]

In [37]:
from gensim import corpora, models

tfidf = models.TfidfModel(lemmy_BOW)

In [38]:
corpus_tfidf = tfidf[lemmy_BOW]

In [39]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=3)

In [40]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.035*"blablacar" + 0.034*"rides" + 0.034*"ticketing" + 0.034*"activity" + 0.034*"known" + 0.033*"distance" + 0.033*"markets" + 0.033*"offering" + 0.033*"sharing" + 0.033*"diversified"
Topic: 1 
Word: 0.036*"marketplace" + 0.033*"country" + 0.033*"highway" + 0.033*"platform" + 0.033*"list" + 0.033*"acquired" + 0.033*"november" + 0.032*"plan" + 0.032*"travel" + 0.032*"driving"
Topic: 2 
Word: 0.047*"called" + 0.047*"blablabus" + 0.040*"city" + 0.040*"book" + 0.040*"going" + 0.040*"seat" + 0.034*"startup" + 0.034*"revenue" + 0.034*"percent" + 0.034*"compared"


In [41]:
lda_model = gensim.models.LdaMulticore(lemmy_BOW, minimum_probability=0.2, num_topics=5, id2word=dictionary, passes=3, workers=3)

In [42]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.095*"blablacar" + 0.052*"ride" + 0.052*"diversified" + 0.052*"offering" + 0.052*"markets" + 0.052*"difference" + 0.052*"long" + 0.052*"sharing" + 0.052*"known" + 0.052*"distance"
Topic: 1 
Words: 0.087*"marketplace" + 0.048*"ouibus" + 0.048*"ride" + 0.048*"company" + 0.048*"tolls" + 0.048*"november" + 0.048*"highway" + 0.048*"passengers" + 0.048*"acquired" + 0.048*"road"
Topic: 2 
Words: 0.091*"book" + 0.091*"ride" + 0.091*"seat" + 0.091*"city" + 0.091*"going" + 0.015*"blablabus" + 0.015*"ouibus" + 0.015*"called" + 0.015*"marketplace" + 0.015*"blablacar"
Topic: 3 
Words: 0.024*"ride" + 0.024*"called" + 0.024*"blablabus" + 0.024*"ouibus" + 0.024*"blablacar" + 0.024*"going" + 0.024*"city" + 0.024*"company" + 0.024*"seat" + 0.024*"marketplace"
Topic: 4 
Words: 0.059*"startup" + 0.059*"announced" + 0.059*"revenue" + 0.059*"french" + 0.059*"grew" + 0.059*"compared" + 0.059*"company" + 0.059*"percent" + 0.059*"blablabus" + 0.059*"called"
