In [1]:
### Running LDA analysis on summarized (condensed) text generated from articles.

In [2]:
import re

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(1729)
import nltk
import pandas as pd
from nltk.corpus import stopwords

In [3]:
# Sample input: a list of text excerpts (about BlaBlaCar) that is passed to
# pre_process_text() in a later cell.
ORIGINAL_TEXT = ['\nFrench startup BlaBlaCar has announced that the company’s revenue grew by 71 percent in 2019 compared to 2018',
 'Ouibus is now called BlaBlaBus',
 'The big difference between 2019 and 2018 is that BlaBlaCar diversified its activity by offering bus rides as well as bus ticketing in some markets.\nBlaBlaCar is still mostly known for its long-distance ride-sharing marketplace',
 'On the other side of the marketplace, if you plan on driving across the country, you can list your ride on the platform to find passengers so that you don’t have to pay for gas and highway tolls by yourself.\nIn November 2018, the company acquired Ouibus to become a marketplace for road travel, whether it’s by bus or by car',
 'If you’re going from one city to another, you can find a car with an empty seat and book a ride in that car']

In [4]:
def pre_process_text(ORIGINAL_TEXT, stop_words=None):
    """Lower-case each text, drop stop words, and keep only letters.

    Parameters
    ----------
    ORIGINAL_TEXT : iterable of str
        Raw text excerpts to clean.
    stop_words : iterable of str, optional
        Words to remove (matched case-insensitively). When omitted, NLTK's
        English stop-word list is used, extended with "-", "like", "said",
        "forward" and "time".

    Returns
    -------
    list of str
        One cleaned string per input text: lower-cased, stop words removed,
        every non-letter character replaced by a space, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace stripped.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        stop_words.update(("-", "like", "said", "forward", "time"))
    else:
        stop_words = {word.lower() for word in stop_words}

    polished = []
    for text in ORIGINAL_TEXT:
        # Filter on the lower-cased words so capitalised stop words
        # ("The", "It", ...) are removed as well.
        kept = [word for word in text.lower().split() if word not in stop_words]
        cleaned = re.sub('[^a-zA-Z]', ' ', ' '.join(kept))
        # Punctuation and digits became spaces above; collapse and trim them.
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        polished.append(cleaned)
    return polished

In [5]:
# Clean every excerpt in ORIGINAL_TEXT with pre_process_text().
POLISHED_TEXT = pre_process_text(ORIGINAL_TEXT)

In [6]:
# Display the cleaned text.
# NOTE(review): the saved output under this cell is about a different article
# than ORIGINAL_TEXT above, so it looks stale — re-run the notebook from a
# fresh kernel to confirm.
POLISHED_TEXT

['With experience designing previous apps extensive help APEX Entrepreneurship Center Wani decided develop app HitchHiqe aid students search long distance carpooling rides Blacksburg',
 'Wani spent around hours working HitchHiqe said There lot challenges you re building something idea white board sometimes feels doesn t work',
 'Wani first came contact APEX kickstarter event struck conversation presenter possibilities virtual reality',
 ' The algorithms include screening drivers anyone driver HitchHiqe unless registered vehicle Wani',
 'Production app development began May ',
 'It Wani started path app development eventually guided toward recent app HitchHiqe',
 'The app saves students around minutes process',
 'The response MIT overwhelmingly positive',
 ' In order register HitchHiqe need edu account as well There also safety concerns Facebook group creation app hopes address We actually got email one moderators VT parent s group Facebook isn t secure kids would endorse on our platfor

In [7]:
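# Stemming is incorporated (rather than relying on lemmatization alone) because of performance and speed.
# NOTE(review): lemmatize_stemming() below applies both lemmatization and stemming, and preprocess() does not call it — confirm the intended normalization.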

In [8]:
# Shared English Snowball stemmer, used by lemmatize_stemming().
stemmer = SnowballStemmer('english')

In [9]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the result.

    The stemming step uses the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and keep only the informative tokens.

    The text is tokenized with gensim's ``simple_preprocess``; a token is kept
    when it is longer than three characters and is not in gensim's STOPWORDS.

    Returns the kept tokens as a list, in their original order.
    """
    # NOTE(review): tokens are returned unchanged — lemmatize_stemming() is
    # never applied here. Confirm whether that is intentional.
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]

In [10]:
# Tokenize every cleaned excerpt; the result is one token list per excerpt.
lemmantized = [preprocess(polished) for polished in POLISHED_TEXT]

In [11]:
# Build the gensim Dictionary from the tokenized documents.
dictionary = gensim.corpora.Dictionary(lemmantized)
# NOTE(review): filter_tokens() is called without bad_ids/good_ids, so it
# presumably removes nothing — confirm whether filter_extremes(...) was intended.
dictionary.filter_tokens()

In [12]:
def generate_frequencies(sentence):
    """Count the words of one tokenized sentence and normalise the counts.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of a single sentence/document. May be empty.

    Returns
    -------
    df : pandas.DataFrame
        One row per distinct word (in first-seen order) with columns
        'Word', 'Occurance' (raw count) and 'Frequency' (count divided by
        the highest count in the sentence).
    frequency : dict
        Maps each word to its raw count (the 'Occurance' column).
    occurance : dict
        Maps each word to its count divided by the highest count (the
        'Frequency' column).

    For an empty sentence the DataFrame has the three columns and no rows,
    and both dicts are empty.
    """
    frequency = {}
    for word in sentence:
        frequency[word] = frequency.get(word, 0) + 1

    # default=1 keeps max() from raising on an empty sentence; the value is
    # never used in that case because there is nothing to divide.
    max_word_frequency = max(frequency.values(), default=1)
    occurance = {word: count / max_word_frequency for word, count in frequency.items()}

    # Building from a column mapping keeps numeric dtypes and still yields the
    # expected columns when there are no rows.
    df = pd.DataFrame({
        'Word': list(frequency.keys()),
        'Occurance': list(frequency.values()),
        'Frequency': list(occurance.values()),
    })
    return df, frequency, occurance

In [13]:
def generate_tf_idf(paragraph):
    """Compute TF-IDF scores for every tokenized sentence in ``paragraph``.

    Parameters
    ----------
    paragraph : iterable of iterable of str
        The corpus: one token sequence per sentence/document.

    Returns
    -------
    list of list of float
        One inner list per sentence, holding a score for each distinct word
        of that sentence in first-seen order. A sentence without tokens
        yields an empty list.

    Notes
    -----
    * Term frequency is the word's count divided by the highest count in the
      same sentence.
    * Inverse document frequency is ``log10(N / df)``, where ``N`` is the
      number of sentences in ``paragraph`` and ``df`` is the number of
      sentences that contain the word.
    * Words are counted inline; only the counts are needed here, so no
      per-sentence DataFrame is built.
    """
    documents = [list(tokens) for tokens in paragraph]
    n_documents = len(documents)

    # Document frequency: in how many sentences does each word appear?
    document_frequency = {}
    for tokens in documents:
        for word in set(tokens):
            document_frequency[word] = document_frequency.get(word, 0) + 1

    tf_idf = []
    for tokens in documents:
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
        if not counts:
            # Nothing to score; keep the output aligned with the input.
            tf_idf.append([])
            continue
        max_count = max(counts.values())
        tf_idf.append([
            (count / max_count) * np.log10(n_documents / document_frequency[word])
            for word, count in counts.items()
        ])
    return tf_idf

In [14]:
### TF-IDF generation

In [15]:
# Hand-rolled TF-IDF scores for the tokenized documents (one list per document).
tf_idf = generate_tf_idf(lemmantized)

In [16]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Build a Dictionary from gensim's bundled sample texts.
# NOTE(review): common_dictionary is not referenced by any other cell visible
# here — confirm it is still needed.
common_dictionary = Dictionary(common_texts)
# Convert each tokenized document to its bag-of-words form using `dictionary`.
lemmy_BOW = [dictionary.doc2bow(text) for text in lemmantized]

In [17]:
from gensim import corpora, models

# Fit gensim's TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(lemmy_BOW)

In [18]:
# Apply the fitted TF-IDF model to the bag-of-words corpus.
corpus_tfidf = tfidf[lemmy_BOW]

In [19]:
# Train a 3-topic LDA model on the TF-IDF-weighted corpus.
# random_state is pinned (same value as the NumPy seed set in the import cell)
# so that re-running this cell on its own does not depend on how much of the
# global NumPy RNG earlier cells have already consumed.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=3, random_state=1729)

In [20]:
# Print every topic of the TF-IDF-based model (-1 requests all topics).
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.013*"minutes" + 0.013*"saves" + 0.012*"built" + 0.012*"weeks" + 0.012*"students" + 0.011*"semester" + 0.011*"pitching" + 0.011*"hackathon" + 0.011*"hitchhiqe" + 0.011*"version"
Topic: 1 
Word: 0.014*"began" + 0.013*"overwhelmingly" + 0.013*"people" + 0.013*"response" + 0.012*"spent" + 0.012*"touch" + 0.012*"talked" + 0.012*"production" + 0.012*"helpful" + 0.012*"development"
Topic: 2 
Word: 0.013*"gurrapu" + 0.012*"drivers" + 0.012*"safety" + 0.011*"facebook" + 0.011*"funding" + 0.011*"attribute" + 0.011*"success" + 0.011*"marketing" + 0.010*"registered" + 0.010*"algorithms"


In [21]:
# Train a 5-topic LDA model directly on the bag-of-words corpus.
# random_state is pinned (same value as the NumPy seed set in the import cell)
# so that re-running this cell on its own does not depend on how much of the
# global NumPy RNG earlier cells have already consumed.
lda_model = gensim.models.LdaMulticore(lemmy_BOW, minimum_probability=0.2, num_topics=5, id2word=dictionary, passes=3, workers=3, random_state=1729)

In [22]:
# Print every topic of the bag-of-words-based model (-1 requests all topics).
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.031*"tech" + 0.031*"virginia" + 0.031*"process" + 0.030*"wani" + 0.017*"success" + 0.017*"funding" + 0.017*"attribute" + 0.017*"marketing" + 0.017*"center" + 0.017*"help"
Topic: 1 
Words: 0.073*"wani" + 0.039*"hitchhiqe" + 0.022*"people" + 0.021*"development" + 0.021*"path" + 0.021*"recent" + 0.020*"started" + 0.020*"idea" + 0.020*"weeks" + 0.020*"semester"
Topic: 2 
Words: 0.044*"wani" + 0.044*"hitchhiqe" + 0.023*"work" + 0.022*"apex" + 0.013*"carpooling" + 0.013*"search" + 0.013*"spent" + 0.013*"help" + 0.013*"idea" + 0.013*"center"
Topic: 3 
Words: 0.039*"safety" + 0.038*"facebook" + 0.027*"positive" + 0.027*"precautions" + 0.026*"group" + 0.015*"response" + 0.015*"overwhelmingly" + 0.015*"wani" + 0.015*"drivers" + 0.015*"ensure"
Topic: 4 
Words: 0.046*"facebook" + 0.033*"gurrapu" + 0.032*"students" + 0.018*"production" + 0.018*"received" + 0.018*"develop" + 0.018*"expressed" + 0.018*"notification" + 0.018*"helping" + 0.018*"early"
