In [1]:
### Running LDA analysis on summarized (condensed) text generated from articles.

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(1729)
import nltk
import pandas as pd
from nltk.corpus import stopwords

In [3]:
ORIGINAL_TEXT = ["In fact, Tesla CEO Elon Musk said in April that Autopilot can help reduce accidents by as much as 50%.\n\nBut just like any system, it's not perfect",
 '\n\nHowever, it should be noted that these sensors can be thrown off by things like debris covering them',
 "These sensors help the car understand its environment so that it can safely steer itself in most highway situations.\nThe hardware that makes up Tesla's self-driving system includes a forward radar, a forward-looking camera, a high-precision digitally-controlled electric assist braking system, and 12 long-range ultrasonic sensors placed around the car",
 "\n\nOn Thursday, regulators revealed an investigation into a possible tie between Tesla's Autopilot system and a fatal accident.\n\nWhile few details about the collision have been revealed, Tesla has said that the car was in Autopilot mode when the car crashed.\n\nHere's a closer look at how Autopilot works to help you better understand how it should be used.\nTesla's Autopilot system is made up of multiple sensors placed all around the car",
 '\n\n\nThese ultrasonic sensors are strategically placed around the car so that they can sense 16 feet around the car in every direction, at any speed.\nThe senors enable the vehicle to sense when something is too close and gauge the appropriate distance so that it can do things like safely change lanes',
 'And it requires a human to pay attention at all times',
 '\n\nThe radar enables detection of cars and other moving objects.\nThe forward-facing camera is located on the top windshield',
 "It's been shown time and time again to help people avoid accidents"]

In [53]:
def pre_process_text(ORIGINAL_TEXT):
    """Polishes text"""
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    STOPWORDS.add("like")
    STOPWORDS.add("said")
    STOPWORDS.add("forward")
    STOPWORDS.add("time")
    frp = []
    for i, c in enumerate(ORIGINAL_TEXT):
        reg = c.lower()
        reg = ' '.join(reg)
        reg = ' '.join([word for word in c.split() if word not in STOPWORDS])
        reg = re.sub('[^a-zA-Z]', ' ', reg)
        reg = re.sub(r'\s+', ' ', reg)
        frp.append(reg)
    return frp

In [54]:
POLISHED_TEXT = pre_process_text(ORIGINAL_TEXT)

In [55]:
POLISHED_TEXT

['In fact Tesla CEO Elon Musk April Autopilot help reduce accidents much But system perfect',
 'However noted sensors thrown things debris covering',
 'These sensors help car understand environment safely steer highway situations The hardware makes Tesla s self driving system includes radar forward looking camera high precision digitally controlled electric assist braking system long range ultrasonic sensors placed around car',
 'On Thursday regulators revealed investigation possible tie Tesla s Autopilot system fatal accident While details collision revealed Tesla car Autopilot mode car crashed Here s closer look Autopilot works help better understand used Tesla s Autopilot system made multiple sensors placed around car',
 'These ultrasonic sensors strategically placed around car sense feet around car every direction speed The senors enable vehicle sense something close gauge appropriate distance things safely change lanes',
 'And requires human pay attention times',
 'The radar enabl

In [56]:
#Incorporating stemming instead of lemmatization because of performance and speed.

In [57]:
stemmer = SnowballStemmer('english')

In [58]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

In [59]:
lemmantized = list(pd.Series(POLISHED_TEXT).map(preprocess))

In [60]:
dictionary = gensim.corpora.Dictionary(lemmantized)
dictionary.filter_tokens()

In [61]:
def generate_frequencies(sentence):
    """
    Generates Frequencies and Occurances of words from a sentence:
    1. Parameters : sentence (lemmantized version of the sentence, in list type).
    2. Returns : 
        A. TDM (pandas Series object)
        B. DataFrame (pandas TDM representation)
    """
    occurance = {}
    frequency = {}
    for i, word in enumerate(sentence):
        if word not in frequency.keys():
            frequency[word] = 1
        else:
            frequency[word] += 1
    max_word_frequency = max(frequency.values())
    for word in frequency.keys():
        occurance[word] = frequency[word] / max_word_frequency
    df = pd.DataFrame(data=[list(frequency.keys()), list(frequency.values()), list(occurance.values())]).T
    df.columns = ['Word', 'Occurance', 'Frequency']
    return df, frequency, occurance

In [68]:
def generate_tf_idf(paragraph):
    tf_idf = []
    for i, lem in enumerate(paragraph):
        pd_df, frequency_words, occurance_words = generate_frequencies(lem)
        temp = []
        for occur, freq in zip(frequency_words.values(), occurance_words.values()):
            temp.append(freq * np.log10(len(ORIGINAL_TEXT)/occur))
        tf_idf.append(temp)
    return tf_idf

In [63]:
### TD-IDF generation

In [64]:
tf_idf = generate_tf_idf(lemmantized)

In [70]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
lemmy_BOW = [dictionary.doc2bow(text) for text in lemmantized]

In [72]:
from gensim import corpora, models

tfidf = models.TfidfModel(lemmy_BOW)

In [75]:
corpus_tfidf = tfidf[lemmy_BOW]

In [86]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=3)

In [87]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.025*"autopilot" + 0.021*"revealed" + 0.018*"tesla" + 0.015*"mode" + 0.015*"investigation" + 0.015*"fatal" + 0.015*"regulators" + 0.015*"multiple" + 0.015*"collision" + 0.015*"crashed"
Topic: 1 
Word: 0.021*"avoid" + 0.021*"people" + 0.021*"shown" + 0.020*"attention" + 0.020*"times" + 0.020*"human" + 0.020*"requires" + 0.020*"thrown" + 0.019*"noted" + 0.019*"debris"
Topic: 2 
Word: 0.020*"sense" + 0.018*"april" + 0.018*"fact" + 0.018*"elon" + 0.018*"musk" + 0.018*"reduce" + 0.018*"perfect" + 0.017*"windshield" + 0.017*"moving" + 0.017*"objects"


In [66]:
lda_model = gensim.models.LdaMulticore(lemmy_BOW, minimum_probability=0.2, num_topics=5, id2word=dictionary, passes=3, workers=3)

In [67]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.041*"detection" + 0.040*"facing" + 0.040*"located" + 0.040*"windshield" + 0.040*"objects" + 0.040*"cars" + 0.040*"enables" + 0.040*"moving" + 0.040*"forward" + 0.040*"camera"
Topic: 1 
Words: 0.054*"sensors" + 0.037*"help" + 0.021*"radar" + 0.021*"camera" + 0.021*"forward" + 0.020*"ultrasonic" + 0.020*"safely" + 0.020*"includes" + 0.020*"placed" + 0.020*"driving"
Topic: 2 
Words: 0.092*"autopilot" + 0.070*"tesla" + 0.048*"revealed" + 0.026*"details" + 0.026*"better" + 0.026*"possible" + 0.026*"mode" + 0.026*"look" + 0.026*"placed" + 0.026*"regulators"
Topic: 3 
Words: 0.053*"requires" + 0.053*"attention" + 0.053*"times" + 0.053*"human" + 0.013*"sense" + 0.012*"senors" + 0.012*"change" + 0.012*"direction" + 0.012*"appropriate" + 0.012*"speed"
Topic: 4 
Words: 0.046*"sense" + 0.026*"accidents" + 0.026*"fact" + 0.026*"reduce" + 0.026*"april" + 0.026*"perfect" + 0.026*"elon" + 0.026*"musk" + 0.026*"help" + 0.026*"tesla"
