# BDM550 - Final Project

- Name: Ran Arino
- Student ID: 153073200
- Email: rarino@myseneca.ca
- Course: Predictive Analytics
- Course ID: BDM550NAA.05359.2237
- Professor: Dr. Elnaz Delpisheh

In [138]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

from joblib import dump

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.lda_model
pyLDAvis.enable_notebook()


### Part 1

In [2]:
# Disabled cell: the preprocessing below is wrapped in a string literal, so
# none of it executes. As written it would read articles1.csv..articles3.csv,
# run clean_texts over each 'content' column (tokenize, skip tokens found in
# string.punctuation and English stop words, lower-case and Porter-stem the
# rest, join the stems with single spaces) and assemble `df` with
# title/date/content columns.
# NOTE(review): clean_texts always returns strings, so the dropna on
# 'content' after it cannot drop anything; confirm whether missing articles
# were meant to be filtered before cleaning.
# NOTE(review): nothing in this cell writes 'df.csv'; presumably the frame
# was saved in a separate step - confirm.
"""
# load the dataset
a1 = pd.read_csv('articles1.csv')
a2 = pd.read_csv('articles2.csv')
a3 = pd.read_csv('articles3.csv')

# Define function to load and clean text data
#  this is similar function to the one that I used in Workshop05
def clean_texts(raw_texts):
    # define result
    result = []

    # set of stopwords
    stop_words = set(stopwords.words('english'))
    # set porter stemmers
    porter = nltk.PorterStemmer()

    # traversing all sentences
    for sent in raw_texts:
        # tokenize
        tokens = word_tokenize(sent)
        # defined cleaned sentence
        clean_sent = ''
        # cleaning each sentence
        for w in tokens:
            # if 'w' is one of punctuations, skip to the next word
            if w in string.punctuation:
                continue
            # if 'w' is one of stop words, skip to the next word
            if w.lower() in stop_words:
                continue
            # add stemmed word to clean_sent
            clean_sent += porter.stem(w.lower()) + ' '

        # add clean_sent to result (make sure that the last item is always blank)
        result += [clean_sent[:-1]]

    return result

# cleaning the text & put everything into one variable
texts = []

# traversing all files
for df_sub in [a1, a2, a3]:
    texts += clean_texts(df_sub['content'])
    
# store the cleaned data as new df
titles = list(a1['title']) + list(a2['title']) + list(a3['title'])
dates = list(a1['date']) + list(a2['date']) + list(a3['date'])

df = pd.DataFrame({'title': titles, 'date': dates, 'content': texts})
df = df.dropna(subset=['content'])
df.head()
"""

Unnamed: 0,title,date,content
0,House Republicans Fret About Winning Their Hea...,2016-12-31,washington — congression republican new fear c...
1,Rift Between Officers and Residents as Killing...,2017-06-19,bullet shell get count blood dri votiv candl b...
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",2017-01-06,walt disney ’ “ bambi ” open 1942 critic prais...
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",2017-04-10,death may great equal ’ necessarili evenhand f...
4,Kim Jong-un Says North Korea Is Preparing to T...,2017-01-02,seoul south korea — north korea ’ leader kim s...


In [147]:
# Shortcut: load the already-cleaned articles from 'df.csv' instead of
# re-running the preprocessing.
df = pd.read_csv('df.csv')
# read_csv parses empty fields as NaN, and a NaN in 'content' makes the
# CountVectorizer step raise. Drop such rows and renumber the index so that
# row i of df still corresponds to row i of the document-term matrix.
df = df.dropna(subset=['content']).reset_index(drop=True)
len(df)

142535

: 

In [16]:
# Turn the cleaned articles into a bag-of-words document-term matrix.
vect = CountVectorizer()
corpus = df['content']
vect_texts = vect.fit_transform(corpus)

# Sanity check: matrix dimensions, then the first five rows in dense form.
matrix_shape = vect_texts.shape
print(matrix_shape)
preview_rows = vect_texts[:5]
print(preview_rows.toarray())

(142535, 260539)
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [17]:
# Train a 20-topic LDA model on the document-term matrix.
# random_state pins the random initialisation so that topic numbering (and
# the files written from it) is the same on every run.
lda = LatentDirichletAllocation(n_components=20, random_state=0)
# fit_transform fits the model and returns the topic distribution of every
# training document.
results = lda.fit_transform(vect_texts)

# topic-word weights: one row per topic, one column per vocabulary term
topic_word_dist = lda.components_
# document-topic distribution (the name is historical; later cells use it).
# Reuse the fit_transform output rather than calling lda.transform on the
# same matrix again, which would repeat the whole inference pass.
doc_word_dist = results

In [139]:
# Persist the fitted LDA model and the fitted vectorizer with joblib so they
# can be reloaded without retraining.
model_path, vectorizer_path = 'lda_model.joblib', 'vectorizer.joblib'
dump(lda, model_path)
dump(vect, vectorizer_path)

['vectorizer.joblib']

In [62]:
# Export the topic-word weights as space-separated text.
with open('topic_word_dist.txt', 'w', encoding='utf-8') as out_file:
    # header row: the vocabulary terms, in column order
    vocabulary = list(vect.get_feature_names_out())
    out_file.write(" ".join(vocabulary) + "\n")
    # one row per topic: that topic's weight for every vocabulary term
    for topic_weights in topic_word_dist:
        out_file.write(" ".join(map(str, topic_weights)) + "\n")

In [127]:
# Export the per-document topic distribution as space-separated text.
first_doc = doc_word_dist[0]
topic_num = len(first_doc)
with open('doc_word_dist.txt', 'w', encoding='utf-8') as out_file:
    # header row: topic labels topic1 .. topicN
    topic_labels = [f'topic{n}' for n in range(1, topic_num+1)]
    out_file.write(" ".join(topic_labels) + "\n")
    # one row per document: its value for each topic
    for doc_topics in doc_word_dist:
        out_file.write(" ".join(map(str, doc_topics)) + "\n")

In [37]:
# Build the pyLDAvis panel from the fitted model, the document-term matrix
# and the vectorizer; keeping it as the cell's last expression displays it.
lda_panel = pyLDAvis.lda_model.prepare(lda, vect_texts, vect)
lda_panel