In [1]:
# from nltk.tokenize import TweetTokenizer
# tknzr = TweetTokenizer()
# s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
# tknzr.tokenize(s0)
# ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

In [2]:
# basic
import pandas as pd
import numpy as np 
import json 
import glob 

In [3]:
# gensim 
# Topic modeling core library 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
# Spacy 
# mostly for tokenizing 
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction import text

In [5]:
# pyLDAvis
# LDA visualizations
import pyLDAvis
import pyLDAvis.gensim_models

import warnings

# supress warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

In [6]:
#https://www.youtube.com/watch?v=TKjjlp5_r7o

# Read in Data 

____

Read in dataframe

In [7]:
posts_df = pd.read_csv('./data/all_posts.csv')

Select just the text.

In [8]:
posts = posts_df['post']

# Stop Words and Lemmatizing

___

Spacy has pwerful pretrained pipelines that we will utilize for tokenizing our texts, and accessing word feature like parts of speech. We nee to load in the model below.

In [9]:
# !python -m spacy download en_core_web_sm

## Stopwords

We will revisit this section iterratively in order to remove words that cloud the decriptive nature of our topic model output, and those which confuse the model on disparate topics.

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/t0ad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
stopwords = ['can','will','so','now','see','white','brown']

In [12]:
stopwords.append(text.ENGLISH_STOP_WORDS)

In [13]:
# using spacy pre-trained pipelines for tokenization
nlp = spacy.load('en_core_web_sm', diasble = ['parser','ner']) 

## Lemma_

In [14]:
def lemmatization(texts, allowed_postages=['NOUN','ADJ','VERB','ADV']):
    
    '''
    allowed_postage : the parts of speach we want to keep [DEFAULT: 'NOUN','ADJ','VERB','ADV'] 
    '''
    
    # load in spacy sm web model 
    nlp = spacy.load('en_core_web_sm', diasble = ['parser','ner']) # computaltionally expensize aspects 
    texts_out = [] # output
    
    # for each post in the corpus
    # iterate over texts
    for text in texts:  
        # creates spacy doc object containing vectorized contextual information like Parts of Speech (pos) 
        doc = nlp(text)
        # list for holding lemmatized tokens
        new_text = []
        # iterate over each token
        for token in doc:
            # only keep the desired pos
            if token.pos_ in allowed_postages:
                if str(token) not in stopwords:
                    # reducing model complexity by reducing tokens to lemma_ 
                    new_text.append(token.lemma_)   
                    # print(token.lemma_)

        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)

In [15]:
lemmatized_posts = lemmatization(posts)

## Tokens

In [16]:
# in order to prepare the data for LDA topic models
# Get work tokens from lemmatized text 
# lower case and remove accents 

In [17]:
def gen_words(texts):
    final = [] 
    for text in texts:
        # Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
        new = gensim.utils.simple_preprocess(text , deacc=True)# – Remove accent marks from tokens using 
        final.append(new)
    return(final)

In [18]:
data_words = gen_words(lemmatized_posts)

# Bigrams &  Trigrams 

___

We attempt to capture some of the more important word pairings with bigrams and trigrams. 

In [19]:
# https://www.youtube.com/watch?v=UEn3xHNBXJU
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#9createbigramandtrigrammodels

In [20]:
def make_bigrams(texts):
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram[bigram[doc]] for doc in texts]

In [22]:
# BIGRAMS AND TRIGRAMS

# Build the bigram and trigram models
bigram_model = gensim.models.Phrases(data_words, min_count = 3, threshold = 50 )# min freq for a coupling to be a bigram ## thresh = num of bigrams allowes
# of the bigrams, are is their overlap in the rest of our words for a trigram?
trigram_model = gensim.models.Phrases(bigram_model[data_words], threshold = 50 )

# Faster way to get a sentence clubbed as a trigram/bigram

# fit bigram model 
bigram = gensim.models.phrases.Phraser(bigram_model)
trigram = gensim.models.phrases.Phraser(trigram_model)

# instantia
data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print(data_bigrams_trigrams)

# Tf-IDF REMOVAL

___

We arn't going to want to move forward with this kind of word removal until we have done more manual inspection and stop word removal. The brevity of our texts may demand that we keep as many words as possible, and lossing frequently occurring words seminal to the focus of the study .i.e homelessness would prevent us from identifying a posts relevancy overall at this time. for example. we still do not currently know if all of our posts pertain to homelessness, or if they include many discussions of cats up tree's. Untill we have done more cleaning, we will want our topic model to to isolate irrelavant posts, removing "homeless" from all of the posts would make it rather difficult to do this. So it is a step better saved for latter.

In [23]:
from gensim.models import TfidfModel

# create word dictionary 
id2word = corpora.Dictionary(data_bigrams_trigrams)
# just to make it simpler going forward 
texts = data_bigrams_trigrams
# convert all of our texts into a bag of words
corpus = [id2word.doc2bow(text) for text in texts]

print ( corpus[0][0:20])

# instantiate tfidf model 
tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []
words_missing_in_tfidf = []


for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] 
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value] 
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # the words with tf-idf score will be missing
    
    new_bow= [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow
    
corpus

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1)]


[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 2),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1)],
 [(39, 1),
  (40, 1),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 2),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1)],
 [(20, 2),
  (23, 1),
  (29, 2),
  (41, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 5),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 3),
  (71, 1),
  (72, 2),
  (73, 2),
  (74, 1),
  (75, 1)],
 [(4, 1),
  (12, 1),
  (20, 1),
  (29, 1),
  (50, 1),
  (52, 1),
  (58, 1),
  (76, 2),
  (77, 2),
  (78, 1),
  (79, 

# Latent Derichlet Allocation 

___

In [24]:
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                            id2word= id2word, 
                                            num_topics = 6,
                                            random_state = 100,
                                            update_every = 1,
                                            chunksize = 250,
                                            alpha = 'auto')

## Visualizing the data 

In [25]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds='mmds', R= 15)

In [26]:
vis

In [27]:
lda_model.topics_

AttributeError: 'LdaModel' object has no attribute 'topics_'