In [5]:
%run ../paths.py

In [6]:
import pandas as pd
import numpy as np
import os
import re
import operator
import gc
import time
import matplotlib.pyplot as plt
import warnings
import gensim
import numpy as np
warnings.filterwarnings('ignore') 
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from nltk.stem.wordnet import WordNetLemmatizer 
import pyLDAvis.gensim
%matplotlib inline
from sklearn import model_selection
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

In [7]:
train = pd.read_csv(CONST().CLEANED_TRAIN)
test = pd.read_csv(CONST().CLEANED_TEST)

In [8]:
train.shape

(42084, 17)

In [9]:


def preprocess(comment):
    """
    Function to build tokenized texts from input comment
    """
    return gensim.utils.simple_preprocess(comment, deacc=True, min_len=3)




In [10]:

#tokenize the comments
train_text=train.tweet.apply(lambda x: preprocess(x))
test_text=test.tweet.apply(lambda x: preprocess(x))





In [14]:

print("Before preprocessing:",train.tweet.iloc[6])
print("After preprocessing:",train.tweet.iloc[6])


Before preprocessing: hey someone else suppos lifestyle also happy fathers day
After preprocessing: hey someone else suppos lifestyle also happy fathers day


In [15]:
#Phrases help us group together bigrams :  new + york --> new_york
bigram = gensim.models.Phrases(train_text)

In [18]:
bigram[train_text.iloc[6]]

['hey', 'someone_else', 'suppos', 'lifestyle', 'also', 'happy_fathers', 'day']

In [19]:
lem = WordNetLemmatizer()

In [20]:
def clean(word_list):
    """
    Function to clean the pre-processed word lists 
    
    Following transformations will be done
    1) Stop words removal from the nltk stopword list
    2) Bigram collation (Finding common bigrams and grouping them together using gensim.models.phrases)
    3) Lemmatization (Converting word to its root form : babies --> baby ; children --> child)
    """
    #remove stop words
    
    clean_words = [w.lower() for w in word_list if not w.lower() in ["will","it","they","face","oh","be","get","make","not","get"]]
    #collect bigrams
    clean_words = bigram[clean_words]
    #Lemmatize
    clean_words=[lem.lemmatize(word, "v") for word in clean_words]
    return(clean_words)

In [21]:
train_text=train_text.apply(lambda x:clean(x))
test_text=test_text.apply(lambda x:clean(x))

In [22]:
#create the dictionary
dictionary = Dictionary(train_text)
print("There are",len(dictionary),"number of words in the final dictionary")


There are 23495 number of words in the final dictionary


In [23]:
corpus = [dictionary.doc2bow(text) for text in train_text]


In [24]:
print(dictionary.doc2bow(train_text.iloc[6]))
print("Wordlist from the sentence:",train_text.iloc[6])

[(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]
Wordlist from the sentence: ['hey', 'someone_else', 'suppos', 'lifestyle', 'also', 'happy_fathers', 'day']


In [28]:
# from gensim.models import HdpModel
# hdp = HdpModel(corpus, dictionary)

In [29]:
#by default, it prints only top 20
#len(hdp.print_topics())

In [25]:
ldamodel = LdaModel(corpus=corpus, num_topics=4,id2word=dictionary)

In [26]:
pyLDAvis.enable_notebook()

In [30]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

### Chart Description:

The Left side shows the multi-dimensional "word-space" superimposed on two "Principal components" and the relative positions of all the topics.

The size of the circle represents what % of the corpus it contains.

The right side shows the word frequencies within the topic and in the whole corpus.


In [31]:
topic_probability_matix = ldamodel[corpus]

In [39]:
ldatopics = ldamodel.show_topics(formatted=False)


In [40]:
ldatopics

[(0,
  [('bitch', 0.082302615),
   ('hoe', 0.026507454),
   ('fuck', 0.01801156),
   ('get', 0.01658544),
   ('pussy', 0.013648728),
   ('with_tears', 0.010657492),
   ('shit', 0.010516213),
   ('joy', 0.010228508),
   ('ass', 0.0100500425),
   ('be', 0.008664905)]),
 (1,
  [('be', 0.011822061),
   ('see', 0.010923931),
   ('more', 0.010624486),
   ('people', 0.008379327),
   ('time', 0.007838259),
   ('happy', 0.0073283627),
   ('thank', 0.006442674),
   ('do', 0.0059652887),
   ('weekend', 0.005822564),
   ('bihday', 0.005457715)]),
 (2,
  [('be', 0.007274078),
   ('do', 0.0068750903),
   ('can', 0.0068457606),
   ('find', 0.0067425123),
   ('make', 0.0061839353),
   ('bitch', 0.0056717424),
   ('kill', 0.0053531094),
   ('would', 0.0051336256),
   ('want', 0.005114406),
   ('think', 0.005085493)]),
 (3,
  [('love', 0.011630794),
   ('live', 0.010026311),
   ('bitch', 0.008872385),
   ('his', 0.008368212),
   ('father', 0.0073687574),
   ('glad', 0.006702144),
   ('her', 0.005653687)

change the number of topics using HDP

In [34]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
 
NUM_TOPICS = 2

# Converting the document to a matrix of token counts

vectorizer = TfidfVectorizer(min_df=4, max_df=0.55, 
                             stop_words=["will","it","they","face","oh","be","get","them","make","not","get","these","tears","laughing","with","was","got","out","joy"], lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(train.tweet)
 
# Build a Latent Semantic Indexing Model using SVD

lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)


  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


(42084, 2)


In [35]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('bitch', 0.9184027062580317), ('fuck', 0.14046270840183833), ('ass', 0.12476286402668048), ('nigga', 0.08893666743092896), ('bitches', 0.07835458626791146), ('loud', 0.06747326945085945), ('shit', 0.06693793879385741), ('pussy', 0.06238390746857703), ('hoes', 0.05943073486460264), ('hoe', 0.05577086465018751)]
Topic 1:
[('bitches', 0.4688247309196383), ('hoes', 0.3760585819469094), ('happy', 0.3613365599110327), ('love', 0.2443414050696006), ('pussy', 0.22202063218413776), ('hoe', 0.18105922066956978), ('niggas', 0.10434186163794452), ('fathers', 0.10259585385096257), ('good', 0.1014454127462097), ('ass', 0.09186454891844746)]
