#### Here, the objective is to find various topics the documents in the corpus belong to. 
#### Each document must belong to atleast one topic. The process that is used here for topic modeling is called Latent Dirichlet Allocation (LDA). 

#### To do LDA, 2 parameters need to be specified by us
#### 1. A document-term matrix
#### 2. no. of topics we would like for the algorithm to pick

In [8]:
# Reading in the document-term matrix 

import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,ability,abject,able,...,yummy,yyou,ze,zealand,zeppelin,zero,zillion,zombie,zone,zoo
ali,0,0,0,0,0,0,1,0,0,2,...,0,1,0,0,0,0,0,1,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
bill,1,0,0,0,0,0,1,0,0,1,...,1,0,1,0,0,1,1,2,1,0
bo,0,1,1,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
jim,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
joe,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
louis,0,0,0,0,0,3,0,0,0,1,...,0,0,0,0,0,2,0,0,0,0


In [11]:
from gensim import matutils, models
import scipy.sparse

tdm = data.transpose()
tdm.head()

Unnamed: 0,ali,anthony,bill,bo,dave,hasan,jim,joe,john,louis,mike,ricky
aaaaah,0,0,1,0,0,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,0,0,0,1,0,0,0,0,0,0,0


In [12]:
# Creating a sparse matrix from the popular term-document matrix format

sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [13]:
# Loading the dictionary of all the terms and their respective location in the term-document matrix

cv = pickle.load(open("cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In the next section, we specify two other parameters - the no. of topics and the no. of passes

In [14]:
lda = models.LdaModel(corpus = corpus, id2word = id2word, num_topics = 3, passes = 10)
lda.print_topics()

[(0,
  '0.010*"yeah" + 0.007*"shit" + 0.007*"day" + 0.007*"love" + 0.006*"little" + 0.006*"woman" + 0.006*"kid" + 0.006*"life" + 0.005*"old" + 0.005*"cause"'),
 (1,
  '0.009*"shit" + 0.006*"kid" + 0.006*"man" + 0.006*"dad" + 0.005*"walk" + 0.005*"tell" + 0.005*"woman" + 0.005*"really" + 0.005*"life" + 0.005*"day"'),
 (2,
  '0.008*"joke" + 0.007*"tell" + 0.006*"cause" + 0.006*"yeah" + 0.006*"day" + 0.005*"really" + 0.005*"mean" + 0.005*"jenny" + 0.005*"point" + 0.005*"friend"')]

In [16]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.009*"shit" + 0.007*"yeah" + 0.006*"cause" + 0.006*"day" + 0.006*"woman" + 0.006*"man" + 0.005*"really" + 0.005*"kid" + 0.005*"tell" + 0.005*"love"'),
 (1,
  '0.009*"joke" + 0.008*"dad" + 0.007*"yeah" + 0.007*"day" + 0.007*"love" + 0.006*"year" + 0.006*"tell" + 0.006*"kid" + 0.005*"little" + 0.004*"life"'),
 (2,
  '0.011*"life" + 0.009*"tit" + 0.008*"cause" + 0.007*"shit" + 0.007*"old" + 0.007*"kid" + 0.006*"happen" + 0.006*"woman" + 0.006*"bad" + 0.006*"murder"'),
 (3,
  '0.001*"love" + 0.001*"day" + 0.001*"yeah" + 0.001*"shit" + 0.000*"joke" + 0.000*"kid" + 0.000*"little" + 0.000*"let" + 0.000*"tell" + 0.000*"way"')]

The topics don't look very good. We may need to tweak our parameters and the terms list as well.

### Topic modeling using nouns only

In [17]:
from nltk import word_tokenize, pos_tag

def nouns(text):
    is_noun = lambda pos : pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return ' '.join(all_nouns)

In [18]:
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

Unnamed: 0,transcripts
ali,lady gentleman please welcome stage ali wong h...
anthony,thank thank thank san francisco thank much goo...
bill,right thank thank much thank thank thank go th...
bo,bo old macdonald farm e e farm pig e e snort o...
dave,dave tell dirty joke living stare hard work ha...
hasan,’ davis home bring back netflix say want speci...
jim,lady gentleman please welcome stage mr jim jef...
joe,lady gentleman welcome joe rogan fuck go san f...
john,right petunia wish luck die august pretty good...
louis,introfade music let roll hold light light than...


In [20]:
data_nouns = pd.DataFrame(data_clean.transcripts.apply(nouns))
data_nouns

Unnamed: 0,transcripts
ali,please stage ali hi thank hello hello gon caus...
anthony,thank thank francisco thank people tell em gon...
bill,thank thank thank thank pleasure georgia area ...
bo,bo macdonald farm e farm pig e e snort macdona...
dave,tell dirty joke work train fire transforms lev...
hasan,davis home bring york ” son davis california y...
jim,please stage mr jim jefferies sit sit sit sit ...
joe,gentleman joe rogan thanks god damn phone fuck...
john,petunia luck die right hello hello chicago see...
louis,music let roll thank thank appreciate apprecia...


In [23]:
#Creating a new document-term matrix using only the nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']

new_stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)


cvn = CountVectorizer(stop_words = new_stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcripts)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns = cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,aaaaah,aaah,aah,abc,ability,abortion,abroad,abstain,abuse,accent,...,youth,youtube,yulin,yummy,yyou,zealand,zeppelin,zillion,zombie,zoo
ali,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
anthony,0,0,0,0,0,2,0,0,0,1,...,0,0,0,0,0,2,0,0,0,0
bill,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,2,0
bo,0,0,0,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
dave,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
jim,0,0,0,0,0,0,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0
joe,0,0,0,0,0,0,0,0,1,2,...,0,2,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
louis,0,0,1,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [24]:
# Creating the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Creating the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [25]:
# Doing two topics first
ldan = models.LdaModel(corpus = corpusn, num_topics = 2, id2word = id2wordn, passes = 10)
ldan.print_topics()

[(0,
  '0.012*"thing" + 0.010*"gon" + 0.009*"day" + 0.008*"man" + 0.007*"year" + 0.007*"fuck" + 0.007*"way" + 0.007*"woman" + 0.007*"cause" + 0.006*"life"'),
 (1,
  '0.014*"thing" + 0.012*"guy" + 0.011*"gon" + 0.010*"day" + 0.009*"life" + 0.009*"woman" + 0.008*"cause" + 0.008*"look" + 0.007*"fuck" + 0.007*"man"')]

In [26]:
# Doing three topics first
ldan = models.LdaModel(corpus = corpusn, num_topics = 3, id2word = id2wordn, passes = 10)
ldan.print_topics()

[(0,
  '0.015*"gon" + 0.014*"thing" + 0.013*"guy" + 0.012*"cause" + 0.011*"life" + 0.008*"kind" + 0.008*"day" + 0.007*"way" + 0.007*"woman" + 0.007*"look"'),
 (1,
  '0.016*"thing" + 0.013*"day" + 0.009*"fuck" + 0.008*"year" + 0.008*"guy" + 0.007*"look" + 0.007*"dad" + 0.007*"life" + 0.007*"way" + 0.007*"kid"'),
 (2,
  '0.013*"gon" + 0.013*"man" + 0.011*"woman" + 0.009*"guy" + 0.009*"thing" + 0.009*"fuck" + 0.008*"lot" + 0.007*"shit" + 0.006*"day" + 0.006*"way"')]

In [27]:
# Doing four topics first
ldan = models.LdaModel(corpus = corpusn, num_topics = 4, id2word = id2wordn, passes = 10)
ldan.print_topics()

[(0,
  '0.011*"man" + 0.010*"fuck" + 0.009*"bo" + 0.009*"repeat" + 0.008*"thing" + 0.007*"eye" + 0.007*"world" + 0.007*"gon" + 0.007*"stuff" + 0.006*"house"'),
 (1,
  '0.011*"day" + 0.011*"dad" + 0.009*"guy" + 0.008*"thing" + 0.008*"school" + 0.008*"life" + 0.007*"lot" + 0.007*"year" + 0.007*"mom" + 0.007*"joke"'),
 (2,
  '0.017*"gon" + 0.015*"thing" + 0.013*"guy" + 0.011*"life" + 0.010*"woman" + 0.010*"cause" + 0.010*"man" + 0.009*"look" + 0.009*"kid" + 0.009*"day"'),
 (3,
  '0.017*"thing" + 0.013*"day" + 0.010*"cause" + 0.009*"woman" + 0.009*"fuck" + 0.008*"year" + 0.007*"guy" + 0.007*"gon" + 0.007*"way" + 0.006*"look"')]

#### Looking at the model above, we can idenitfy 4 different topics
#### Topic 01 - 'world'ly stuff
#### Topic 02 - 'Mom' and 'Dad' things
#### Topic 03 - 'Man' and 'woman' thing
#### Topic 04 - All swearing