In [1]:
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd


In [2]:
data = pd.read_csv('reddit-data/train_sw.csv')

In [3]:
data.head()

Unnamed: 0,body,Unnamed: 1,Unnamed: 2
0,New wiki on how to avoid accidentally encourag...,,
1,I'm sorry did you say suicide fetishists? As i...,,
2,"I didn’t necessarily see this addressed, perha...",,
3,"I read so much about what NOT to do, what not ...",,
4,"For the longest time, I thought this was a sub...",,


In [4]:
to_drop = ['Unnamed: 1', 'Unnamed: 2']
data_f = data.drop(to_drop, axis = 'columns')

In [5]:
data_f.head()

Unnamed: 0,body
0,New wiki on how to avoid accidentally encourag...
1,I'm sorry did you say suicide fetishists? As i...
2,"I didn’t necessarily see this addressed, perha..."
3,"I read so much about what NOT to do, what not ..."
4,"For the longest time, I thought this was a sub..."


In [6]:
# data

# Input 

In [7]:
text = 'Most hotlines are at least partially staffed by trained volunteers, so check out agencies in your area.'

In [8]:
pred = {'body': text}
data = data_f.append(pred, ignore_index = True);

In [9]:
data

Unnamed: 0,body
0,New wiki on how to avoid accidentally encourag...
1,I'm sorry did you say suicide fetishists? As i...
2,"I didn’t necessarily see this addressed, perha..."
3,"I read so much about what NOT to do, what not ..."
4,"For the longest time, I thought this was a sub..."
...,...
4790,"Many toolbars include spell check, like Yahoo ..."
4791,@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux Fa...
4792,How about Felix? He is sure turning into one h...
4793,"You're all upset, defending this hipster band...."


In [10]:
data.head()

Unnamed: 0,body
0,New wiki on how to avoid accidentally encourag...
1,I'm sorry did you say suicide fetishists? As i...
2,"I didn’t necessarily see this addressed, perha..."
3,"I read so much about what NOT to do, what not ..."
4,"For the longest time, I thought this was a sub..."


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
data.shape

(4795, 1)

In [13]:
cv = CountVectorizer(max_df=0.9,min_df=2,stop_words='english')


In [14]:
dtm = cv.fit_transform(data['body'])

In [15]:
dtm

<4795x6855 sparse matrix of type '<class 'numpy.int64'>'
	with 59886 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.decomposition import LatentDirichletAllocation

In [17]:
LDA = LatentDirichletAllocation(n_components=3,random_state=32)

In [18]:
LDA.fit(dtm)

LatentDirichletAllocation(n_components=3, random_state=32)

In [19]:
len(cv.get_feature_names())

6855

In [20]:
type(cv.get_feature_names())

list

In [21]:
import random
random_word_id = random.randint(0,1610)
cv.get_feature_names()[random_word_id]

'circumstances'

In [22]:
len(LDA.components_)

3

In [23]:
type(LDA.components_)

numpy.ndarray

In [24]:
LDA.components_.shape

(3, 6855)

In [25]:
single_topic = LDA.components_[0]

In [26]:
single_topic.argsort()

array([5385, 3818, 6830, ..., 4373, 6774, 6684], dtype=int32)

In [27]:
import numpy as np

In [28]:
single_topic.argsort()[-10:]

array([3553, 6674, 6042, 3260, 1770, 3418, 3187, 4373, 6774, 6684],
      dtype=int32)

In [29]:
top_ten_words = single_topic.argsort()[-20:]

In [30]:
for index in top_ten_words:
    print(cv.get_feature_names()[index])

going
really
say
want
feel
time
good
right
way
xe2
make
x80
think
know
don
like
just
people
xc2
xa0


In [31]:
for i,topic in enumerate(LDA.components_):
    print(f"top 30 words for topic #{i}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-30:]])
    print('\n')
    print('\n')
    

top 30 words for topic #0
['ll', 've', 'point', 'need', 'did', 'does', 'obama', 'years', 'post', 'better', 'going', 'really', 'say', 'want', 'feel', 'time', 'good', 'right', 'way', 'xe2', 'make', 'x80', 'think', 'know', 'don', 'like', 'just', 'people', 'xc2', 'xa0']




top 30 words for topic #1
['yes', 'comment', 'good', 'suck', 'fucking', 'want', 'oh', 'love', 'stupid', 'best', 'try', 'life', 'really', 'people', 'got', 'lol', 'dont', 'look', 'little', 'bitch', 'think', 'know', 'need', 'fuck', 'just', 'dumb', 'time', 'don', 'ass', 'like']




top 30 words for topic #2
['little', 'pathetic', 'going', 'people', 'want', 'got', 'im', 'fans', 'way', 'right', 'life', 'white', 'bit', 'really', 'say', 'trade', 'day', 'shar', 'es', 'man', 'think', 'shit', 'know', 'idiot', 'fucking', 'don', 'fuck', 'http', 'like', 'just']






In [32]:
topic_results = LDA.transform(dtm)

In [33]:
topic_results.shape

(4795, 3)

In [34]:
data['Topic'] = topic_results.argmax(axis=1)

In [35]:
data

Unnamed: 0,body,Topic
0,New wiki on how to avoid accidentally encourag...,0
1,I'm sorry did you say suicide fetishists? As i...,0
2,"I didn’t necessarily see this addressed, perha...",0
3,"I read so much about what NOT to do, what not ...",0
4,"For the longest time, I thought this was a sub...",0
...,...,...
4790,"Many toolbars include spell check, like Yahoo ...",0
4791,@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux Fa...,0
4792,How about Felix? He is sure turning into one h...,2
4793,"You're all upset, defending this hipster band....",2


In [36]:
len(data)

4795

# Output

In [37]:
def predict(val):
    if val == 0:
        print('Suicidal')
    elif val == 1:
        print('Abusive/Threatening')
    else:
        print('Completely Normal')

In [38]:
val = data['Topic'][len(data)-1]
predict(val)

Completely Normal
