In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import gensim

In [2]:
df = pd.read_csv('nightoffullmoon-1.csv')

In [3]:
df = df.dropna()

In [4]:
stop = stopwords.words('english')

In [5]:
documents = df[df['star'] != 5]

In [6]:
corpus=[]
for doc in documents.res:
    text=re.sub('[^a-zA-z]',' ', doc)
    text=text.lower()
    text=text.split()
    text=[PorterStemmer().stem(word) for word in text if not word in stop]
    text=' '.join(text)
    corpus.append(text)

In [7]:
documents['clean_content'] = corpus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
processed_docs = documents.clean_content.str.split()

In [9]:
processed_docs = processed_docs.map(lambda x:x[2:])

In [10]:
processed_docs

3      [hi, sorri, lot, login, problem, suffer, netwo...
6      [hello, dear, madi, thank, comment, need, down...
7      [sorri, hear, feel, way, pleas, send, us, scre...
10     [thank, support, us, appreci, much, payment, p...
11     [happi, hear, fun, play, game, pleas, sure, ra...
                             ...                        
391    [hello, team, night, full, moon, apolog, incon...
392    [hi, come, late, restart, new, game, kind, cor...
394    [thank, review, ^_^, wonder, day, best, luck, ...
396    [invinc, boss, tri, varieti, card, go, commun,...
398    [pleas, send, uid, gpa, email, nofmgam, hotmai...
Name: clean_content, Length: 192, dtype: object

In [11]:
dictionary=gensim.corpora.Dictionary(processed_docs)

In [12]:
dictionary.filter_extremes(no_below=5,no_above=0.2,keep_n=1000)

In [13]:
len(dictionary)

100

In [14]:
dictionary[29]

'star'

In [15]:
bow_corpus=[dictionary.doc2bow(doc) for doc in processed_docs]

In [16]:
len(bow_corpus)

192

In [17]:
from gensim import corpora,models
tfidf=models.TfidfModel(bow_corpus)

In [18]:
corpus_tfidf=tfidf[bow_corpus]

In [19]:
corpus_tfidf[100]

[(28, 0.4624784180100146),
 (47, 0.4624784180100146),
 (49, 0.4522227053883559),
 (50, 0.3922190247248328),
 (53, 0.4624784180100146)]

In [20]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=5, 
                                             id2word = dictionary, 
                                             passes = 2, 
                                             workers=4)

In [21]:
lda_model_tfidf.log_perplexity(bow_corpus)

-4.427344848997835

In [22]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.064*"day" + 0.053*"luck" + 0.039*"good" + 0.037*"like" + 0.035*"feedback" + 0.030*"star" + 0.028*"lot" + 0.026*"work" + 0.025*"tri" + 0.024*"hear"


Topic: 1 Word: 0.052*"fun" + 0.044*"comment" + 0.039*"mean" + 0.039*"hi" + 0.038*"version" + 0.036*"love" + 0.036*"payment" + 0.035*"part" + 0.035*"mail" + 0.034*"problem"


Topic: 2 Word: 0.061*"ad" + 0.048*"payment" + 0.047*"commun" + 0.044*"opportun" + 0.044*"unavoid" + 0.042*"unlock" + 0.042*"part" + 0.041*"new" + 0.040*"bring" + 0.040*"activ"


Topic: 3 Word: 0.045*"contact" + 0.042*"hotmail" + 0.042*"nofmgam" + 0.041*"send" + 0.034*"email" + 0.033*"give" + 0.032*"problem" + 0.030*"sorri" + 0.027*"hear" + 0.027*"greet"


Topic: 4 Word: 0.068*"experi" + 0.065*"better" + 0.065*"promis" + 0.055*"rate" + 0.053*"give" + 0.030*"fix" + 0.030*"file" + 0.023*"tri" + 0.020*"obb" + 0.020*"see"




In [25]:
topic=[]
for i in range(len(corpus_tfidf)):
    a = lda_model_tfidf[corpus_tfidf[i]]
    a.sort(key=lambda x: x[1],reverse=True)
    topic.append([l[0] for l in a if l[1]>0.1])

In [26]:
T = []
for t in topic:
    T.extend(t)

In [27]:
for i in set(T):
    print(f'Topic {i} have {T.count(i)} times')

Topic 0 have 62 times
Topic 1 have 69 times
Topic 2 have 93 times
Topic 3 have 83 times
Topic 4 have 79 times


In [28]:
documents['topic'] = topic

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [30]:
documents.to_csv('respones.csv')