https://towardsdatascience.com/the-complete-guide-for-topics-extraction-in-python-a6aaa6cedbbc


In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import pickle

from konlpy.tag import Kkma

from gensim import corpora, models
import pyLDAvis.gensim



# 1. Tokenize and Embed

In [2]:
df = pd.read_csv('whole_profiles.csv', usecols=['genre','keywords','plot'])

In [3]:
lines = df['plot'].tolist()
len(lines)

6504

In [10]:
from threading import Thread
import jpype


lines = df['plot'].tolist()

kkma = Kkma()

def do_concurrent_tagging (start, end, lines, result):
    jpype.attachThreadToJVM()
    l = [ [i]+kkma.pos(lines[i]) for i in range(start, end)]
    result.append(l)
    return


if __name__ == "__main__":
    import time
    import datetime as dt
    nlines = len(lines)
    print('Number of document:  ',nlines)
    t = time.clock()
    
    print('Concurrent tagging: ')
    result = []
    t1 = Thread(target=do_concurrent_tagging, args = (0,int(nlines/2), lines, result))
    t2 = Thread(target=do_concurrent_tagging, args = (int(nlines/2), nlines, lines, result))
    t1.start() ; t2.start()
    t1.join() ; t2.join()
    
    result = sum(result, [])
        
    print(dt.timedelta(seconds=int(time.clock()-t)))
    
print(len(result), ' documents has chopped into words.')


Number of document:   6504
Concurrent tagging: 
0:05:39
6504  documents has chopped into words.


In [168]:
df['keywords'].loc[0].split(',')+df['genre'].loc[0].split(',')

['계모', '권선징악', '무성영화', '복수', '연쇄극', '키노드라마', 'Kino Drama', '드라마', '활극', '연쇄극']

In [400]:
sortresult = [' '] * 6504

for doc in result:
    sortresult[doc[0]] = doc[1:]
    
print('Documents are splitted into ',len(sortresult),'.')

# nonstops = ['NNG', 'XR','VV','VA']
nonstops = ['NNG']
print(nonstops,' will be included.')

lists_of_words = []
for doc in sortresult:
    lists_of_words.append([bag[0] for bag in doc if bag[1] in nonstops and len(bag[0]) !=1])
print(len(lists_of_words),' documents have been become lists of words.')

keygenr_list = []
for idx in range(len(df)):
    keygenr = df['keywords'].loc[idx].split(',') +df['genre'].loc[idx].split(',')
    lists_of_words[idx] += keygenr
    keygenr_list += keygenr

keygenr_list = [word.strip() for word in keygenr_list ]
keygenr_list = list(set(keygenr_list))
print('"genre" and "keywords" are added.')

Documents are splitted into  6504 .
['NNG']  will be included.
6504  documents have been become lists of words.
"genre" and "keywords" are added.


In [334]:
len(lists_of_words[0][0]) == 1

False

# 2. Embeding 

In [411]:
lda_dict = corpora.Dictionary(lists_of_words)
lda_dict.filter_extremes(no_below = 15, no_above = 0.1, keep_tokens=keygenr)

In [412]:
corpus = [lda_dict.doc2bow(lists) for lists in lists_of_words]

In [413]:
lda_dict.num_docs

6504

In [414]:
print(lda_dict)

Dictionary(2631 unique tokens: ['가문', '계모', '고통', '권선징악', '권유']...)


# 3. Modeling

###  test data

In [229]:
whole = pd.read_csv('whole_profiles.csv', usecols=['movieId','movieSeq'])
vod = pickle.load(open('codes.plk','rb'))

In [228]:
len(vod)

348

In [279]:
i, s = [Id[0] for Id in vod], [Id[1] for Id in vod]

In [286]:
vod_df = pd.DataFrame(data = {'movieId':i, 'movieSeq':s})

In [292]:
whole.reset_index(drop=False, inplace=True)

In [298]:
test = pd.merge(whole, vod_df, how='inner', )['index'].tolist()


In [301]:
train = [idx for idx in range(len(whole)) if idx not in test]

In [302]:
len(train), len(test)

(6166, 338)

In [434]:
X_test = [corpus[idx] for idx in test]
len(X_test)

338

### train data & validation data

In [305]:
validat = np.random.choice(len(train)-1, size = round((len(train)/4)))
train = [idx for idx in range(len(train)) if idx not in validat]

In [306]:
len(validat),len(train)

(1542, 4817)

In [416]:
validat_set = [lists_of_words[idx] for idx in validat]
train_set = [lists_of_words[idx] for idx in train]
len(validat_set),len(train_set)

(1542, 4817)

In [425]:
X_validat = [corpus[idx] for idx in validat]
X_train = [corpus[idx] for idx in train]
len(X_validat),len(X_train)

(1542, 4817)

## 3.1 base model with 30 topics

In [436]:
topics_n = 30

%time lda_model_30 = models.LdaModel(X_train, num_topics= topics_n, id2word=lda_dict,\
                                 passes=15, random_state=51)


Wall time: 24.1 s


In [439]:
topics = lda_model_30.print_topics(num_words=10, num_topics=30)
for topic in topics:
    print(topic)
    print('---------------------------------------------------')
    
print(lda_model_30.log_perplexity(X_train))

(0, '0.040*"공장" + 0.034*"숙은" + 0.022*"노동자" + 0.020*"가게" + 0.019*"동진" + 0.019*"광고" + 0.019*"여름" + 0.017*"태수" + 0.017*"목사" + 0.015*"구원"')
---------------------------------------------------
(1, '0.108*"사장" + 0.026*"사진" + 0.016*"회사" + 0.015*"회장" + 0.015*"제자" + 0.014*"야구" + 0.014*"단편" + 0.012*"주인" + 0.011*"한편" + 0.011*"소매치기"')
---------------------------------------------------
(2, '0.063*"고향" + 0.029*"서울" + 0.023*"병원" + 0.022*"마을" + 0.015*"영자" + 0.014*"시골" + 0.014*"가난" + 0.013*"통속" + 0.013*"주사" + 0.012*"청년"')
---------------------------------------------------
(3, '0.035*"동생" + 0.027*"유진" + 0.025*"영아" + 0.019*"시어머니" + 0.017*"스타" + 0.016*"수지" + 0.015*"상호" + 0.015*"시집" + 0.013*"만석" + 0.013*"행복"')
---------------------------------------------------
(4, '0.058*"전쟁" + 0.051*"소녀" + 0.025*"반공/분단" + 0.019*"부대" + 0.017*"한국전쟁" + 0.017*"625" + 0.016*"북한" + 0.015*"북한군" + 0.014*"작전" + 0.014*"6.25"')
---------------------------------------------------
(5, '0.024*"여행" + 0.018*"수술" + 0.016*"임신" + 0.013*"

In [438]:
lda_display = pyLDAvis.gensim.prepare(lda_model_30, X_train, lda_dict, sort_topics=False)
pyLDAvis.display(lda_display)

In [440]:
lda_display = pyLDAvis.gensim.prepare(lda_model_30, X_validat, lda_dict, sort_topics=False)
pyLDAvis.display(lda_display)