### インポート

In [20]:
 #-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import gensim
from pprint import pprint
from sklearn.cluster import KMeans

### 事前付与ジャンルのリストを作成

In [2]:
pre_genre = ["csAI", "csCC", "csCG", "csCL", "csCV", "csDS", "csGT", "csMA", "csSD", "statAP", "statCO", "statME", "statML", "statTH"]

### CSV読み込み

In [3]:
df = pd.DataFrame()

for genre in pre_genre:
    df = pd.concat([df, pd.read_csv("./csv/" +  genre + ".csv", header=None)])

### データフレームにカラム名をつける

In [4]:
df.columns = ["プログラム実行日時", "論文更新日時", "論文リンク", "PDFリンク", "論文タイトル", "サマリ", "著者", "事前付与ジャンル"]

In [7]:
summaries = list(df["サマリ"])
titles = list(df["論文タイトル"])

In [9]:
texts = [[word for word in summary.lower().split()]for summary in summaries]
title_texts = [[word for word in title.lower().split()]for title in titles]

### 辞書作成

In [22]:
dictionary = gensim.corpora.Dictionary(texts)
title_dictionary = gensim.corpora.Dictionary(title_texts)

In [30]:
pprint(dictionary)

<gensim.corpora.dictionary.Dictionary object at 0x11fbb0c50>


### 辞書のID数を確認

In [32]:
id_num = len(dictionary.token2id)
pprint(dictionary.token2id)

{'"a': 7668,
 '"activation': 18536,
 '"atomic"': 4313,
 '"bad"': 11776,
 '"bayesian': 13941,
 '"beginning': 8000,
 '"best"': 5515,
 '"bike': 9330,
 '"block-matching"': 12025,
 '"borrowing': 17817,
 '"cabbage".': 15441,
 '"checkmate"': 17365,
 '"chosen"': 18540,
 '"classic"': 12307,
 '"close': 19080,
 '"complement"': 3067,
 '"conj"': 8368,
 '"continuation"': 8577,
 '"coordinate-wise"': 18496,
 '"coordination"': 12593,
 '"countless': 15830,
 '"countless"': 15858,
 '"crowdsourcing"': 12501,
 '"cryptography"': 5535,
 '"curious"': 13511,
 '"curse': 17042,
 '"data-driven"': 18876,
 '"density': 11117,
 '"design"': 12649,
 '"detector"': 2796,
 '"direct': 15389,
 '"easy-hard"': 3434,
 '"embarrassingly': 17238,
 '"emergent': 14027,
 '"entropy': 9827,
 '"epidemic"': 13850,
 '"evaporation"': 13640,
 '"faster"': 11059,
 '"flags"': 13264,
 '"fluid"': 18259,
 '"from': 10438,
 '"generalise': 18584,
 '"greedily"': 7122,
 '"ground': 5125,
 '"half-structure".': 6455,
 '"hand-crafted"': 14988,
 '"how': 16

### Bag of words のコーパスを作成

In [35]:
corpus = [dictionary.doc2bow(text) for text in texts]


In [36]:
tfidf = gensim.models.TfidfModel(corpus)

In [37]:
print (tfidf)

TfidfModel(num_docs=1400, num_nnz=139048)


In [38]:
corpus_tfidf = tfidf[corpus]

### ベクトル化

In [85]:
num_topics = 6
lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics) # initialize an LSI transformation
pprint(lsi.print_topics(10))
print(lsi.num_topics)
corpus_lsi = lsi[corpus_tfidf]
print(len(corpus_lsi))
# for doc in corpus_lsi:
#     print(doc)

[(0,
  '0.111*"model" + 0.110*"data" + 0.096*"algorithm" + 0.096*"models" + '
  '0.094*"problem" + 0.093*"method" + 0.088*"learning" + 0.087*"our" + '
  '0.084*"as" + 0.083*"network"'),
 (1,
  '-0.209*"neural" + -0.140*"deep" + -0.130*"network" + -0.129*"networks" + '
  '0.124*"problem" + 0.115*"graph" + 0.114*"algorithm" + -0.113*"learning" + '
  '-0.109*"training" + -0.107*"features"'),
 (2,
  '-0.164*"regression" + -0.142*"estimators" + -0.133*"data" + 0.128*"graph" + '
  '-0.121*"confidence" + 0.111*"graphs" + -0.107*"bayesian" + '
  '-0.107*"statistical" + -0.106*"estimation" + 0.105*"quantum"'),
 (3,
  '0.308*"quantum" + 0.291*"game" + 0.219*"games" + 0.192*"social" + '
  '0.179*"agents" + 0.148*"dilemmas" + 0.125*"classical" + -0.118*"graph" + '
  '-0.093*"change" + 0.092*"equilibrium"'),
 (4,
  '-0.445*"change" + -0.281*"objective" + -0.216*"point" + -0.215*"points" + '
  '-0.159*"priori." + -0.159*"exercise." + -0.156*"approach" + '
  '-0.154*"losses." + -0.149*"perspectives."

In [92]:
vecs = gensim.matutils.corpus2dense(corpus_lsi, num_terms=1400) #スパースなリスト形式を密にしてsklearnに適用可能な形に
pprint(vecs)

array([[ 0.18540935,  0.14934096,  0.11813298, ...,  0.1065331 ,
         0.09871507,  0.13355429],
       [-0.15525313, -0.06850662,  0.04236405, ...,  0.04914359,
         0.0393586 ,  0.07006208],
       [-0.0117418 ,  0.05349614,  0.0191966 , ..., -0.09445018,
        -0.0046116 , -0.09995169],
       [-0.06237468,  0.00895852,  0.08137032, ..., -0.01929925,
         0.01268705,  0.00228565],
       [ 0.08618374,  0.01348129, -0.04920201, ...,  0.03107605,
         0.03666452,  0.01865902],
       [ 0.03002089, -0.01751991,  0.03086916, ..., -0.0150204 ,
         0.00456004,  0.07816514]], dtype=float32)


### k-means クラスタリング

In [87]:
num_cls = 6
clusters = KMeans(n_clusters=num_cls, random_state=0).fit_predict(vecs)
print(len(clusters))
print(len(summaries))

1400
1400


In [89]:
cls_list = [0] * num_cls

for doc, cls in zip(summaries, clusters):
    print (cls)
    cls_list[cls] += 1
    
print(cls_list)

2
1
3
5
4
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
