### インポート

In [1]:
 #-*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import gensim
from sklearn.cluster import KMeans

### 事前付与ジャンルのリストを作成

In [22]:
pre_genre = ["csAI", "csCC", "csCG", "csCL", "csCV", "csDS", "csGT", "csMA", "csSD", "statAP", "statCO", "statME", "statML", "statTH"]

### CSV読み込み

In [46]:
df = pd.DataFrame()

for genre in pre_genre:
    df = pd.concat([df, pd.read_csv("./csv/" +  genre + ".csv", header=None)])

### データフレームにカラム名をつける

In [47]:
df.columns = ["プログラム実行日時", "論文更新日時", "論文リンク", "PDFリンク", "論文タイトル", "サマリ", "著者", "事前付与ジャンル"]

In [48]:
documents = list(df["サマリ"]) #リスト化

In [49]:
texts = [[word for word in document.lower().split()]for document in documents] #単語ごとに抽出

### 辞書作成

In [50]:
dictionary = gensim.corpora.Dictionary(texts)

In [64]:
print (dictionary)

Dictionary(19441 unique tokens: ['deep-layered', 'models', 'trained', 'on', 'a']...)


### 辞書のID数を確認

In [52]:
id_num = len(dictionary.token2id)

### Bag of words のコーパスを作成

In [53]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [54]:
tfidf = gensim.models.TfidfModel(corpus)

In [55]:
print (tfidf)

TfidfModel(num_docs=1400, num_nnz=139048)


In [56]:
corpus_tfidf = tfidf[corpus]

### ベクトル化

In [57]:
vecs = gensim.matutils.corpus2dense(corpus_tfidf, num_terms=id_num) #スパースなリスト形式を密にしてsklearnに適用可能な形に

### k-means クラスタリング

In [69]:
num_cls = 8
clusters = KMeans(n_clusters=num_cls, random_state=0).fit_predict(vecs)

In [77]:
cls_list = [0] * num_cls

for doc, cls in zip(documents, clusters):
    print (cls)
    cls_list[cls] += 1
    
    

7
9
6
9
2
1
9
2
7
1
7
2
6
9
6
9
9
2
2
2
9
6
7
2
7
2
6
7
6
6
6
9
7
6
6
9
5
9
6
9
2
9
6
2
7
1
5
9
2
7
1
7
2
9
2
7
7
8
7
6
9
9
2
7
7
9
9
2
7
7
7
5
6
7
9
7
9
9
1
9
2
9
7
9
9
9
9
7
9
7
6
6
9
9
9
9
1
7
9
6
6
7
9
2
9
9
2
6
2
7
7
6
7
5
7
1
2
9
2
6
6
2
6
7
6
7
5
7
2
2
5
6
2
7
7
6
9
1
2
7
1
2
7
2
2
3
9
2
2
7
7
7
9
9
2
7
9
2
7
7
2
2
9
9
9
7
6
2
7
2
2
7
6
8
9
2
7
9
7
2
2
7
9
2
7
2
2
2
8
2
7
7
7
2
2
2
2
2
2
2
9
2
2
2
2
9
7
7
7
7
7
2
7
7
2
2
7
7
7
7
7
9
2
2
7
9
7
2
2
9
7
7
1
7
2
2
2
2
2
7
7
7
9
9
9
2
9
7
7
7
2
7
7
7
7
7
7
6
9
9
7
9
2
2
9
9
6
7
8
7
2
7
2
7
7
7
7
2
7
6
7
9
7
2
2
7
2
7
7
7
2
9
7
7
9
7
7
9
7
7
2
7
7
1
7
7
7
9
7
2
7
7
2
9
7
9
9
2
7
7
2
9
7
7
7
7
6
2
2
7
6
6
6
6
6
7
7
7
7
2
7
2
2
7
6
6
7
7
7
2
6
7
7
7
9
2
7
7
8
2
9
7
7
7
9
7
7
7
9
7
7
2
7
7
7
6
6
7
2
7
2
6
7
7
7
7
7
7
7
6
6
7
7
7
7
7
2
2
7
2
7
6
2
9
2
7
7
9
2
7
7
7
7
9
9
2
7
7
7
3
7
2
9
2
7
7
7
7
7
7
9
6
9
2
7
2
7
7
7
7
7
2
2
2
2
9
7
7
7
9
2
2
9
9
5
7
7
7
9
2
2
7
2
5
7
7
2
2
9
7
2
2
7
7
2
7
8
5
1
7
7
7
7
7
7
7
7
9
2
6
2
2
7
7
2
7
2
7
2
7


In [78]:
print(cls_list)

[1, 25, 385, 8, 0, 21, 72, 704, 41, 143]
