In [1]:
import os
import glob
import time

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import POSKeepFilter
from sklearn import datasets, model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,TensorDataset
%matplotlib inline

class Word:
    """Morphological analysis of a single piece of text.

    The text is tokenized with Janome and one row per token is stored in
    ``self.df`` with the columns listed in ``_COLUMNS``:

    * ``word``              -- surface form
    * ``part_of_speech``    -- part of speech (first POS field)
    * ``type1``..``type3``  -- POS sub-classifications 1-3
    * ``infl_type``         -- inflection type
    * ``infl_form``         -- inflection form
    * ``base_form``         -- base (dictionary) form
    * ``reading``           -- reading
    * ``phonetic``          -- pronunciation
    * ``number_of_word``    -- ``len(word)``
    * ``number_of_reading`` -- ``len(reading)``
    """

    # Column order of ``self.df``; must match the row layout built in __init__.
    _COLUMNS = ['word', 'part_of_speech', 'type1', 'type2', 'type3',
                'infl_type', 'infl_form', 'base_form', 'reading', 'phonetic',
                'number_of_word', 'number_of_reading']

    # Shared tokenizer, created on first use, so that building many Word
    # objects (Word2value builds one per document) does not construct a new
    # Tokenizer every time.
    _tokenizer = None

    def __init__(self, Text):
        """Tokenize ``Text`` and build the per-token DataFrame.

        Parameters
        ----------
        Text : str
            Text to analyse. Newlines and ASCII spaces are removed before
            tokenization.
        """
        text = Text.replace('\n', '').replace(' ', '')
        if Word._tokenizer is None:
            Word._tokenizer = Tokenizer()

        rows = []
        for token in Word._tokenizer.tokenize(text):
            word = token.surface
            reading = token.reading
            # part_of_speech is a comma-separated string:
            # POS, sub-class 1, sub-class 2, sub-class 3
            pos = token.part_of_speech.split(',')
            rows.append([word, pos[0], pos[1], pos[2], pos[3],
                         token.infl_type, token.infl_form, token.base_form,
                         reading, token.phonetic, len(word), len(reading)])

        # Passing the columns to the constructor (instead of assigning them
        # afterwards) keeps this working when the text yields no tokens.
        self.df = pd.DataFrame(rows, columns=self._COLUMNS)

    def output_df(self):
        """Return the per-token DataFrame."""
        return self.df

    def _words_of(self, part_of_speech):
        """Return the surface forms whose part of speech equals ``part_of_speech``."""
        return list(self.df[self.df.part_of_speech == part_of_speech].word)

    def part_s(self):
        """Return (and cache on ``self._part_s``) the list of nouns ('名詞')."""
        self._part_s = self._words_of('名詞')
        return self._part_s

    def part_v(self):
        """Return (and cache on ``self._part_v``) the list of verbs ('動詞')."""
        self._part_v = self._words_of('動詞')
        return self._part_v

    def part_o(self):
        """Return (and cache on ``self._part_o``) the list of adjectives ('形容詞')."""
        # This must be an assignment; a comparison here would read an
        # attribute that has never been set.
        self._part_o = self._words_of('形容詞')
        return self._part_o

    def list_s2str(self):
        """Return the nouns joined by single spaces (also stored on ``self.str_s``)."""
        self.str_s = ' '.join(self.part_s())
        return self.str_s

    def list_v2str(self):
        """Return the verbs joined by single spaces (also stored on ``self.str_v``)."""
        self.str_v = ' '.join(self.part_v())
        return self.str_v

    def list_o2str(self):
        """Return the adjectives joined by single spaces (also stored on ``self.str_o``)."""
        self.str_o = ' '.join(self.part_o())
        return self.str_o
    
class Word2value(Word):
    """Turn a table of documents into count and TF-IDF feature matrices.

    Attributes set by ``__init__``:

    * ``x_array``       -- one space-separated noun string per document
    * ``y_array``       -- the labels taken from ``csv.Y``
    * ``cntvec``        -- the fitted ``CountVectorizer``
    * ``x_cntararry``   -- dense term-count matrix (documents x terms)
    * ``tfidf_vec``     -- the fitted ``TfidfVectorizer``
    * ``x_tfidf_array`` -- dense TF-IDF matrix (documents x terms)
    """

    def __init__(self, csv):
        """Vectorize the documents in ``csv``.

        Parameters
        ----------
        csv : object with ``X`` and ``Y`` attributes
            ``csv.X`` is iterated as the document texts and ``csv.Y`` as the
            labels (the notebook passes a DataFrame read from a CSV file).
        """
        # Word.__init__ is not called for this object; instead a separate
        # Word is built for every document and reduced to its noun string.
        x_ls = [Word(i).list_s2str() for i in csv.X]
        y_ls = list(csv.Y)

        self.x_array = np.array(x_ls)
        self.y_array = np.array(y_ls)

        self.cntvec = CountVectorizer()
        x_cntvecs = self.cntvec.fit_transform(self.x_array)
        self.x_cntararry = x_cntvecs.toarray()

        # Keep the fitted vectorizer on the instance so that the same
        # transformation can later be applied to unseen text.
        self.tfidf_vec = TfidfVectorizer(use_idf = True)
        x_tfidf_vecs = self.tfidf_vec.fit_transform(self.x_array)
        self.x_tfidf_array = x_tfidf_vecs.toarray()

    def len_all_term(self):
        """Return the number of terms (columns of the TF-IDF matrix)."""
        return len(self.x_tfidf_array[0])

    def len_all_documents(self):
        """Return the number of documents (rows of the TF-IDF matrix)."""
        return len(self.x_tfidf_array)

    def output_term_columns(self):
        """Return the vocabulary terms ordered by their column index."""
        # vocabulary_ maps term -> column index; sorting by index yields the
        # terms in column order.
        by_index = sorted(self.cntvec.vocabulary_.items(), key = lambda kv: kv[1])
        return [str(term) for term, _ in by_index]

    def IDF(self):
        """Return ``log10(N / df) + 1`` for every term as a list.

        ``N`` is the number of documents and ``df`` the number of documents
        whose count for the term is non-zero. The result is also stored on
        ``self.idf``.
        """
        tf = self.x_cntararry
        n_documents = len(tf)
        document_freq = (tf != 0).sum(axis = 0)   # per-term document count
        self.idf = list(np.log10(n_documents / document_freq) + 1)
        return self.idf

    def TF(self):
        """Return the term-count matrix (columns = terms)."""
        return self.x_cntararry

    def TF_IDF(self):
        """Return the TF-IDF matrix produced by the fitted ``TfidfVectorizer``.

        NOTE(review): these values come from scikit-learn's vectorizer, not
        from ``TF() * IDF()``; the two are computed independently in this
        class and are not guaranteed to agree -- confirm before mixing them.
        """
        return self.x_tfidf_array
    

## Network definition
class Net(nn.Module):
    """Fully connected classifier: six linear layers with ReLU in between.

    Layer widths are ``data_len -> 256 -> 256 -> 256 -> 128 -> 128 -> 2``.
    The output is the log-softmax over the two classes.

    Parameters
    ----------
    data_len : int
        Number of input features.
    """

    def __init__(self, data_len):
        super().__init__()
        # nn.Linear(in_features, out_features) is the affine map y = Ax + b;
        # the bias is learned by default.
        self.fc1 = nn.Linear(data_len, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 128)
        self.fc6 = nn.Linear(128, 2)

    def forward(self, x):
        """Return log-probabilities over the two classes.

        ``x`` holds the features in its last dimension, so both a single
        sample of shape ``(data_len,)`` and a batch ``(N, data_len)`` work.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        # Normalise over the class dimension explicitly. Omitting ``dim`` is
        # deprecated and makes PyTorch guess the dimension from the rank.
        return F.log_softmax(x, dim=-1)

def deep_learning(data1, data2, num=1000, lr=0.01, batch_size=100, test_size=0.2):
    """Train a ``Net`` classifier and report its hold-out accuracy.

    Parameters
    ----------
    data1 : numpy.ndarray
        Explanatory variables, one row per sample.
    data2 : numpy.ndarray
        Target class labels, one per sample.
    num : int, optional
        Number of training epochs.
    lr : float, optional
        Learning rate passed to Adam.
    batch_size : int, optional
        Mini-batch size used for training.
    test_size : float, optional
        Fraction of the data held out for the accuracy check.

    Returns
    -------
    Net
        The trained model.

    The accumulated loss and elapsed time are printed every 100 epochs,
    followed by the total time and the accuracy on the held-out split.
    """
    Start = time.time()

    # Split into training and hold-out data.
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        data1, data2, test_size = test_size)

    # Convert to tensors: float features, integer (long) class labels.
    Train_x = torch.from_numpy(train_x).float()
    Train_y = torch.from_numpy(train_y).long()
    Test_x = torch.from_numpy(test_x).float()
    Test_y = torch.from_numpy(test_y).long()

    # Pair features with labels and serve them in shuffled mini-batches.
    train = TensorDataset(Train_x, Train_y)
    train_lorder = DataLoader(train, batch_size = batch_size, shuffle = True)

    model = Net(len(data1[0]))

    # NOTE(review): Net already returns log-softmax values and
    # CrossEntropyLoss applies log-softmax again; kept as in the original.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = lr)

    start = time.time()
    for epoch in range(num):
        total_loss = 0.0
        # Separate names for the batches so the split arrays above are not
        # shadowed.
        for batch_x, batch_y in train_lorder:
            optimizer.zero_grad()               # reset gradients
            output = model(batch_x)             # forward pass
            loss = criterion(output, batch_y)
            loss.backward()                     # backward pass
            optimizer.step()                    # update the weights
            # .item() extracts the Python number from the 0-dim loss tensor;
            # indexing it with [0] fails on current PyTorch.
            total_loss += loss.item()
        # Report the accumulated loss every 100 epochs.
        if (epoch+1)%100 == 0:
            end = time.time()
            time_diff = end - start
            print(epoch+1, total_loss, time_diff)
            start = time.time()
    print()
    End = time.time()
    Time_diff = End - Start
    print(f'time = {Time_diff}')
    print('finished learning')
    print()

    # Accuracy check on the hold-out data; no gradients are needed here.
    with torch.no_grad():
        # Index of the larger of the two outputs = predicted class (0 or 1).
        result = torch.max(model(Test_x), 1)[1]

    accuray = (result == Test_y).sum().item() / len(Test_y)
    print(f'accuray = {accuray*100}%')
    return model

def Classification(model, array):
    """Return the index of the largest model output for one sample.

    Parameters
    ----------
    model : callable
        Called with a float tensor built from ``array``; its result must be
        a tensor.
    array : numpy.ndarray
        Feature vector of a single sample.

    Returns
    -------
    int
        Position of the maximum along dimension 0 of the model output.
    """
    features = torch.from_numpy(array).float()
    scores = model(features).data
    _, best_index = torch.max(scores, 0)
    return int(best_index)

# dataの選択

In [2]:
# Collect the CSV files under data/; sorted so that file[0] does not depend
# on the order in which the file system lists them.
file = sorted(glob.glob('data/*.csv'))
print(file)
if not file:
    raise FileNotFoundError("no CSV file found under 'data/'")
# Word2value below reads the X and Y columns of this table.
csv = pd.read_csv(file[0])

['data/test_data.csv']


# 文字の数値化

In [3]:
# Tokenize every document in csv.X and build the count / TF-IDF matrices.
Word_value = Word2value(csv)

In [4]:
# Show the TF-IDF matrix as a table with one column per vocabulary term
# (first three documents only).
df_tfidf = pd.DataFrame(
    Word_value.TF_IDF(),
    columns=Word_value.output_term_columns(),
)
df_tfidf.head(3)

Unnamed: 0,cios,sdcv,コア
0,0.57735,0.57735,0.57735
1,0.57735,0.57735,0.57735
2,0.57735,0.57735,0.57735


In [5]:
# Explanatory variables: the TF-IDF matrix (one row per document).
data1 = Word_value.x_tfidf_array
# Target variable: the labels read from csv.Y.
data2 = Word_value.y_array

# Train the network; prints the loss every 100 epochs and the final accuracy.
model = deep_learning(data1, data2)



100 tensor(2.7449) 1.5175528526306152
200 tensor(2.7452) 2.1921119689941406
300 tensor(2.7448) 2.8987648487091064
400 tensor(2.7448) 2.6683688163757324
500 tensor(2.7448) 2.706009864807129
600 tensor(2.7450) 2.7700982093811035
700 tensor(2.7452) 2.875519037246704
800 tensor(2.7448) 2.771466016769409
900 tensor(2.7454) 2.7174429893493652
1000 tensor(2.7448) 3.0694987773895264

time = 26.207786798477173
finished learning

accuray = 61.0%


In [6]:
text = 'hosomiとryoutaとコア'

# Vectorize the text: count each vocabulary term among the nouns of `text`
# and weight the counts by IDF.
a = Word(text).list_s2str().split(' ')
# Build the term list once; it is the same for every vocabulary index.
terms = Word_value.output_term_columns()
tf = [a.count(term) for term in terms]
# NOTE(review): the model was trained on Word_value.x_tfidf_array (from
# TfidfVectorizer) while this vector is count * Word_value.IDF(); confirm
# that the two weightings are on the same scale.
tfidf = np.array(tf) * Word_value.IDF()

## Classify
result = Classification(model, tfidf)
result



0