### Document to Text with Space between Words
form: pandas data frame

In [1]:
import re

import pandas as pd
import spacy

In [2]:
nlp = spacy.load('ja_ginza')

In [3]:
D_path = './corpus2/'
topics = ['Gourmet', 'Keitai', 'Kyoto', 'Sports']

In [4]:
cols = ['doc', 'label']
df = pd.DataFrame(index=[], columns=cols)
regex = re.compile(r'(［.*］)(.*)')

for topic in topics:
    data = pd.read_table(D_path + topic + '.tsv', header = None, dtype = str, usecols = [1])
    for d in data.itertuples():
        m = regex.match(d[1])
        # first line of the docment
        if m != None:
            if d[0] != 0:
                df = df.append(pd.DataFrame([doc.strip(), topic], index = cols).T)
            sent = m.groups()[1]
            doc = ''
        else:
            sent = d[1]
        doc = doc+' ' + ' '.join([t.text for t in nlp(sent)])
    df = df.append(pd.DataFrame([doc.strip(), topic], index = cols).T)

In [5]:
df[:3]

Unnamed: 0,doc,label
0,烏丸 六角 の おかき 屋 さん 六角堂 の 前 に ある 、 蕪村 庵 と いう お店 に...,Gourmet
0,河原町 の 居酒屋 この 間 先輩 たち に つれ られ 、 河原町 の 居酒屋 へ 行っ ...,Gourmet
0,ちょっと 贅沢 ほんの ちょこっと な ん だ けど 、 贅沢 し たい とき に 何 を ...,Gourmet


In [6]:
df.shape

(249, 2)

In [7]:
df['label'].value_counts(sort = False)

Kyoto      91
Gourmet    57
Sports     22
Keitai     79
Name: label, dtype: int64

#### DataFrame to CSV

In [8]:
df.to_csv('corpus2/split_data_full.csv')

In [2]:
df = pd.read_csv('corpus2/split_data_full.csv', index_col=0)

In [3]:
df[:3]

Unnamed: 0,doc,label
0,烏丸 六角 の おかき 屋 さん 六角堂 の 前 に ある 、 蕪村 庵 と いう お店 に...,Gourmet
0,河原町 の 居酒屋 この 間 先輩 たち に つれ られ 、 河原町 の 居酒屋 へ 行っ ...,Gourmet
0,ちょっと 贅沢 ほんの ちょこっと な ん だ けど 、 贅沢 し たい とき に 何 を ...,Gourmet


In [4]:
df.shape

(249, 2)

### Neural Network

In [3]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning import Trainer

import pkg_resources, imp
imp.reload(pkg_resources)

use_cuda = torch.cuda.is_available() and False
device = torch.device("cuda" if use_cuda else "cpu")

#### 3 Layers Neural Network

In [4]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['label'])
X = df['doc']

vec = TfidfVectorizer()
Xv = vec.fit_transform(X)

#割合均等に分割
train_X, test_X, train_y, test_y = train_test_split(
    Xv, y, test_size=0.3, random_state=0, stratify=y)
print(f'''
train_X: {train_X.shape}
test_X: {test_X.shape}
train_y: {train_y.shape}
test_y: {test_y.shape}
''')
print(pd.Series(train_y.tolist()).value_counts())
print(pd.Series(test_y.tolist()).value_counts())

train_X = torch.Tensor(train_X.todense())
train_y = torch.LongTensor(train_y)
test_X = torch.Tensor(test_X.todense())
test_y = torch.LongTensor(test_y)
train = torch.utils.data.TensorDataset(train_X, train_y)
test = torch.utils.data.TensorDataset(test_X, test_y)


train_X: (174, 7138)
test_X: (75, 7138)
train_y: (174,)
test_y: (75,)

2    64
1    55
0    40
3    15
dtype: int64
2    27
1    24
0    17
3     7
dtype: int64


In [5]:
# model definition
input_size = train_X.shape[1]
output_size = len(list(set(df['label'].tolist())))

class MLP(pl.LightningModule):
    
    def __init__(self, input_size=input_size, hidden_size=10, output_size=output_size, batch_size=10):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.batch_size = batch_size
    
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        return x
    
    def lossfun(self, y, t):
        return F.cross_entropy(y, t)
    
    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr = 0.1)
    
    @pl.data_loader
    def train_dataloader(self):
        return torch.utils.data.DataLoader(train, self.batch_size, shuffle=True, num_workers=4)
    
    def test_dataloader(self):
        return torch.utils.data.DataLoader(test, self.batch_size, shuffle=False, num_workers=4)
    
    def training_step(self, batch, batch_nb):
        x, t = batch
        y = self.forward(x)
        loss = self.lossfun(y, t)
        results = {'loss':loss}
        return results
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        return {'test_loss':F.cross_entropy(y_hat, y)}
    
    def test_epoch_end(self, outputs):
        test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean()
        return {'test_loss':test_loss_mean}

In [6]:
mlp = MLP()
trainer = Trainer(max_nb_epochs=50)
trainer.fit(mlp)
trainer.test()

INFO:lightning:GPU available: False, used: False
INFO:lightning:
  | Name | Type   | Params
----------------------------
0 | fc1  | Linear | 71 K  
1 | fc2  | Linear | 44    


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(0.6301)}
--------------------------------------------------------------------------------



In [11]:
pred_y = []

with torch.no_grad():
    for (docs, labels) in torch.utils.data.DataLoader(test, 1, shuffle=False):
        outputs = mlp(docs)
        _, pred = torch.max(outputs.data, 1)
        pred_y.append(pred.item())
print(
    classification_report(
        test_y, pred_y, target_names=list(set(df['label'].tolist()))
    )
)


              precision    recall  f1-score   support

     Gourmet       0.65      0.76      0.70        17
       Kyoto       1.00      0.92      0.96        24
      Keitai       0.79      0.96      0.87        27
      Sports       0.00      0.00      0.00         7

    accuracy                           0.81        75
   macro avg       0.61      0.66      0.63        75
weighted avg       0.75      0.81      0.78        75



  _warn_prf(average, modifier, msg_start, len(result))


#### Conv Neural Network