### Documents as Text with Spaces between Words
Format: pandas DataFrame

In [1]:
import re

import pandas as pd
import spacy

In [2]:
# Corpus directory and the four topic names used as document labels.
D_path = "./corpus2/"
topics = [
    "Gourmet",
    "Keitai",
    "Kyoto",
    "Sports",
]

#### Load DataFrame from CSV

In [3]:
# Load the space-separated documents and their labels; the first CSV column
# becomes the index. The path is built from D_path so the corpus location is
# configured in a single place.
df = pd.read_csv(D_path + 'split_data_full.csv', index_col=0)

In [4]:
# Preview the first three documents.
df.head(3)

Unnamed: 0,doc,label
0,烏丸 六角 の おかき 屋 さん 六角堂 の 前 に ある 、 蕪村 庵 と いう お店 に...,Gourmet
0,河原町 の 居酒屋 この 間 先輩 たち に つれ られ 、 河原町 の 居酒屋 へ 行っ ...,Gourmet
0,ちょっと 贅沢 ほんの ちょこっと な ん だ けど 、 贅沢 し たい とき に 何 を ...,Gourmet


In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(249, 2)

### Neural Network

In [10]:
import imp
import time

import numpy as np
import pkg_resources
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pytorch_lightning import Trainer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Re-import pkg_resources inside the running kernel.
# NOTE(review): the reason for this reload is not visible in the notebook —
# presumably a workaround for a stale package listing; confirm or remove.
imp.reload(pkg_resources)


# The trailing `and False` pins the run to the CPU even when CUDA is detected.
use_cuda = torch.cuda.is_available() and False
device = torch.device("cuda") if use_cuda else torch.device("cpu")

#### 3-Layer Neural Network

In [6]:
# Encode the string topic labels as integer class ids.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['label'])
X = df['doc']
print(le.classes_)

# TF-IDF features over the space-separated documents.
# NOTE(review): the vectorizer is fitted on the whole corpus before the split,
# so test documents contribute to the vocabulary and IDF weights — consider
# fitting on the training portion only.
vec = TfidfVectorizer()
Xv = vec.fit_transform(X)

# Fixed random_state keeps the 70/30 split reproducible.
# NOTE(review): the split is not stratified; passing `stratify=y` would keep
# the class proportions equal in both parts — confirm whether that is wanted.
train_X, test_X, train_y, test_y = train_test_split(
    Xv, y, test_size=0.3, random_state=1)
print(f'train_X: {train_X.shape}\ntest_X: {test_X.shape}\ntrain_y: {train_y.shape}\ntest_y: {test_y.shape}\n')
print(f'{pd.Series(train_y.tolist()).value_counts()}\n\n{pd.Series(test_y.tolist()).value_counts()}')

# Densify the sparse matrices with `toarray()` (a plain ndarray) rather than
# `todense()` (which returns the discouraged `numpy.matrix`), then wrap
# features and labels as tensors and pair them into datasets.
train_X = torch.Tensor(train_X.toarray())
train_y = torch.LongTensor(train_y)
test_X = torch.Tensor(test_X.toarray())
test_y = torch.LongTensor(test_y)
train = torch.utils.data.TensorDataset(train_X, train_y)
test = torch.utils.data.TensorDataset(test_X, test_y)

['Gourmet' 'Keitai' 'Kyoto' 'Sports']
train_X: (174, 7138)
test_X: (75, 7138)
train_y: (174,)
test_y: (75,)

2    65
1    58
0    38
3    13
dtype: int64

2    26
1    21
0    19
3     9
dtype: int64


In [7]:
# model definition
class MLP(pl.LightningModule):
    """Perceptron with one hidden layer and a ReLU activation, optimised with SGD.

    Args:
        hparams: mapping read with the keys ``input_size``, ``hidden_size``,
            ``output_size``, ``batch_size``, ``learning_rate`` and
            ``num_workers``.

    The data loaders read the notebook-level ``train`` and ``test`` datasets,
    so both must be defined before the model is fitted or tested.
    """

    def __init__(self, hparams):
        super(MLP, self).__init__()
        self.hparams = hparams
        self.fc1 = nn.Linear(self.hparams['input_size'],
                             self.hparams['hidden_size'])
        self.fc2 = nn.Linear(self.hparams['hidden_size'],
                             self.hparams['output_size'])
        self.batch_size = self.hparams['batch_size']
        self.lr = self.hparams['learning_rate']
        self.nw = self.hparams['num_workers']

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

    def lossfun(self, y, t):
        """Cross-entropy between the scores ``y`` and the target class ids ``t``."""
        return F.cross_entropy(y, t)

    def configure_optimizers(self):
        """Plain SGD over all parameters with the configured learning rate."""
        return optim.SGD(self.parameters(), lr = self.lr)

    # NOTE(review): the decorator is applied to the training loader only;
    # confirm whether the installed Lightning version still needs it.
    @pl.data_loader
    def train_dataloader(self):
        """Shuffled mini-batches over the global ``train`` dataset."""
        return torch.utils.data.DataLoader(train, self.batch_size, shuffle=True, num_workers=self.nw)

    def test_dataloader(self):
        """Ordered mini-batches over the global ``test`` dataset."""
        return torch.utils.data.DataLoader(test, self.batch_size, shuffle=False, num_workers=self.nw)

    def training_step(self, batch, batch_nb):
        """Compute the loss for one training batch and return it under ``'loss'``."""
        x, t = batch
        # Call the module itself rather than ``forward`` so registered hooks
        # run, matching what ``test_step`` does.
        y = self(x)
        loss = self.lossfun(y, t)
        results = {'loss':loss}
        return results

    def test_step(self, batch, batch_idx):
        """Compute the loss for one test batch and return it under ``'test_loss'``."""
        x, y = batch
        y_hat = self(x)
        loss = self.lossfun(y_hat, y)
        return {'test_loss':loss}

    def test_epoch_end(self, outputs):
        """Average the per-batch ``'test_loss'`` values collected in ``outputs``."""
        test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean()
        return {'test_loss':test_loss_mean}

In [8]:
def train_test(hparams):
    """Train an ``MLP``, run the Lightning test loop, and print a per-class report.

    Args:
        hparams: mapping passed to ``MLP``; its ``'epochs'`` entry sets the
            number of training epochs.

    Reads the notebook-level ``test`` dataset, ``test_y`` labels and the label
    encoder ``le``. Returns nothing; the classification report is printed.
    """
    mlp = MLP(hparams)
    # `max_epochs` replaces the deprecated `max_nb_epochs` keyword.
    trainer = Trainer(max_epochs=hparams['epochs'])
    trainer.fit(mlp)
    trainer.test()

    pred_y = []

    # Make sure the network is in inference mode for the manual prediction pass.
    mlp.eval()
    with torch.no_grad():
        # One document per batch, in dataset order, so predictions line up with test_y.
        for (docs, labels) in torch.utils.data.DataLoader(test, 1, shuffle=False):
            outputs = mlp(docs)
            # Gradients are already disabled, so the legacy `.data` access is not needed.
            _, pred = torch.max(outputs, 1)
            pred_y.append(pred.item())
    print(
        classification_report(
            test_y, pred_y, target_names=le.classes_
        )
    )

In [38]:


# Run configuration: 10 hidden units.
hparams = {
    'input_size': train_X.shape[1],    # number of feature columns
    'output_size': len(le.classes_),   # one output per encoded label
    'hidden_size': 10,
    'batch_size': 10,
    'learning_rate': 0.1,
    'num_workers': 4,
    'epochs': 50,
}

# Print the configuration, then time the full train + test cycle.
print('--------------------------------------------------------')
print(hparams)
t = time.time()
train_test(hparams)
print(f'time:{time.time()-t}')
print('--------------------------------------------------------')

INFO:lightning:GPU available: False, used: False
INFO:lightning:
  | Name | Type   | Params
----------------------------
0 | fc1  | Linear | 71 K  
1 | fc2  | Linear | 44    


--------------------------------------------------------
{'input_size': 7138, 'output_size': 4, 'hidden_size': 10, 'batch_size': 10, 'learning_rate': 0.1, 'num_workers': 4, 'epochs': 50}


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(0.6131)}
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

     Gourmet       0.64      0.95      0.77        19
      Keitai       1.00      0.90      0.95        21
       Kyoto       0.89      0.92      0.91        26
      Sports       1.00      0.11      0.20         9

    accuracy                           0.83        75
   macro avg       0.88      0.72      0.71        75
weighted avg       0.87      0.83      0.80        75

time:214.26565289497375
--------------------------------------------------------


In [11]:
# Run configuration: 70 hidden units.
hparams = {
    'input_size': train_X.shape[1],    # number of feature columns
    'output_size': len(le.classes_),   # one output per encoded label
    'hidden_size': 70,
    'batch_size': 10,
    'learning_rate': 0.1,
    'num_workers': 4,
    'epochs': 50,
}

# Print the configuration, then time the full train + test cycle.
print('--------------------------------------------------------')
print(hparams)
t = time.time()
train_test(hparams)
print(f'time:{time.time()-t}')
print('--------------------------------------------------------')

INFO:lightning:GPU available: False, used: False


--------------------------------------------------------
{'input_size': 7138, 'output_size': 4, 'hidden_size': 70, 'batch_size': 10, 'learning_rate': 0.1, 'num_workers': 4, 'epochs': 50}


INFO:lightning:
  | Name | Type   | Params
----------------------------
0 | fc1  | Linear | 499 K 
1 | fc2  | Linear | 284   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(0.6004)}
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

     Gourmet       0.89      0.89      0.89        19
      Keitai       1.00      0.90      0.95        21
       Kyoto       0.71      0.96      0.82        26
      Sports       1.00      0.22      0.36         9

    accuracy                           0.84        75
   macro avg       0.90      0.75      0.76        75
weighted avg       0.87      0.84      0.82        75

time:235.68215537071228
--------------------------------------------------------


In [12]:
# model definition
class MLP(pl.LightningModule):
    """Perceptron with one hidden layer and a sigmoid activation, optimised with Adagrad.

    This definition reuses the name ``MLP`` and therefore replaces the class
    defined in the earlier cell for every later ``MLP(...)`` call.

    Args:
        hparams: mapping read with the keys ``input_size``, ``hidden_size``,
            ``output_size``, ``batch_size``, ``learning_rate`` and
            ``num_workers``.

    The data loaders read the notebook-level ``train`` and ``test`` datasets,
    so both must be defined before the model is fitted or tested.
    """

    def __init__(self, hparams):
        super(MLP, self).__init__()
        self.hparams = hparams
        self.fc1 = nn.Linear(self.hparams['input_size'],
                             self.hparams['hidden_size'])
        self.fc2 = nn.Linear(self.hparams['hidden_size'],
                             self.hparams['output_size'])
        self.batch_size = self.hparams['batch_size']
        self.lr = self.hparams['learning_rate']
        self.nw = self.hparams['num_workers']

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        x = self.fc1(x)
        # torch.sigmoid is the supported spelling; F.sigmoid is deprecated.
        x = torch.sigmoid(x)
        x = self.fc2(x)
        return x

    def lossfun(self, y, t):
        """Cross-entropy between the scores ``y`` and the target class ids ``t``."""
        return F.cross_entropy(y, t)

    def configure_optimizers(self):
        """Adagrad over all parameters with the configured learning rate."""
        return optim.Adagrad(self.parameters(), lr = self.lr)

    # NOTE(review): the decorator is applied to the training loader only;
    # confirm whether the installed Lightning version still needs it.
    @pl.data_loader
    def train_dataloader(self):
        """Shuffled mini-batches over the global ``train`` dataset."""
        return torch.utils.data.DataLoader(train, self.batch_size, shuffle=True, num_workers=self.nw)

    def test_dataloader(self):
        """Ordered mini-batches over the global ``test`` dataset."""
        return torch.utils.data.DataLoader(test, self.batch_size, shuffle=False, num_workers=self.nw)

    def training_step(self, batch, batch_nb):
        """Compute the loss for one training batch and return it under ``'loss'``."""
        x, t = batch
        # Call the module itself rather than ``forward`` so registered hooks
        # run, matching what ``test_step`` does.
        y = self(x)
        loss = self.lossfun(y, t)
        results = {'loss':loss}
        return results

    def test_step(self, batch, batch_idx):
        """Compute the loss for one test batch and return it under ``'test_loss'``."""
        x, y = batch
        y_hat = self(x)
        loss = self.lossfun(y_hat, y)
        return {'test_loss':loss}

    def test_epoch_end(self, outputs):
        """Average the per-batch ``'test_loss'`` values collected in ``outputs``."""
        test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean()
        return {'test_loss':test_loss_mean}

In [13]:
import time

# Run configuration: 50 hidden units.
hparams = {
    'input_size': train_X.shape[1],    # number of feature columns
    'output_size': len(le.classes_),   # one output per encoded label
    'hidden_size': 50,
    'batch_size': 10,
    'learning_rate': 0.1,
    'num_workers': 4,
    'epochs': 50,
}

# Print the configuration, then time the full train + test cycle.
print('--------------------------------------------------------')
print(hparams)
t = time.time()
train_test(hparams)
print(f'time:{time.time()-t}')
print('--------------------------------------------------------')

INFO:lightning:GPU available: False, used: False
INFO:lightning:
  | Name | Type   | Params
----------------------------
0 | fc1  | Linear | 356 K 
1 | fc2  | Linear | 204   


--------------------------------------------------------
{'input_size': 7138, 'output_size': 4, 'hidden_size': 50, 'batch_size': 10, 'learning_rate': 0.1, 'num_workers': 4, 'epochs': 50}


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…






HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(0.3778)}
--------------------------------------------------------------------------------

              precision    recall  f1-score   support

     Gourmet       0.90      0.95      0.92        19
      Keitai       0.95      0.95      0.95        21
       Kyoto       0.83      0.96      0.89        26
      Sports       1.00      0.44      0.62         9

    accuracy                           0.89        75
   macro avg       0.92      0.83      0.85        75
weighted avg       0.90      0.89      0.88        75

time:352.79061460494995
--------------------------------------------------------


#### Conv Neural Network