In [None]:
import os
import collections
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from tensorflow import keras
from keras import Sequential,optimizers, regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Activation, Dense, Embedding, Flatten, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint  

from nltk.corpus import stopwords 

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training CSV, keeping only the 'index', 'text' and 'author' columns.
traindata = pd.read_csv(
    '/content/drive/MyDrive/Ybigta/2021-1/NLP 플젝/cleaned traindata.csv',
    usecols=['index', 'text', 'author'],
)
traindata

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen odin flung hands dont write an...,3
...,...,...,...
54874,54874,mr smith odin whispered hardly dared hope woul...,2
54875,54875,told plan captain us settled details accomplis...,4
54876,54876,sincere wellwisher friend sister lucy odin,1
54877,54877,wanted lend money,3


In [None]:
# Load the test CSV; it carries only 'index' and 'text' (no author label is read).
testdata = pd.read_csv(
    '/content/drive/MyDrive/Ybigta/2021-1/NLP 플젝/cleaned testdata.csv',
    usecols=['index', 'text'],
)
testdata

Unnamed: 0,index,text
0,0,think one charming young ladies ever met might...
1,1,replied sudden consciousness find cannot ignor...
2,2,lady stated intention screaming course would s...
3,3,suddenly silence heard sound sent heart mouth ...
4,4,conviction remained unchanged far knowand beli...
...,...,...
19612,19612,end another day two odin growing visibly stron...
19613,19613,afternoon sat together mostly silence watching...
19614,19614,odin carried thanks odin proceeded happiness l...
19615,19615,soon upon odins leaving room mama said odin al...


In [None]:
# Preview the training text column as loaded.
traindata['text']

0        almost choking much much wanted say strange ex...
1                                     sister asked suppose
2        engaged one day walked perusing janes last let...
3        captain porch keeping carefully way treacherou...
4        mercy gentlemen odin flung hands dont write an...
                               ...                        
54874    mr smith odin whispered hardly dared hope woul...
54875    told plan captain us settled details accomplis...
54876           sincere wellwisher friend sister lucy odin
54877                                    wanted lend money
54878                     certainly occurred said yes like
Name: text, Length: 54879, dtype: object

In [None]:
# Model inputs: the text columns as pandas Series, the author labels as a NumPy array.
X_train = traindata['text']
X_test = testdata['text']
y_train = np.array(traindata['author'])

# Number of distinct author labels; CNNmodel() uses it as the width of the softmax layer.
classes = traindata['author'].nunique()

# max_len: length every token sequence is padded to.
# embedding_dim: size of each learned word vector.
max_len, embedding_dim = 256, 300

In [None]:
# Cast every entry of both text Series to a string before tokenisation.
# NOTE(review): a missing value would become the literal string 'nan' here —
# confirm the cleaned CSVs contain no empty rows.
X_train, X_test = X_train.astype(str), X_test.astype(str)

In [None]:
# Build the tokenizer vocabulary from the training texts.
# NOTE(review): the following cell creates and fits a fresh Tokenizer on the
# same data, so the work done in this cell is repeated there.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Fit the vocabulary on the training texts only, then turn both splits into
# integer id sequences of length max_len.
# NOTE: X_train / X_test are rebound from text to integer arrays below, so this
# cell must not be re-run without first re-running the cells that define the text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)

sequences = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(sequences, maxlen=max_len)

In [None]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Embedding input size: one more than the vocabulary size
# (presumably because id 0 is left for padding — confirm against the Tokenizer docs).
num_words = 1 + len(word_index)
num_words

46910

In [None]:
# Module-level Adam optimizer that CNNmodel() compiles with.
# `learning_rate` replaces the deprecated `lr` alias, and `decay=0.0` (the
# default) is dropped because newer Keras optimizers no longer accept it.
optimizer = optimizers.Adam(learning_rate=0.001)

In [None]:
def CNNmodel(optimizer=None):
  """Build and compile the 1-D CNN text classifier.

  Layers: Embedding -> Conv1D(64, 7) -> MaxPooling1D(2) -> Conv1D(64, 7)
  -> GlobalMaxPooling1D -> Dense(32, relu, L2 1e-5) -> Dropout(0.5)
  -> Dense(classes, softmax).

  Vocabulary size, embedding width, sequence length and number of classes are
  read from the module-level `num_words`, `embedding_dim`, `max_len` and
  `classes`.

  Args:
    optimizer: optional Keras optimizer to compile with. When omitted, a new
      Adam(learning_rate=0.001) is created for this model, so optimizer state
      is never shared between models built by separate calls.

  Returns:
    A `Sequential` model compiled with sparse categorical cross-entropy and
    the 'acc' metric.
  """
  if optimizer is None:
    # One optimizer per model: reusing a single instance across the models
    # built for different folds would carry its state from one to the next.
    optimizer = optimizers.Adam(learning_rate=0.001)

  model = Sequential([
    Embedding(num_words, embedding_dim, input_length=max_len),
    Conv1D(filters=64, kernel_size=7, strides=1, padding='valid', activation='relu'),
    MaxPooling1D(2),
    Conv1D(filters=64, kernel_size=7, strides=1, padding='valid', activation='relu'),
    # Global pooling already yields one value per filter, so the former
    # Flatten layer after it was a no-op and has been removed.
    GlobalMaxPooling1D(),

    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-5)),
    Dropout(0.5),
    Dense(classes, activation='softmax'),
  ])

  model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

  return model

In [None]:
# Sanity-check the shapes of the padded inputs and the label vector, one per line.
print(X_train.shape, X_test.shape, y_train.shape, sep='\n')

(54879, 256)
(19617, 256)
(54879,)


In [None]:
# 5-fold stratified split with a fixed seed. Each entry of `folds` is a
# (train_idx, valid_idx) pair of row-index arrays into X_train / y_train.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
folds = list(skf.split(X_train, y_train))

# Expose the last fold's indices as train_idx / valid_idx for quick inspection.
train_idx, valid_idx = folds[-1]

In [None]:
# Training-row indices of the last fold.
folds[-1][0]

array([    1,     2,     3, ..., 54876, 54877, 54878])

In [None]:
# Settings shared by every fold (hoisted out of the loop).
batch_size = 256
epochs = 10
model_name = 'CNNmodel'

# ModelCheckpoint writes into this directory; create it up front so the first
# save cannot fail on a fresh runtime.
os.makedirs('model', exist_ok=True)

# Training curves of every fold (the plain dict from History.history), in fold order.
histories = []

# Train one model per cross-validation fold, checkpointing the best epoch of each.
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'training model for CV #{fold+1}')
    train_X, valid_X = X_train[train_idx], X_train[valid_idx]
    train_y, valid_y = y_train[train_idx], y_train[valid_idx]

    # Polynomial decay of the learning rate from 1e-3 to 1e-6 over the whole
    # run: ceil(len(train_X) / batch_size) optimizer steps per epoch, times `epochs`.
    steps_per_epoch = -(-len(train_X) // batch_size)
    decay_st = steps_per_epoch * epochs
    poly_scheduler = optimizers.schedules.PolynomialDecay(1e-3, decay_st, end_learning_rate=1e-6, power=0.9)
    opt = optimizers.Adam(poly_scheduler)

    model = CNNmodel()

    # Re-compile so this fold trains with its own optimizer and schedule.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    cp_name = 'model/{}-{}'.format(model_name, fold)
    # Keep only the epoch with the lowest validation loss. The deprecated
    # `period=1` argument is dropped; saving every epoch is the default.
    cp = ModelCheckpoint(cp_name+".hdf5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    # Stop after 3 epochs without a val_loss improvement of at least 0.001 and
    # roll the model back to its best weights.
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3, verbose=1, mode='min', baseline=None, restore_best_weights=True)

    history = model.fit(train_X, train_y, epochs=epochs, validation_data=(valid_X, valid_y), batch_size=batch_size, callbacks=[cp, es])
    histories.append(history.history)

training model for CV #1
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.96690, saving model to model/CNNmodel-0.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 0.96690 to 0.82164, saving model to model/CNNmodel-0.hdf5
Epoch 3/10

Epoch 00003: val_loss improved from 0.82164 to 0.81526, saving model to model/CNNmodel-0.hdf5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.81526
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.81526
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.81526
Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping
training model for CV #2
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.96144, saving model to model/CNNmodel-1.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 0.96144 to 0.76591, saving model to model/CNNmodel-1.hdf5
Epoch 3/10

Epoch 00003: val_loss improved from 0.76591 to 0.75284, saving model to model/CNNmodel-1.hdf5
Epoch 4/10

Epoch 00004: val_loss did not improve

KeyboardInterrupt: ignored

In [None]:
# Predict the test set once per fold, using that fold's saved checkpoint.
# The network is built here rather than taken from whatever `model` an earlier
# cell left behind, so this cell only needs the checkpoint files on disk.
model = CNNmodel()

test_pred = []
for fold, fold_indices in enumerate(folds):
    valid_idx = fold_indices[1]
    # Same relative location the training loop saves its checkpoints to.
    model.load_weights('model/CNNmodel-{}.hdf5'.format(fold))
    X_valid, y_valid = X_train[valid_idx], y_train[valid_idx]
    # Report the held-out score of this fold as a check that the right weights were loaded.
    model.evaluate(X_valid, y_valid)
    test_pred.append(model.predict(X_test))



In [None]:
# Turn the list of per-fold predictions into a single array; the first axis indexes the folds.
test_pred = np.asarray(test_pred)

In [None]:
# Fill every column after the first of the sample submission with the
# prediction averaged over folds, then write the result to disk.
submit = pd.read_csv('/content/drive/MyDrive/Ybigta/2021-1/NLP 플젝/sample_submission.csv')
fold_mean = test_pred.mean(axis=0)
submit.iloc[:, 1:] = fold_mean
submit.to_csv('submission.csv', index=False)

In [None]:
# Read the submission back from the same relative path it was written to
# (the previous absolute '/content/...' path only matched when the working
# directory happened to be /content).
result = pd.read_csv('submission.csv')
result

Unnamed: 0,index,0,1,2,3,4
0,0,0.015195,0.130377,3.574973e-01,0.472548,2.438160e-02
1,1,0.122263,0.363682,1.119062e-01,0.033806,3.683426e-01
2,2,0.998011,0.000988,3.125374e-06,0.000010,9.874888e-04
3,3,0.005375,0.001194,7.714717e-01,0.002054,2.199043e-01
4,4,0.515027,0.196175,4.830802e-02,0.116296,1.241937e-01
...,...,...,...,...,...,...
19612,19612,0.000240,0.999755,6.459565e-08,0.000004,2.211284e-07
19613,19613,0.249659,0.006118,3.114402e-02,0.002739,7.103395e-01
19614,19614,0.000031,0.999967,9.321637e-08,0.000002,2.830702e-07
19615,19615,0.004451,0.994663,6.392672e-05,0.000732,8.991723e-05
