In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Conv1D,MaxPooling1D
from keras.layers import LSTM,Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint
import pandas as pd
# Seed NumPy's global RNG for reproducibility.
# NOTE(review): this seeds NumPy only; Keras/TensorFlow weight initialisation
# presumably draws from a separate RNG that is not fixed here — confirm.
np.random.seed(7)
from prettytable import PrettyTable
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices from the libraries imported above).
warnings.filterwarnings('ignore')

import re
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from nltk.stem import PorterStemmer

# Load spaCy's small English pipeline; in the cells visible here it is only
# consulted for its stop-word list (see clean_abstract).
nlp = spacy.load('en_core_web_sm')

In [None]:
# Colab only: mount Google Drive so the CSV files read below from
# /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the three CSV files from the mounted Drive folder.
df_train = pd.read_csv("/content/drive/MyDrive/locus.csv")
df_valid = pd.read_csv("/content/drive/MyDrive/validation.csv")
test = pd.read_csv("/content/drive/MyDrive/test.csv")
# Stack the train and validation rows into one training frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and, like append, keeps each frame's own index
# labels (so the combined index contains duplicates).
train = pd.concat([df_train, df_valid])

In [None]:
# Model input is the abstract text; the target is the numeric category id.
# The test frame only contributes its abstracts.
X_train, y_train = train['abstract'], train['category_num']
X_test = test['abstract']

In [None]:
# Inspect the target series (pandas abbreviates the display).
y_train

0        138
1         68
2          7
3         93
4         76
        ... 
48819     81
48820    120
48821    150
48822    150
48823    150
Name: category_num, Length: 439427, dtype: int64

In [None]:
# Number of training labels (train + validation rows combined).
y_train.shape

(439427,)

In [None]:
# Shared Porter stemmer instance, used by clean_abstract.
ps = PorterStemmer()

In [None]:
def clean_abstract(text):
    """Normalise one abstract into a space-separated string of stemmed tokens.

    Steps: lowercase the text, replace every character that is not ``a-z`` or
    whitespace with a space, split on whitespace, drop spaCy stop words,
    Porter-stem the remaining tokens and join them with single spaces.

    Args:
        text: Raw abstract. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell) is treated as an empty
            abstract instead of raising on ``.lower()``.

    Returns:
        The cleaned abstract, or ``''`` when no token survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    # Raw string so that ``\s`` reaches the regex engine rather than being an
    # invalid string escape sequence.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    stop_words = nlp.Defaults.stop_words
    # split() already consumes newlines and runs of blanks, so the joined
    # result needs no further whitespace clean-up.
    return ' '.join(
        ps.stem(token) for token in letters_only.split() if token not in stop_words
    )

In [None]:
# Clean every training abstract element-wise with clean_abstract.
X_train = X_train.apply(clean_abstract)

In [None]:
# Apply the same cleaning to the test abstracts.
X_test = X_test.apply(clean_abstract)

In [None]:
# Keep handles on the cleaned text before X_train / X_test are rebound to
# token-id sequences further down.
trainn, testt = X_train, X_test

In [None]:
from keras.preprocessing.text import Tokenizer

# Fit one vocabulary (capped via num_words=10000) on the cleaned train and
# test text together, then encode every abstract as a list of word ids.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts([*X_train, *X_test])
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
import numpy as np

# The token-id lists have different lengths, so the outer array has to be an
# object array. NumPy >= 1.24 raises ValueError when asked to infer a dtype
# for ragged nested sequences (older releases only emitted a deprecation
# warning), hence the explicit dtype=object.
X_train = np.array(X_train, dtype=object)
X_test = np.array(X_test, dtype=object)

In [None]:
# Length of the longest tokenised training abstract; used as the padded
# sequence length for both train and test.
maxlen = max(map(len, X_train))
maxlen

427

In [None]:
# Bring every sequence to exactly `maxlen` ids, relying on pad_sequences'
# default padding and truncation behaviour.
X_train, X_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test)
)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [None]:
def sampling_strategy(y, n_samples, t='majority'):
    """Build a ``{class_label: n_samples}`` dict for a resampler's ``sampling_strategy``.

    Args:
        y: Iterable of class labels.
        n_samples: Target row count assigned to every selected class.
        t: Which classes to select.
            ``'majority'`` picks classes with more than ``n_samples`` rows.
            ``'minority'`` picks classes with fewer than ``n_samples`` rows
            but more than 10 rows.

    Returns:
        dict mapping each selected class label to ``n_samples``; empty when no
        class qualifies.

    Raises:
        ValueError: If ``t`` is neither ``'majority'`` nor ``'minority'``.
    """
    class_counts = Counter(y)
    if t == 'majority':
        selected = [label for label, count in class_counts.items() if count > n_samples]
    elif t == 'minority':
        # SMOTE depends on nearest neighbours within a class, so classes with
        # 10 or fewer rows are deliberately left out.
        selected = [label for label, count in class_counts.items() if 10 < count < n_samples]
    else:
        raise ValueError(f"t must be 'majority' or 'minority', got {t!r}")
    return {label: n_samples for label in selected}

In [None]:
# Request that every class with more than 10 but fewer than 1000 rows be
# over-sampled to 1000 rows.
over_sampler_ss = sampling_strategy(y_train,1000,t='minority')

In [None]:
# SMOTE over-sampler restricted to the classes listed in over_sampler_ss.
# NOTE(review): X_train holds padded integer word ids at this point; SMOTE
# synthesises rows by interpolating between neighbours, so the generated ids
# presumably do not correspond to meaningful words — confirm this is intended.
over = SMOTE(sampling_strategy=over_sampler_ss)

In [None]:
# Resample the padded training matrix together with its labels.
X_over, y_over = over.fit_resample(X_train, y_train)

In [None]:
# One-hot encode the resampled labels: one column per distinct category_num.
# The dtype is pinned because get_dummies' default changed from uint8 to bool
# in pandas 2.0; uint8 keeps the label matrix numeric on every version.
y_train = pd.get_dummies(y_over, dtype='uint8').values

In [None]:
# (rows after over-sampling, number of one-hot label columns)
y_train.shape

(472774, 156)

In [None]:
# Hold out 20% of the resampled rows for validation. random_state pins the
# split so it no longer depends on how much of NumPy's global RNG earlier
# cells happened to consume.
# NOTE(review): the split is taken after SMOTE, so synthetic rows derived from
# training rows can end up in the CV set — validation scores are presumably
# optimistic; confirm whether over-sampling should move after the split.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_over, y_train, test_size=0.2, random_state=7
)
print("Shape of train data:", X_train.shape)
print("Shape of CV data:", X_cv.shape)

Shape of train data: (378219, 427)
Shape of CV data: (94555, 427)


In [None]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score computed with Keras backend ops.

    Targets, predictions and their element-wise product are each clipped to
    [0, 1], rounded and summed to obtain the possible-positive,
    predicted-positive and true-positive counts. Precision and recall are
    derived from those counts and combined into their harmonic mean.
    ``K.epsilon()`` is added to every denominator to avoid division by zero.

    Because Keras evaluates metrics per batch, the reported value is an
    average of per-batch scores rather than a score over the whole dataset.
    """
    eps = K.epsilon()

    def _positive_count(tensor):
        # Number of entries that round to 1 once clipped into [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    true_positives = _positive_count(y_true * y_pred)
    # Recall: share of the relevant items that were selected.
    recall = true_positives / (_positive_count(y_true) + eps)
    # Precision: share of the selected items that are relevant.
    precision = true_positives / (_positive_count(y_pred) + eps)
    return 2*((precision*recall)/(precision+recall+eps))

In [None]:
# create the model
embedding_vecor_length = 32
model = Sequential()
# 10000 must equal the Tokenizer's num_words so every word id has an embedding row.
model.add(Embedding(10000, embedding_vecor_length, input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(500))
# One output unit per one-hot label column. Every sample carries exactly one
# category, so softmax (a distribution over classes) is the activation that
# pairs with categorical_crossentropy; sigmoid scores each class independently.
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[f1])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
filepath="weights_best_cnn.hdf5"
# f1 is the only compiled metric, so its validation value is logged as
# 'val_f1'. The previous monitor, 'val_acc', is never logged with this
# compile() call, which makes save_best_only skip every save.
checkpoint = ModelCheckpoint(filepath, monitor='val_f1', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train, y_train, epochs=5, batch_size=64,verbose = 1,callbacks = callbacks_list,validation_data=(X_cv,y_cv))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 427, 32)           320000    
                                                                 
 conv1d (Conv1D)             (None, 427, 32)           3104      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 213, 32)          0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 500)               1066000   
                                                                 
 dense (Dense)               (None, 156)               78156     
                                                                 
Total params: 1,467,260
Trainable params: 1,467,260
Non-trainable params: 0
______________________________________________

KeyboardInterrupt: ignored

In [None]:
# Score every padded test abstract; one row of class scores per sample.
# NOTE(review): the training cell's saved output ends in KeyboardInterrupt and
# no checkpoint is reloaded here, so these scores presumably come from a
# partially trained model — confirm before using the submission.
prediction = model.predict(X_test)

In [None]:
# Inspect the raw score matrix (NumPy abbreviates the display).
prediction

array([[2.3000284e-06, 2.3889608e-05, 1.8244627e-01, ..., 3.3910893e-02,
        4.2504829e-01, 3.9570006e-03],
       [5.6671201e-05, 5.9682275e-03, 1.4692820e-03, ..., 8.8988086e-03,
        3.8513979e-01, 3.1248510e-02],
       [2.8948955e-06, 3.7149820e-02, 8.2325751e-01, ..., 1.0596532e-02,
        3.1924501e-02, 1.5263393e-03],
       ...,
       [2.2006946e-06, 1.6966664e-04, 9.7104441e-03, ..., 3.3365648e-02,
        2.0316435e-02, 1.5564464e-03],
       [1.0280676e-05, 6.9941903e-05, 3.2041743e-02, ..., 1.7834651e-01,
        9.6777278e-01, 2.6473846e-02],
       [1.4367230e-07, 4.9534824e-04, 2.2479617e-03, ..., 2.3761053e-01,
        9.1684419e-01, 3.4342799e-02]], dtype=float32)

In [None]:
# argmax yields a column position in the one-hot label matrix, not a
# category_num. The one-hot columns come from pd.get_dummies(y_over), i.e. one
# column per distinct label in ascending order — the same order np.unique
# returns — so indexing through it recovers the actual category ids. When the
# labels are exactly 0..n-1 this is the identity mapping.
y_classes = np.unique(y_over)[prediction.argmax(axis=-1)]

In [None]:
# Inspect the predicted class per test row.
y_classes

array([ 25,  54, 108, ..., 110,  25,  40])

In [None]:
# Submission file: one row per test id with its predicted category.
# (The former mid-cell output.head() was dropped: a bare expression that is
# not the last line of a cell is evaluated and discarded without display.)
output = pd.DataFrame({'id': test.id,
                       'category_num': y_classes})
output.to_csv('solution.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
output.head()

Unnamed: 0,id,category_num
0,430065,25
1,75226,54
2,301990,108
3,301001,118
4,280179,40
