### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding,Flatten,Dense,GlobalAveragePooling1D,GlobalMaxPooling1D,Bidirectional, LSTM,Conv1D
from keras.models import Sequential
import re
from sklearn.externals import joblib 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from keras.optimizers import RMSprop,sgd

Using TensorFlow backend.


In [2]:
# Load the labelled abstracts. The preview of this frame shows an 'Unnamed: 0'
# column, which suggests the CSV was written with its index — TODO confirm.
train = pd.read_csv('dataset.csv')

In [3]:
train.head()

Unnamed: 0,ABSTRACT,class
0,we develop the theory of three-dimensional slo...,Physics
1,direction of arrival (doa) approximation of ta...,Statistics
2,let $f$ and $g$ be $1$-bounded multiplicative ...,Mathematics
3,we consider the multidimentional brownian cont...,Mathematics
4,"inside this paper, the general binary-input bi...",Computer Science


### nltk

In [4]:
from nltk.corpus import stopwords  
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer   
import string

In [8]:
# Shared text-processing helpers used by the cleaning cells of both splits.
# Splits each abstract into tokens.
token = TweetTokenizer()
# Reduces every kept token to its stem.
stemmer = PorterStemmer()
# Tokens found in this collection are discarded during cleaning.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded first — confirm.
stopwords_english = stopwords.words('english')

In [173]:
# Positional hold-out split of the abstracts: the first 6400 rows are used
# for training, every remaining row for validation (no shuffling).
X_train = train['ABSTRACT'].iloc[:6400]
X_val = train['ABSTRACT'].iloc[6400:]

In [174]:
# Class labels cut at the same row position as the abstracts: the first 6400
# rows are training targets, the remaining rows validation targets.
Y_train = train['class'].iloc[:6400]
Y_val = train['class'].iloc[6400:]

### Cleaning the training data

In [154]:
def clean_abstracts(texts, tokenizer, stemmer, stopword_list, progress_every=100):
    """Turn raw abstracts into lists of stemmed, filtered tokens.

    Each text is lower-cased, stripped of URLs and '#' characters, tokenized,
    filtered (stopwords and punctuation tokens are dropped) and stemmed.
    The input is only read, never written to, so the DataFrame the texts
    come from keeps its original content.

    Parameters
    ----------
    texts : iterable of str
        Raw documents, e.g. a pandas Series of abstracts.
    tokenizer : object with a ``tokenize(str)`` method returning a list of str
    stemmer : object with a ``stem(str)`` method returning a str
    stopword_list : iterable of str
        Tokens to discard.
    progress_every : int, optional
        Print the document index every this many documents; 0 or None
        silences the progress output.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    # Set membership is O(1); scanning a list for every token is not.
    stopword_set = set(stopword_list)
    cleaned = []
    for i, text in enumerate(texts):
        text = text.lower()
        # Remove only the URL itself. A pattern anchored on end-of-string
        # followed by '-' can never match, and a greedy '.*' would swallow
        # the remainder of the abstract. (No retweet 'RT' stripping: it is
        # tweet-specific and cannot match lower-cased text anyway.)
        text = re.sub(r'https?://\S+', '', text)
        # Drop '#' so that "#word" is tokenized as the plain word.
        text = text.replace('#', '')
        cleaned.append([
            stemmer.stem(word)
            for word in tokenizer.tokenize(text)
            if word not in stopword_set and word not in string.punctuation
        ])
        if progress_every and i % progress_every == 0:
            print(i)
    return cleaned


X_clean = clean_abstracts(X_train, token, stemmer, stopwords_english)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300


In [155]:
len(X_clean)

6400

### Creating a Tokenizer

In [156]:
# Hyperparameters shared by the tokenizer, padding and model cells.
# Placeholder token handed to the Keras Tokenizer as its `oov_token`.
oov_tok = '<OOV>'
# Passed to pad_sequences as its `truncating` argument.
trunc_type = 'post'
# Output dimension of the model's Embedding layer.
embedding_dim = 32
# `maxlen` for pad_sequences and `input_length` of the Embedding layer.
max_length = 400

In [157]:
# Keras tokenizer configured with an explicit out-of-vocabulary placeholder;
# it is fitted on the cleaned training tokens and reused for the validation split.
token2 = Tokenizer(oov_token = oov_tok )

In [158]:
# Build the vocabulary from the cleaned training tokens only.
token2.fit_on_texts(X_clean)
# Number of entries in the fitted word index; Model_S sizes its Embedding
# layer as vocab_size+1.
vocab_size = len(token2.word_index)
vocab_size

30139

In [159]:
# Map every cleaned token list to a list of integer ids, then pad/truncate
# each one to max_length ids so the model receives fixed-width input.
sequence = token2.texts_to_sequences(X_clean)
padded = pad_sequences(sequence,maxlen=max_length,truncating = trunc_type)

### Keras Model

In [160]:
def Model_S(num_classes=4, hidden_units=32):
    """Build and compile the averaged-embedding text classifier.

    Architecture: Embedding -> GlobalAveragePooling1D -> Dense(relu) ->
    Dense(softmax). The embedding table size and the input length are read
    from the notebook-level ``vocab_size``, ``embedding_dim`` and
    ``max_length`` variables, which must be defined before this is called.

    Parameters
    ----------
    num_classes : int, default 4
        Width of the softmax output layer. Targets passed to ``fit`` must be
        one-hot encoded with this many columns, because the model is compiled
        with categorical cross-entropy.
    hidden_units : int, default 32
        Width of the hidden ReLU layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimizer, categorical
    cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        # One row more than the word index has entries
        # (presumably because id 0 is reserved for padding — confirm).
        Embedding(vocab_size + 1, embedding_dim, input_length=max_length),
        # Collapse the sequence axis by averaging the token embeddings.
        GlobalAveragePooling1D(),
        Dense(hidden_units, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

### Label Encoding the target

In [161]:
from sklearn import preprocessing
# Encoder used to turn the string class names into integer ids for training
# and to map predicted ids back to names via inverse_transform.
le = preprocessing.LabelEncoder()

In [162]:
# Learn the set of class names from the training targets only.
le.fit(Y_train)

LabelEncoder()

In [163]:
# Integer-encode the training targets with the fitted encoder.
Y = le.transform(Y_train)

In [164]:
Y

array([2, 3, 1, ..., 0, 1, 2])

In [165]:
list(le.classes_)

['Computer Science', 'Mathematics', 'Physics', 'Statistics']

In [166]:
# Import from the public `keras.utils` path; the private `keras.utils.np_utils`
# module is not available in every Keras release.
from keras.utils import to_categorical
# One-hot encode the integer labels, as required by the model's softmax output
# and categorical cross-entropy loss. The width follows the fitted encoder
# instead of a hard-coded class count.
Y_onehot = to_categorical(Y, num_classes=len(le.classes_))

### Training


In [167]:
# Build a freshly compiled classifier; later cells refer to it as `Modal`.
Modal = Model_S()

In [168]:
# Train for 20 epochs on the padded training sequences and one-hot targets.
# No validation data is passed here, so the held-out split is only scored
# after training, in the prediction cells further down.
Modal.fit(padded,Y_onehot,epochs=20)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x19e25fa1240>

### Pickling the Model

In [169]:
# joblib is a standalone package; `sklearn.externals.joblib` is only a
# deprecated alias that newer scikit-learn releases no longer ship.
try:
    import joblib
except ImportError:  # old environments that only have the bundled copy
    from sklearn.externals import joblib
# NOTE(review): this pickles the whole Keras model. Pickles are tied to the
# library versions that wrote them and must never be loaded from untrusted
# sources; Keras' own `Modal.save(...)` is the supported persistence format —
# consider switching once consumers of the .pkl file can be updated.
joblib.dump(Modal, 'text_classification.pkl')


['text_classification.pkl']

### Generating predictions for the validation set using the model

In [175]:
def clean_abstracts(texts, tokenizer, stemmer, stopword_list, progress_every=100):
    """Turn raw abstracts into lists of stemmed, filtered tokens.

    Each text is lower-cased, stripped of URLs and '#' characters, tokenized,
    filtered (stopwords and punctuation tokens are dropped) and stemmed.
    The input is only read, never written to, so the DataFrame the texts
    come from keeps its original content.

    Parameters
    ----------
    texts : iterable of str
        Raw documents, e.g. a pandas Series of abstracts.
    tokenizer : object with a ``tokenize(str)`` method returning a list of str
    stemmer : object with a ``stem(str)`` method returning a str
    stopword_list : iterable of str
        Tokens to discard.
    progress_every : int, optional
        Print the document index every this many documents; 0 or None
        silences the progress output.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    # Set membership is O(1); scanning a list for every token is not.
    stopword_set = set(stopword_list)
    cleaned = []
    for i, text in enumerate(texts):
        text = text.lower()
        # Remove only the URL itself. A pattern anchored on end-of-string
        # followed by '-' can never match, and a greedy '.*' would swallow
        # the remainder of the abstract. (No retweet 'RT' stripping: it is
        # tweet-specific and cannot match lower-cased text anyway.)
        text = re.sub(r'https?://\S+', '', text)
        # Drop '#' so that "#word" is tokenized as the plain word.
        text = text.replace('#', '')
        cleaned.append([
            stemmer.stem(word)
            for word in tokenizer.tokenize(text)
            if word not in stopword_set and word not in string.punctuation
        ])
        if progress_every and i % progress_every == 0:
            print(i)
    return cleaned


# The validation abstracts must go through exactly the same cleaning steps
# as the training abstracts before they are encoded.
X_test_clean = clean_abstracts(X_val, token, stemmer, stopwords_english)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [176]:
# Encode the validation tokens with the tokenizer that was fitted on the
# training split, then pad/truncate with the same maxlen and truncating settings
# used for the training sequences.
sequence_val = token2.texts_to_sequences(X_test_clean)
padded_val = pad_sequences(sequence_val,maxlen=max_length,truncating = trunc_type)

In [182]:
# Model outputs for the padded validation sequences; the following cell takes
# the arg-max of each row, so rows are presumably per-class scores — confirm.
Prediction_temp = Modal.predict(padded_val)
# Placeholder for the predicted class indices computed in the next cell.
Prediction = []

In [183]:
# Pick the highest-scoring class index for every row of the model output.
# Taking the arg-max over the whole array (instead of looping over a literal
# 1600 rows) keeps this correct for any validation-set size, and rebuilding
# the list from scratch makes the cell safe to re-run without duplicating
# entries.
Prediction = list(np.argmax(Prediction_temp, axis=1))

In [184]:
# Map the predicted class indices back to their string labels for scoring.
# NOTE(review): `Prediction` is rebound from indices to labels here, so
# re-running this cell on its own would feed labels back into inverse_transform.
Prediction = le.inverse_transform(Prediction)

### F1-Score

In [185]:
from sklearn.metrics import f1_score

In [186]:
# Micro-averaged F1 of the decoded predictions against the held-out labels.
f1_score(Y_val,Prediction,average='micro')

0.853125

#### We got a micro-averaged F1-score of 0.853125 (85.3125%) on our validation set
