In [15]:
import pandas as pd
from scipy.io import loadmat
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from utilities import remove_empty_tweets


In [16]:
import torch

In [17]:
train_data_path = 'cleaned_data/cleaned_train_data_for_subtask1.csv'
test_data_path = 'cleaned_data/cleaned_test_data_for_subtask1.csv'
#read files.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

print("Train set:"% train_data.columns, train_data.shape, len(train_data)) 
print("Test set:"% test_data.columns, test_data.shape, len(test_data)) 



Train set: (20974, 8) 20974
Test set: (4997, 8) 4997


In [18]:
train_data = remove_empty_tweets(train_data, "#2_tweet_clean_V1")
test = remove_empty_tweets(test_data, "#2_tweet_clean_V1")

train_data.head()

Unnamed: 0,#1_tweetid,#2_tweet,#3_country_label,#2_tweet_clean_V0,#2_tweet_clean_V1,#2_tweet_clean_V2,#2_tweet_clean_V3,#classes_id
0,TRAIN_0,حاجة حلوة اكيد,Egypt,حاجة حلوة اكيد,حاجه حلوه اكيد,حاجه حلوه اكيد,حاجه حلوه اكيد,0
1,TRAIN_1,عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...,Iraq,عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...,عم بشتغلوا لشعب الاميركي اما نحن يكذبوا ويغشوا...,عم بشتغلوا لشعب الاميركي يكذبوا ويغشوا ويسرقوا...,عم بشتغلوا لشعب الاميركي يكذبوا ويغشوا ويسرقوا...,1
2,TRAIN_2,ابشر طال عمرك,Saudi_Arabia,ابشر طال عمرك,ابشر طال عمرك,ابشر طال عمرك,ابشر طال عمرك,2
3,TRAIN_3,منطق 2017: أنا والغريب علي إبن عمي وأنا والغري...,Mauritania,منطق أنا والغريب علي إبن عمي وأنا والغريب وإب...,منطق انا والغريب علي ابن عمي وانا والغريب وابن...,منطق والغريب ابن عمي وانا والغريب وابن عمي اخو...,منطق والغريب ابن عمي وانا وابن اخويا قطع العلا...,3
4,TRAIN_4,شهرين وتروح والباقي غير صيف ملينا,Algeria,شهرين وتروح والباقي غير صيف ملينا,شهرين وتروح والباقي غير صيف ملينا,شهرين وتروح والباقي صيف ملينا,شهرين وتروح والباقي صيف ملينا,4


In [19]:
#prepare train and test data.
X_train = train_data['#2_tweet_clean_V1'].tolist()
y_train = train_data['#classes_id'].tolist()
X_test = test_data['#2_tweet_clean_V1'].tolist()
y_test = test_data['#classes_id'].tolist()


In [20]:
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [21]:
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(X_train,y_train,test_size=0.1, random_state=42)

print(len(x_train),len(x_valid))

18876 2098


In [22]:
import random
import copy
import time
import pandas as pd
import numpy as np
import gc
import re
# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from torch.optim.optimizer import Optimizer

from sklearn.preprocessing import StandardScaler
from multiprocessing import  Pool
from functools import partial
import numpy as np
from sklearn.decomposition import PCA
import torch as t
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt


In [23]:
embed_size = 300 # how big is each word vector
max_features = 120000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 750 # max number of words in a question to use
batch_size = 512 # how many samples to process at once
n_epochs = 5 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits
SEED = 10
debug = 0

In [24]:
class CNN_Text(nn.Module):
    
    def __init__(self):
        super(CNN_Text, self).__init__()
        filter_sizes = [1,2,3,5]
        num_filters = 36
        n_classes = len(le.classes_)
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight.requires_grad = False
        self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(len(filter_sizes)*num_filters, n_classes)


    def forward(self, x):
        x = self.embedding(x)  
        x = x.unsqueeze(1)  
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
        x = torch.cat(x, 1)
        x = self.dropout(x)  
        logit = self.fc1(x) 
        return logit

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(train_sequences, maxlen=maxlen)

validation_sequences = tokenizer.texts_to_sequences(x_valid)
validation_padded = pad_sequences(validation_sequences)



In [27]:
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_valid_enc = le.transform(y_valid)


In [28]:
le.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])

In [30]:
import torch.utils.data
n_epochs = 6
model = CNN_Text()
loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)

# Load train and test in CUDA Memory
x_train = torch.tensor(train_padded, dtype=torch.long)
y_train = torch.tensor(y_train_enc, dtype=torch.long)
x_cv = torch.tensor(validation_padded, dtype=torch.long)
y_cv = torch.tensor(y_valid_enc, dtype=torch.long)

# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

train_loss = []
valid_loss = []

for epoch in range(n_epochs):
    start_time = time.time()
    # Set model to train configuration
    model.train()
    avg_loss = 0.  
    for i, (x_batch, y_batch) in enumerate(train_loader):
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    
    # Set model to validation configuration -Doesn't get trained here
    model.eval()        
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_cv),len(le.classes_)))
    
    for i, (x_batch, y_batch) in enumerate(valid_loader):
        y_pred = model(x_batch).detach()
        avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        # keep/store predictions
        val_preds[i * batch_size:(i+1) * batch_size] =F.softmax(y_pred).cpu().numpy()
    
    # Check Accuracy
    val_accuracy = sum(val_preds.argmax(axis=1)==y_valid_enc)/len(y_valid_enc)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time 
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))



Epoch 1/6 	 loss=1350.6006 	 val_loss=1071.2194  	 val_acc=0.2421  	 time=229.19s
Epoch 2/6 	 loss=1236.6215 	 val_loss=1049.5367  	 val_acc=0.2641  	 time=192.59s
Epoch 3/6 	 loss=1161.3348 	 val_loss=1031.4064  	 val_acc=0.2912  	 time=199.62s
Epoch 4/6 	 loss=1084.4691 	 val_loss=1024.5702  	 val_acc=0.2989  	 time=208.92s
Epoch 5/6 	 loss=1008.8950 	 val_loss=1021.4107  	 val_acc=0.2984  	 time=206.87s
Epoch 6/6 	 loss=937.3877 	 val_loss=1022.5667  	 val_acc=0.3031  	 time=202.99s
