In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
import gensim

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, classification_report, f1_score



#text is already cleaned.
#assign cleaned data to these variables.
train_data_path = 'cleaned_data/cleaned_train_data_for_subtask1.csv'
test_data_path = 'cleaned_data/cleaned_test_data_for_subtask1.csv'
#read files.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

print("Train set:"% train_data.columns, train_data.shape, len(train_data)) 
print("Test set:"% test_data.columns, test_data.shape, len(test_data)) 





Train set: (20974, 8) 20974
Test set: (4997, 8) 4997


In [10]:
#prepare train and test data.
X_train = train_data['#2_tweet_clean_V0']
y_train = train_data['#classes_id']
X_test = test_data['#2_tweet_clean_V0']
y_test = test_data['#classes_id']

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

max_no = 50000
sql_len = 250
dim = 100

tokenizer = Tokenizer(num_words=max_no, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=False)
tokenizer.fit_on_texts(X_train.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

tokenizer = Tokenizer(num_words=max_no, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=False)
tokenizer.fit_on_texts(X_test.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 56910 unique tokens.
Found 19441 unique tokens.


In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


X_tr = tokenizer.texts_to_sequences(X_train.values)
X_tr = pad_sequences(X_tr, maxlen=sql_len)
print('Shape of data tensor:', X_tr.shape)

X_te = tokenizer.texts_to_sequences(X_test.values)
X_te = pad_sequences(X_te, maxlen=sql_len)
print('Shape of data tensor:', X_te.shape)


Shape of data tensor: (20974, 250)
Shape of data tensor: (4997, 250)


In [13]:
Y_tr = pd.get_dummies(y_train).values
print('Shape of label tensor:', Y_tr.shape)

Y_te = pd.get_dummies(y_test).values
print('Shape of label tensor:', Y_te.shape)

Shape of label tensor: (20974, 21)
Shape of label tensor: (4997, 21)


In [14]:
import keras
import tensorflow
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
#import tensorsflow_addons as tfa
model = keras.Sequential()
model.add(Embedding(max_no, dim, input_shape=(sql_len,)))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(21, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 20
batch_size = 64

history = model.fit(X_tr, Y_tr, epochs=epochs, batch_size=batch_size,validation_split=0.1)


Train on 18876 samples, validate on 2098 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [32]:
y_pred= model.predict(X_te)
y_pred = np.argmax(y_pred, axis=1)
y_test = test_data['#classes_id']
Y_te = pd.get_dummies(y_test).values

Y_te = np.argmax(Y_te, axis=1)
print(f1_score(Y_te, y_pred, average='macro'))


0.16382330864112168


In [33]:
target_names = list(train_data["#3_country_label"].unique())
print(classification_report(Y_te, y_pred,target_names=target_names))

                      precision    recall  f1-score   support

               Egypt       0.55      0.59      0.57      1041
                Iraq       0.41      0.41      0.41       663
        Saudi_Arabia       0.23      0.32      0.27       519
          Mauritania       0.36      0.28      0.32        53
             Algeria       0.39      0.37      0.38       430
               Syria       0.11      0.13      0.12       278
                Oman       0.20      0.15      0.17       355
             Tunisia       0.19      0.16      0.17       172
             Lebanon       0.09      0.08      0.08       157
             Morocco       0.14      0.17      0.16       207
            Djibouti       0.00      0.00      0.00        27
United_Arab_Emirates       0.10      0.12      0.11       157
              Kuwait       0.06      0.05      0.05       105
               Libya       0.32      0.23      0.26       314
             Bahrain       0.00      0.00      0.00        52
       