In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
import gensim

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
from sklearn.metrics import classification_report

from utilities import remove_empty_tweets
from sklearn.metrics import classification_report, f1_score


# The tweet text was cleaned in an earlier step; these CSVs hold the cleaned
# train / test splits for subtask 1.
train_data_path = 'cleaned_data/cleaned_train_data_for_subtask1.csv'
test_data_path = 'cleaned_data/cleaned_test_data_for_subtask1.csv'

# Read both splits into DataFrames.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Report (rows, columns) and the row count of each split.
# The label string used to be followed by `% <frame>.columns`; the string has
# no conversion specifier, so that operand was silently ignored. It is dropped
# here and the printed text is unchanged.
print("Train set:", train_data.shape, len(train_data))
print("Test set:", test_data.shape, len(test_data))





Train set: (20974, 8) 20974
Test set: (4997, 8) 4997


In [2]:
# Drop rows whose cleaned tweet text is empty, in both splits.
# NOTE(review): assumes remove_empty_tweets returns the filtered frame (its
# return value is what the train line already relies on) - confirm in utilities.
train_data = remove_empty_tweets(train_data, "#2_tweet_clean_V1")
# Assign back to `test_data`: the following cells build X_test / y_test from
# `test_data`, so storing the result only in `test` left the test split unfiltered.
test_data = remove_empty_tweets(test_data, "#2_tweet_clean_V1")
test = test_data  # alias kept for any code that still refers to `test`

In [3]:
# Extract the cleaned tweet text (X) and the class ids (y) of each split as
# plain Python lists; the column order below is (text, label).
X_train, y_train = [train_data[col].tolist() for col in ('#2_tweet_clean_V1', '#classes_id')]
X_test, y_test = [test_data[col].tolist() for col in ('#2_tweet_clean_V1', '#classes_id')]


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenisation settings.
max_no = 80000   # passed to the Tokenizer as num_words (vocabulary cap)
sql_len = 500    # passed to pad_sequences as maxlen (fixed sequence length)
dim = 64         # not referenced by the cells shown here

# Fit the vocabulary on the training tweets only, keeping the original casing.
tokenizer = Tokenizer(num_words=max_no, lower=False)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each split as integer id sequences and pad them to `sql_len`.
X_tr = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=sql_len)
print('Shape of data tensor:', X_tr.shape)

X_te = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=sql_len)
print('Shape of data tensor:', X_te.shape)

Found 49126 unique tokens.
Shape of data tensor: (20974, 500)
Shape of data tensor: (4997, 500)


In [None]:
# Imports for this cell. Duplicated import lines were merged; every name the
# original cell bound is still bound.
import keras
import tensorflow
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import (Embedding, GlobalAveragePooling1D, SpatialDropout1D,
                          LSTM, Dense, Dropout, Bidirectional)
from keras.optimizers import adam, SGD
from keras.utils.np_utils import to_categorical

# Hyper-parameters.
vocab_size = 50000     # number of rows in the embedding table
embedding_dim = 128    # embedding width; also used as LSTM units and hidden Dense size
num_classes = 21       # width of the softmax output layer
epochs = 10
batch_size = 256

# The embedding can only look up ids in [0, vocab_size). vocab_size is set
# independently of the tokenizer's word cap, so check that the encoded data fits.
assert max(int(X_tr.max()), int(X_te.max())) < vocab_size, \
    "token ids exceed vocab_size; raise vocab_size or lower the tokenizer's num_words"

adam_opt = adam(lr=0.001, decay=1e-6)
# Alternative optimizer; it is constructed but not passed to compile() below.
sdg_opt = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)

# Embedding -> dropout -> bidirectional LSTM -> dense ReLU -> dropout -> softmax.
model = keras.Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_shape=X_tr.shape[1:]))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(embedding_dim, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=adam_opt, metrics=['accuracy'])

model.summary()

# One-hot encode the labels with an explicit width. Without num_classes the
# width is inferred from the largest label in each split, which can differ
# between splits and from the model's output layer.
Y_tr = to_categorical(y_train, num_classes=num_classes)
Y_te = to_categorical(y_test, num_classes=num_classes)

# Hold out the last 20% of the training data for validation and stop early
# once val_loss has not improved by at least min_delta for `patience` epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)
history = model.fit(
    X_tr,
    Y_tr,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
)


Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 128)          6400000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 21)                2709      
Total params: 6,698,773
Trainable params: 6,698,773
Non-trainable params: 0
____________________________________________

In [None]:
# Predicted class index per test tweet: argmax over the model's output scores.
y_pred = np.argmax(model.predict(X_te), axis=1)

# Score against the raw class ids. The previous get_dummies + argmax round trip
# returned each label's position among the labels present in the test split,
# which differs from the label itself whenever a class id is absent from the
# test data or the ids do not start at 0.
# NOTE(review): assumes '#classes_id' holds the same integer ids the model was
# trained on (the training labels come from this column) - confirm.
y_test = test_data['#classes_id']
Y_te = y_test.values
print(f1_score(Y_te, y_pred, average='macro'))