In [1]:
# !pip install fasttext
# Word Embedding using FastText
import fasttext.util
fasttext.util.download_model('ar', if_exists='ignore')  
ft = fasttext.load_model('cc.ar.300.bin')



In [2]:
print(ft.get_dimension())
# reduce the dimension of the word embedding from 300 to 100
fasttext.util.reduce_model(ft, 100)
print(ft.get_dimension())

300
100


In [3]:
print(ft.get_word_vector('الشوق شوق')) 
print(ft.get_word_vector('الشوق شوق').shape)

[-0.01018494 -0.08594008 -0.03731866 -0.00024127  0.04777642 -0.00496229
  0.06186691  0.02239722 -0.0280445   0.04849716 -0.00795461 -0.02634357
  0.01323329 -0.02175279  0.03655794 -0.01601862 -0.01084875  0.01393491
 -0.03315404  0.0335445   0.01913922 -0.02036594 -0.00192303 -0.00421568
  0.01183395  0.01175373 -0.07322915  0.08777917  0.00460946 -0.05397879
 -0.04420993 -0.01738087 -0.01531424 -0.02648391  0.05191071 -0.06218194
  0.03248445 -0.03084662  0.02021079 -0.03369771 -0.01912928  0.00776838
 -0.02871501  0.00045804  0.01323607  0.01341687  0.03769046 -0.00100606
  0.03073437 -0.02756213  0.02987276  0.00607357 -0.0034104   0.00598877
 -0.04017798 -0.02268266  0.04346691 -0.00163789  0.03622608  0.00370311
 -0.03838527  0.01520885  0.00647131 -0.02734641  0.01213152  0.03940092
  0.04791509 -0.01371275 -0.00985732 -0.00985206  0.04072648 -0.01279194
  0.02949059  0.0199767   0.01000008  0.0091176  -0.03566917  0.03153419
 -0.05023986 -0.02361483  0.04186894  0.01141178 -0

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam
from scipy.stats import skew
import matplotlib.pyplot as plt
import numpy as np

In [5]:
df = pd.read_csv('./Dataset/cleaned_train.csv')
print(df.shape)
df.head()

(6988, 3)


Unnamed: 0,text,category,stance
0,بيل غيتس تلقي لقاح كوفيد19 من غير تصوير ابر و ...,celebrity,1
1,وزير صح حد يوم تحديد هل بمؤتمروا صحفي كان ما ع...,info_news,1
2,قول رح يكونو اد مسؤولي ب لبنان ما وصل لقاح ؟ ا...,info_news,1
3,تركيا . . وزير صح فخر دين قوجة تلقي اول جرع من...,celebrity,1
4,وئام وهاب شتم دول خليجي في كل طل اعلامي تسافه ...,personal,0


In [6]:
MAX_TWEET_LENGTH = -1
for tweet in df['text']:
  tweet_arr = tweet.split(' ')
  MAX_TWEET_LENGTH = max(MAX_TWEET_LENGTH, len(tweet_arr))
print(MAX_TWEET_LENGTH)
print(pd.Series([len(x.split(' ')) for x in df['text']]).describe())

137
count    6988.000000
mean       25.166714
std        14.403391
min         2.000000
25%        14.000000
50%        20.000000
75%        36.000000
max       137.000000
dtype: float64


In [7]:
embedded_text = np.zeros((len(df['text']),MAX_TWEET_LENGTH*100))
print(embedded_text.shape)
for i,tweet in enumerate(df['text']):
  sentence_embedding = np.array([[]])
  for word in tweet.split(" "):
    sentence_embedding = np.append(sentence_embedding, ft.get_word_vector(word))
  sentence_embedding.resize(MAX_TWEET_LENGTH*100)
  embedded_text[i] = sentence_embedding

(6988, 13700)


In [24]:
x_train, x_test, y_train_stance, y_test_stance, y_train_cat, y_test_cat = train_test_split(embedded_text, df['stance'], df['category'], test_size=0.3, random_state=42,stratify=df['stance'])

print(x_train.shape)
print(x_test.shape)
print(y_train_stance.shape)
print(y_test_stance.shape)
print(y_train_cat.shape)
print(y_test_cat.shape)
print(y_train_stance.value_counts())
print(y_test_stance.value_counts())
print(y_train_cat.value_counts())
print(y_test_cat.value_counts())

(4891, 13700)
(2097, 13700)
(4891,)
(2097,)
(4891,)
(2097,)
 1    3876
 0     708
-1     307
Name: stance, dtype: int64
 1    1662
 0     304
-1     131
Name: stance, dtype: int64
info_news       2542
personal         704
celebrity        669
plan             434
unrelated        216
others           132
requests          82
rumors            55
advice            47
restrictions      10
Name: category, dtype: int64
info_news       1074
personal         321
celebrity        306
plan             172
unrelated        107
others            35
requests          30
rumors            24
advice            20
restrictions       8
Name: category, dtype: int64


In [9]:
# PCA 
# x_train = PCA(n_components=100).fit_transform(x_train)
# PCA_test = PCA(n_components=100).fit(x_test_stance)
# x_test_stance = PCA_test.transform(x_test_stance)

# SMOTE Oversampling

In [25]:
# Apply SMOTE oversampling to the training data
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
x_train_stance, y_train_stance = sm.fit_resample(x_train, y_train_stance)
x_train_cat, y_train_cat = sm.fit_resample(x_train, y_train_cat)
print(x_train.shape)
print(y_train_stance.value_counts())
print(y_train_cat.value_counts())

(4891, 13700)
 0    3876
 1    3876
-1    3876
Name: stance, dtype: int64
celebrity       2542
info_news       2542
plan            2542
requests        2542
unrelated       2542
rumors          2542
personal        2542
advice          2542
restrictions    2542
others          2542
Name: category, dtype: int64


In [26]:
y_train_stance = np.array(y_train_stance)
y_test_stance = np.array(y_test_stance)
y_train_cat = np.array(y_train_cat)
y_test_cat = np.array(y_test_cat)

# Random Forest

In [12]:
# Random Forest on stance
clf = RandomForestClassifier(n_estimators=100, random_state=2002)
clf.fit(x_train_stance, y_train_stance)
y_pred = clf.predict(x_test)
print(classification_report(y_test_stance, y_pred))

              precision    recall  f1-score   support

          -1       0.44      0.09      0.15       131
           0       0.60      0.12      0.20       304
           1       0.81      0.98      0.89      1662

    accuracy                           0.80      2097
   macro avg       0.62      0.40      0.41      2097
weighted avg       0.76      0.80      0.74      2097



In [13]:
# Random Forest on category
clf = RandomForestClassifier(n_estimators=100, random_state=2002)
clf.fit(x_train_cat, y_train_cat)
y_pred = clf.predict(x_test)
print(classification_report(y_test_cat, y_pred))

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00        20
   celebrity       0.76      0.54      0.63       306
   info_news       0.57      0.84      0.68      1074
      others       0.16      0.09      0.11        35
    personal       0.41      0.21      0.28       321
        plan       0.19      0.08      0.11       172
    requests       0.32      0.23      0.27        30
restrictions       0.00      0.00      0.00         8
      rumors       0.00      0.00      0.00        24
   unrelated       0.46      0.12      0.19       107

    accuracy                           0.56      2097
   macro avg       0.29      0.21      0.23      2097
weighted avg       0.51      0.56      0.51      2097



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SVM

In [14]:
# SVM on stance
clf = svm.SVC(kernel='linear', C=1.0, probability=True)
clf.fit(x_train_stance, y_train_stance)
y_pred = clf.predict(x_test)
print(classification_report(y_test_stance, y_pred))

              precision    recall  f1-score   support

          -1       0.21      0.29      0.24       131
           0       0.32      0.34      0.33       304
           1       0.86      0.82      0.84      1662

    accuracy                           0.72      2097
   macro avg       0.46      0.48      0.47      2097
weighted avg       0.74      0.72      0.73      2097



In [15]:
# SVM on category
clf = svm.SVC(kernel='linear', C=1.0, probability=True)
clf.fit(x_train_cat, y_train_cat)
y_pred = clf.predict(x_test)
print(classification_report(y_test_cat, y_pred))

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00        20
   celebrity       0.59      0.65      0.62       306
   info_news       0.64      0.67      0.65      1074
      others       0.07      0.09      0.07        35
    personal       0.46      0.41      0.43       321
        plan       0.20      0.21      0.21       172
    requests       0.17      0.23      0.19        30
restrictions       0.00      0.00      0.00         8
      rumors       0.08      0.04      0.05        24
   unrelated       0.27      0.17      0.21       107

    accuracy                           0.53      2097
   macro avg       0.25      0.25      0.24      2097
weighted avg       0.52      0.53      0.52      2097



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
x_train_stance = x_train_stance.reshape(x_train_stance.shape[0],MAX_TWEET_LENGTH,100)
x_train_cat = x_train_cat.reshape(x_train_cat.shape[0],MAX_TWEET_LENGTH,100)
x_test = x_test.reshape(x_test.shape[0],MAX_TWEET_LENGTH,100)
print(x_train_stance.shape)
print(x_train_cat.shape)
print(x_test.shape)

(4891, 137, 100)
(2097, 137, 100)


In [31]:
y_train_stance = y_train_stance + 1
# squeeze the last dimension of y_train
y_train_stance = np.squeeze(y_train_stance)
print(y_train_stance[0])
print(y_train_stance.shape)
y_train_stance = to_categorical(y_train_stance, 3)
print(y_train_stance.shape)

2
(11628,)
(11628, 3)


In [32]:
def map_category_to_int(category):
    y_cat = np.zeros(category.shape)
    # advice = 0, celebrity = 1, info_news = 2, others = 3, personal = 4, plan = 5, requests = 6, restrictions = 7, rumors = 8, unrelated = 9
    for i,cat in enumerate(category):
        if cat == 'advice':
            y_cat[i] = 0
        elif cat == 'celebrity':
            y_cat[i] = 1
        elif cat == 'info_news':
            y_cat[i] = 2
        elif cat == 'others':
            y_cat[i] = 3
        elif cat == 'personal':
            y_cat[i] = 4
        elif cat == 'plan':
            y_cat[i] = 5
        elif cat == 'requests':
            y_cat[i] = 6
        elif cat == 'restrictions':
            y_cat[i] = 7
        elif cat == 'rumors':
            y_cat[i] = 8
        elif cat == 'unrelated':
            y_cat[i] = 9
    return y_cat


print(y_train_cat[0])
y_train_cat_int = map_category_to_int(y_train_cat)
print(y_train_cat_int[0])
y_train_cat_int = to_categorical(y_train_cat_int, 10)
print(y_train_cat_int.shape)

info_news
2.0
(25420, 10)


In [33]:
def map_int_to_category(y_cat):
    category = []
    for i,cat in enumerate(y_cat):
        if cat == 0:
            category.append('advice')
        elif cat == 1:
            category.append('celebrity')
        elif cat == 2:
            category.append('info_news')
        elif cat == 3:
            category.append('others')
        elif cat == 4:
            category.append('personal')
        elif cat == 5:
            category.append('plan')
        elif cat == 6:
            category.append('requests')
        elif cat == 7:
            category.append('restrictions')
        elif cat == 8:
            category.append('rumors')
        elif cat == 9:
            category.append('unrelated')
    return category

In [49]:
print(x_train_stance.shape)
print(y_train_stance.shape)
print(x_train_cat.shape)
print(y_train_cat.shape)
print(y_train_cat_int.shape)
print(x_test.shape)
print(y_test_stance.shape)
print(y_test_cat.shape)

(11628, 137, 100)
(11628, 3)
(25420, 137, 100)
(25420,)
(25420, 10)
(2097, 137, 100)
(2097,)
(2097,)


# RNN

In [34]:
# RNN model for stance
model1_stance = Sequential()
# model.add(Input(shape=(137,100)))
model1_stance.add(SimpleRNN(units = 100,input_shape=(MAX_TWEET_LENGTH,100),return_sequences=True))
model1_stance.add(Dropout(0.2))
model1_stance.add(SimpleRNN(units = 32))
model1_stance.add(Dropout(0.2))
model1_stance.add(Dense(3, activation='softmax'))
print(model1_stance.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_2 (SimpleRNN)    (None, 137, 100)          20100     
                                                                 
 dropout_2 (Dropout)         (None, 137, 100)          0         
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 32)                4256      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 99        
                                                                 
Total params: 24,455
Trainable params: 24,455
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
opt = Adam(learning_rate=0.0001)
model1_stance.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=opt, metrics=['accuracy', 'Precision', 'Recall'])
model1_stance.fit(x_train_stance, y_train_stance, epochs=12, batch_size=32, verbose=1)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x201ed50a1f0>

In [1]:
y_pred = model1_stance.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred = y_pred - 1
print(classification_report(y_test_stance, y_pred))

NameError: name 'model1_stance' is not defined

In [37]:
# RNN model for category
model1_cat = Sequential()
# model.add(Input(shape=(137,100)))
model1_cat.add(SimpleRNN(units = 100,input_shape=(MAX_TWEET_LENGTH,100),return_sequences=True))
model1_cat.add(Dropout(0.2))
model1_cat.add(SimpleRNN(units = 32))
model1_cat.add(Dropout(0.2))
model1_cat.add(Dense(10, activation='softmax'))
print(model1_cat.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_4 (SimpleRNN)    (None, 137, 100)          20100     
                                                                 
 dropout_4 (Dropout)         (None, 137, 100)          0         
                                                                 
 simple_rnn_5 (SimpleRNN)    (None, 32)                4256      
                                                                 
 dropout_5 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 10)                330       
                                                                 
Total params: 24,686
Trainable params: 24,686
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
opt = Adam(learning_rate=0.0001)
model1_cat.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=opt, metrics=['accuracy', 'Precision', 'Recall'])
model1_cat.fit(x_train_cat, y_train_cat_int, epochs=12, batch_size=32, verbose=1)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x201f47e3310>

In [39]:
y_pred = model1_cat.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred = map_int_to_category(y_pred)
print(classification_report(y_test_cat, y_pred))

              precision    recall  f1-score   support

      advice       0.04      0.10      0.06        20
   celebrity       0.64      0.52      0.57       306
   info_news       0.63      0.38      0.48      1074
      others       0.06      0.23      0.10        35
    personal       0.28      0.24      0.26       321
        plan       0.17      0.34      0.22       172
    requests       0.07      0.23      0.11        30
restrictions       0.00      0.00      0.00         8
      rumors       0.02      0.08      0.04        24
   unrelated       0.12      0.21      0.15       107

    accuracy                           0.36      2097
   macro avg       0.20      0.23      0.20      2097
weighted avg       0.48      0.36      0.40      2097



# LSTM

In [52]:
model2_stance = Sequential()
# model.add(Input(shape=(137,100)))
model2_stance.add(LSTM(units = 100,input_shape=(137,100),return_sequences=True))
model2_stance.add(Dropout(0.2))
model2_stance.add(LSTM(units = 32))
model2_stance.add(Dropout(0.2))
model2_stance.add(Dense(3, activation='softmax'))
print(model2_stance.summary())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 137, 100)          80400     
                                                                 
 dropout_14 (Dropout)        (None, 137, 100)          0         
                                                                 
 lstm_9 (LSTM)               (None, 32)                17024     
                                                                 
 dropout_15 (Dropout)        (None, 32)                0         
                                                                 
 dense_7 (Dense)             (None, 3)                 99        
                                                                 
Total params: 97,523
Trainable params: 97,523
Non-trainable params: 0
_________________________________________________________________
None


In [53]:
opt = Adam(learning_rate=0.001)
model2_stance.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=opt, metrics=['accuracy', 'Precision', 'Recall'])
model2_stance.fit(x_train_stance, y_train_stance, epochs=12, batch_size=32, verbose=2)

Epoch 1/12
364/364 - 40s - loss: 1.0991 - accuracy: 0.3431 - precision: 0.0000e+00 - recall: 0.0000e+00 - 40s/epoch - 111ms/step
Epoch 2/12
364/364 - 39s - loss: 1.0993 - accuracy: 0.3295 - precision: 0.0000e+00 - recall: 0.0000e+00 - 39s/epoch - 107ms/step
Epoch 3/12
364/364 - 36s - loss: 1.0992 - accuracy: 0.3216 - precision: 0.0000e+00 - recall: 0.0000e+00 - 36s/epoch - 99ms/step
Epoch 4/12
364/364 - 36s - loss: 1.0990 - accuracy: 0.3311 - precision: 0.0000e+00 - recall: 0.0000e+00 - 36s/epoch - 99ms/step
Epoch 5/12
364/364 - 38s - loss: 1.0990 - accuracy: 0.3303 - precision: 0.0000e+00 - recall: 0.0000e+00 - 38s/epoch - 105ms/step
Epoch 6/12
364/364 - 37s - loss: 1.0987 - accuracy: 0.3399 - precision: 0.0000e+00 - recall: 0.0000e+00 - 37s/epoch - 103ms/step
Epoch 7/12
364/364 - 36s - loss: 1.0991 - accuracy: 0.3311 - precision: 1.0000 - recall: 8.5999e-05 - 36s/epoch - 100ms/step
Epoch 8/12
364/364 - 38s - loss: 1.0987 - accuracy: 0.3339 - precision: 1.0000 - recall: 8.5999e-05 - 3

<keras.callbacks.History at 0x201fc963c10>

In [42]:
y_pred = model2_stance.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred = y_pred - 1
print(classification_report(y_test_stance, y_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       131
           0       0.14      1.00      0.25       304
           1       0.50      0.00      0.00      1662

    accuracy                           0.14      2097
   macro avg       0.21      0.33      0.08      2097
weighted avg       0.42      0.14      0.04      2097



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
model2_cat = Sequential()
# model.add(Input(shape=(137,100)))
model2_cat.add(LSTM(units = 100,input_shape=(137,100),return_sequences=True))
model2_cat.add(Dropout(0.2))
model2_cat.add(LSTM(units = 32))
model2_cat.add(Dropout(0.2))
model2_cat.add(Dense(10, activation='softmax'))
print(model2_cat.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 137, 100)          80400     
                                                                 
 dropout_10 (Dropout)        (None, 137, 100)          0         
                                                                 
 lstm_5 (LSTM)               (None, 32)                17024     
                                                                 
 dropout_11 (Dropout)        (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 10)                330       
                                                                 
Total params: 97,754
Trainable params: 97,754
Non-trainable params: 0
_________________________________________________________________
None


In [46]:
opt = Adam(learning_rate=0.0001)
model2_cat.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=opt, metrics=['accuracy', 'Precision', 'Recall'])
model2_cat.fit(x_train_cat, y_train_cat_int, epochs=12, batch_size=32, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x201f4638ca0>

In [47]:
y_pred = model2_cat.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
y_pred = map_int_to_category(y_pred)
print(classification_report(y_test_cat, y_pred))

              precision    recall  f1-score   support

      advice       0.01      0.15      0.02        20
   celebrity       0.51      0.63      0.56       306
   info_news       0.00      0.00      0.00      1074
      others       0.04      0.09      0.05        35
    personal       0.00      0.00      0.00       321
        plan       0.14      0.65      0.23       172
    requests       0.05      0.43      0.08        30
restrictions       0.00      0.00      0.00         8
      rumors       0.03      0.21      0.05        24
   unrelated       0.26      0.18      0.21       107

    accuracy                           0.17      2097
   macro avg       0.10      0.23      0.12      2097
weighted avg       0.10      0.17      0.11      2097



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
