In [22]:
import pandas as pd
import numpy as np
import nltk
from tashaphyne.stemming import ArabicLightStemmer


In [23]:
import utils 

In [24]:
dev_text,dev_category,dev_stance = utils.read_dataset("Dataset/dev.csv")
train_text,train_category,train_stance = utils.read_dataset("Dataset/train.csv")
test_ids,test_text=utils.read_testset("Dataset/test.csv")

In [25]:
train_dev_text =  pd.concat([train_text,dev_text])
train_dev_stance =  pd.concat([train_text,dev_text])
train_dev_category =  pd.concat([train_text,dev_text])

# Choose preprocessing methods
* change any method if needed 

In [26]:
ArListem = ArabicLightStemmer()

cleaner = utils.combine_pipe([utils.remove_urls,utils.remove_lfs,utils.remove_under_scores,utils.remove_user_tag])  
normalizer = ArListem.normalize
tokenizer = ArListem.tokenize

def stemmer(tokens):
    stems=[None]*len(tokens)
    for i,token in enumerate(tokens):
        stems[i]=ArListem.light_stem(token)
    return stems

# def stemmer(tokens):
#     stems=[None]*len(tokens)
#     for i,token in enumerate(tokens):
#         stems[i]=nltk.stem.ARLSTem2().stem(token)
#     return stems

# preprocess = utils.combine_pipe([cleaner,normalizer,tokenizer,stemmer])
preprocess = utils.combine_pipe([cleaner,normalizer,tokenizer])




# Preprocessing

In [27]:
train_preprocessed_text = train_text.apply(preprocess)
dev_preprocessed_text = dev_text.apply(preprocess)


# LSTM

In [28]:
from word_embeddings import embeddings
from lstm import LSTM
from sklearn.metrics import classification_report

train_stance =  train_stance + 1
dev_stance =  dev_stance + 1

lstm = LSTM(
    embeddings = embeddings,
    train_x = train_preprocessed_text.copy(),
    dev_x = dev_preprocessed_text.copy(),
    train_label = train_stance.copy(),
    dev_label = dev_stance.copy(),
    output_size=3,
    learning_rate=0.0001,
    label=1,
    path="model/",
    epochs=1   
)
predictions = lstm.get_predictions()

train_stance = train_stance - 1
dev_stance = dev_stance - 1
print(classification_report(dev_stance, predictions ))





INFO:tensorflow:Assets written to: model/assets


INFO:tensorflow:Assets written to: model/assets


              precision    recall  f1-score   support

          -1       0.15      0.56      0.24        70
           0       0.17      0.08      0.11       126
           1       0.92      0.79      0.85       804

    accuracy                           0.68      1000
   macro avg       0.42      0.47      0.40      1000
weighted avg       0.77      0.68      0.71      1000



In [29]:
map = {
    "advice": 0,
    "celebrity": 1,
    "info_news": 2,
    "others": 3,
    "personal": 4,
    "plan": 5,
    "requests": 6,
    "restrictions": 7,
    "rumors": 8,
    "unrelated": 9
}
train_category_num = train_category.apply(lambda x: map[x])
dev_category_num = dev_category.apply(lambda x: map[x])

In [30]:
lstm = LSTM(
    embeddings = embeddings,
    train_x = train_preprocessed_text.copy(),
    dev_x = dev_preprocessed_text.copy(),
    train_label = train_category_num.copy(),
    dev_label = dev_category_num.copy(),
    output_size=10,
    learning_rate=0.0001,
    label=0,
    path="model2/",
    epochs=1   
)
predictions = lstm.get_predictions()
# print(classification_report(dev_category_num, predictions ))





INFO:tensorflow:Assets written to: model2/assets


INFO:tensorflow:Assets written to: model2/assets




In [31]:
reverse_map = {
    0: "advice",
    1: "celebrity",
    2: "info_news",
    3: "others",
    4: "personal",
    5: "plan",
    6: "requests",
    7: "restrictions",
    8: "rumors",
    9: "unrelated"
}

predictions_list = predictions.tolist()
predictions_list = [reverse_map[x] for x in predictions_list]
# dev_category_list = dev_category.tolist()
# dev_category_list = [reverse_map[x] for x in dev_category_list]

print(classification_report(dev_category, predictions_list ))

              precision    recall  f1-score   support

      advice       0.09      0.50      0.16        10
   celebrity       0.84      0.76      0.80       145
   info_news       0.00      0.00      0.00       545
      others       0.00      0.00      0.00        17
    personal       0.00      0.00      0.00       128
        plan       0.00      0.00      0.00        82
    requests       0.03      0.70      0.07        20
restrictions       0.50      0.50      0.50         2
      rumors       0.04      0.67      0.07        15
   unrelated       0.04      0.14      0.06        36

    accuracy                           0.14      1000
   macro avg       0.15      0.33      0.16      1000
weighted avg       0.13      0.14      0.12      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Build Vocab
* unique words in training set
* remove stopwords as they are not helpful in classification

In [33]:
""" Remove Arabic Stop Words from vocab """
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))

vocab = utils.build_vocab(train_preprocessed_text)
vocab = utils.remove_words_in(vocab,arb_stopwords)

len(vocab)

27529

# Choose Feature extractor 
* we will have more than one feature extractor 
* choose one of them here
* feature extractor takes a preprocessed tweets and return their features

In [34]:
# import freq_feature_extractor 
# feature_extractor = freq_feature_extractor.build_feature_extractor(train_preprocessed_text,train_stance)

# import bog_feature_extractor
# feature_extractor = bog_feature_extractor.build_feature_extractor(train_preprocessed_text)


import tfidf_feature_extractor
feature_extractor = tfidf_feature_extractor.build_feature_extractor(train_preprocessed_text)

# Extract Features

In [35]:


train_X = feature_extractor(train_preprocessed_text)

dev_X = feature_extractor(dev_preprocessed_text)

test_X=feature_extractor(test_text)

# Choose Stance Model 

# SMOTE


In [36]:
from imblearn.over_sampling import SMOTE
train_stance_X   = train_X
train_category_X = train_X

train_stance_Y = train_stance
train_category_Y = train_category

dev_stance_Y = dev_stance
dev_category_Y = dev_category


# smote = SMOTE(random_state=42,)
# train_stance_X , train_stance_Y = smote.fit_resample(train_X, train_stance_Y)


# smote = SMOTE(random_state=42)
# train_category_X , train_category_Y = smote.fit_resample(train_X, train_category_Y)


In [37]:
# from sklearn.preprocessing import StandardScaler,MaxAbsScaler
# from sklearn.pipeline import make_pipeline


# scaler  = make_pipeline(StandardScaler(with_mean=False),MaxAbsScaler())
# train_X = scaler.fit_transform(train_stance_X)
# dev_X   = scaler.transform(dev_X)


from sklearn.naive_bayes import MultinomialNB 
stance_model = MultinomialNB(alpha=0.31)
stance_model = stance_model.fit(train_stance_X, train_stance_Y)


# from sklearn.naive_bayes import GaussianNB 
# stance_model = GaussianNB()
# stance_model = stance_model.fit(train_stance_X.toarray(), train_stance_Y)

# from sklearn.svm import LinearSVC
# stance_model = LinearSVC()
# stance_model = stance_model.fit(train_stance_X, train_stance_Y)



#Choose Category Model

In [38]:
# from sklearn.preprocessing import StandardScaler,MaxAbsScaler
# from sklearn.pipeline import make_pipeline


# scaler  = make_pipeline(StandardScaler(with_mean=False),MaxAbsScaler())

# train_X = scaler.fit_transform(train_X)
# dev_X   = scaler.transform(dev_X)


from sklearn.naive_bayes import MultinomialNB 
category_model = MultinomialNB(alpha=0.31)
category_model = category_model.fit(train_category_X, train_category_Y)


# from sklearn.naive_bayes import GaussianNB 
# category_model = GaussianNB()
# category_model = category_model.fit(train_category_X.toarray(), train_category_Y)

# from sklearn.svm import LinearSVC
# category_model = LinearSVC()
# category_model = category_model.fit(train_category_X, train_category_Y)



# Optimizing Alpha for Multinomial Naive Bayes

In [39]:
from sklearn.metrics import f1_score,classification_report


def get_best_NB(train_X,train_Y,dev_X,dev_Y):
    best_alpha = 0
    max_f1 = 0
    best_model = None

    alpha=.001
    while alpha<1:
        model = MultinomialNB(alpha=alpha)
        model = model.fit(train_X, train_Y)
        predicted_y = model.predict(dev_X)
        
        f1 = f1_score(dev_Y,predicted_y,average='macro')
        
        if f1 > max_f1:
            max_f1     = f1
            best_alpha = alpha
            best_model = model
        alpha+=.001
    print(f"max f1= {max_f1} @ alpha= {best_alpha}")
    print(classification_report(best_model.predict(dev_X),dev_Y))

    return best_model

get_best_NB(train_stance_X,train_stance_Y,dev_X,dev_stance_Y)
get_best_NB(train_category_X,train_category_Y,dev_X,dev_category_Y)

max f1= 0.5642371426685152 @ alpha= 0.016000000000000007
              precision    recall  f1-score   support

          -1       0.33      0.61      0.43        38
           0       0.30      0.46      0.36        83
           1       0.95      0.86      0.90       879

    accuracy                           0.82      1000
   macro avg       0.53      0.64      0.56      1000
weighted avg       0.87      0.82      0.84      1000

max f1= 0.32194378665710416 @ alpha= 0.005
              precision    recall  f1-score   support

      advice       0.20      0.67      0.31         3
   celebrity       0.79      0.87      0.83       132
   info_news       0.82      0.71      0.76       632
      others       0.06      0.12      0.08         8
    personal       0.57      0.55      0.56       132
        plan       0.09      0.15      0.11        46
    requests       0.10      0.12      0.11        16
restrictions       0.00      0.00      0.00         0
      rumors       0.07      0.1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test Stance Model

In [40]:

from sklearn.metrics import classification_report,f1_score
print(classification_report(dev_stance_Y,stance_model.predict(dev_X.toarray())))

              precision    recall  f1-score   support

          -1       0.40      0.03      0.05        70
           0       0.65      0.13      0.22       126
           1       0.82      0.99      0.90       804

    accuracy                           0.81      1000
   macro avg       0.62      0.38      0.39      1000
weighted avg       0.77      0.81      0.75      1000



#Test category Model


In [41]:

from sklearn.metrics import classification_report
print(classification_report(dev_category_Y,category_model.predict(dev_X.toarray())))

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00        10
   celebrity       0.90      0.77      0.83       145
   info_news       0.66      0.96      0.78       545
      others       0.00      0.00      0.00        17
    personal       0.67      0.37      0.47       128
        plan       0.00      0.00      0.00        82
    requests       0.00      0.00      0.00        20
restrictions       0.00      0.00      0.00         2
      rumors       0.00      0.00      0.00        15
   unrelated       0.73      0.22      0.34        36

    accuracy                           0.69      1000
   macro avg       0.30      0.23      0.24      1000
weighted avg       0.60      0.69      0.62      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
test_stance = stance_model.predict(test_X)
test_category = category_model.predict(test_X)

In [43]:


utils.write_test_file(test_ids,test_category,test_stance)

Index(['id', 'category', 'stance'], dtype='object')


# Models Accuracies:
    * Frequency Feature with GaussianNB --> 79.5
    * Frequency Feature with SVM --> 77.3
    * BOG with MultinomialNB --> 81.9 (without stemming)
    * BOG with SVM --> 80.9
    * TFIDF with MultinomialNB --> 80.9
    * TFIDF with SVM --> 82.0
    * TFIDF -> SMOTE -> NAIVE BAYES (alpha=.31) --> fscore = 0.6  acc = 79% 