In [15]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

%matplotlib inline
import unicodedata

In [16]:
pd.set_option('display.max_colwidth', 280)
train_neg = pd.read_csv("../input/arabic-sentiment-twitter-corpus/train_Arabic_tweets_negative_20190413.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
train_neg.rename(columns={0:'label', 1:'tweet'}, inplace=True)
train_neg['label'] = 0

train_pos = pd.read_csv("../input/arabic-sentiment-twitter-corpus/train_Arabic_tweets_positive_20190413.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
train_pos.rename(columns={0:'label', 1:'tweet'}, inplace=True)
train_pos['label'] = 1


train_df = pd.concat([train_neg, train_pos], axis=0).reset_index(drop=True)


# from sklearn.model_selection import train_test_split
# X = train_df.tweet.values
# y = train_df.label.values

# # The train val split is used by the DL approach but not classical ML
# X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.1, random_state=2020)
# Load test subset
test_pos = pd.read_csv("../input/arabic-sentiment-twitter-corpus/test_Arabic_tweets_positive_20190413.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
test_pos.rename(columns={0:'label', 1:'tweet'}, inplace=True)
test_pos['label']=1

test_neg = pd.read_csv("../input/arabic-sentiment-twitter-corpus/test_Arabic_tweets_negative_20190413.tsv", sep="\t", header=None,  quoting=csv.QUOTE_NONE)
test_neg.rename(columns={0:'label', 1:'tweet'}, inplace=True)
test_neg['label']=0

test_df = pd.concat([test_neg, test_pos], axis=0).reset_index(drop=True)
X_test = test_df.tweet.values
y_test = test_df.label.values

In [17]:
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
  

    # Normalize unicode encoding
    text = unicodedata.normalize('NFC', text)
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    #Remove URLs
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '<URL>', text)


    return text

In [18]:
train_df.head()

Unnamed: 0,label,tweet
0,0,اعترف ان بتس كانو شوي شوي يجيبو راسي لكن اليوم بالزايد 😭
1,0,توقعت اذا جات داريا بشوفهم كاملين بس لي للحين احس فيه احد ناقصهم 💔 #Avlu
2,0,#الاهلي_الهلال اكتب توقعك لنتيجة لقاء الهلال والاهلي تحت التاق 👇 #تحدي_اسرع_روقان وادخل في سحب قيمة ايفون X على…
3,0,نعمة المضادات الحيوية . تضع قطرة💧مضاد بنسلين على بكتيريا 🦠 فتنفجر 💥 و تموت . الأخيرة يبدو انها بكتيريا مقاومة فأخذ…
4,0,الدودو جايه تكمل علي 💔


# Handle text data

In [19]:
arabic_diacritics = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', str(text))
    return text


def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)     # keep only 1 repeat
    return re.sub(r'(.)\1+', r'\1\1', text)

def process_text(text, grams=False):
    clean_text = remove_diacritics(text)
    clean_text = remove_repeating_char(clean_text)
    if grams is False:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = list(window(tokens))
        grams = [' '.join(g) for g in grams]
        grams = grams + tokens
        return grams
    


In [20]:
from tqdm import tqdm

for i in tqdm(range(train_df.shape[0])):
    train_df.tweet[i] = process_text(train_df.tweet[i], False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
100%|██████████| 47000/47000 [00:19<00:00, 2460.66it/s]


In [21]:
train_df.shape

(47000, 2)

In [22]:
train_df.head()

Unnamed: 0,label,tweet
0,0,"[اعترف, ان, بتس, كانو, شوي, شوي, يجيبو, راسي, لكن, اليوم, بالزايد, 😭]"
1,0,"[توقعت, اذا, جات, داريا, بشوفهم, كاملين, بس, لي, للحين, احس, فيه, احد, ناقصهم, 💔, #Avlu]"
2,0,"[#الاهلي_الهلال, اكتب, توقعك, لنتيجة, لقاء, الهلال, والاهلي, تحت, التاق, 👇, #تحدي_اسرع_روقان, وادخل, في, سحب, قيمة, ايفون, X, على…]"
3,0,"[نعمة, المضادات, الحيوية, ., تضع, قطرة💧مضاد, بنسلين, على, بكتيريا, 🦠, فتنفجر, 💥, و, تموت, ., الأخيرة, يبدو, انها, بكتيريا, مقاومة, فأخذ…]"
4,0,"[الدودو, جايه, تكمل, علي, 💔]"


In [23]:
for i in tqdm(range(train_df.shape[0])):
    train_df.tweet[i] = " ".join(train_df.tweet[i])
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|██████████| 47000/47000 [00:17<00:00, 2708.34it/s]


In [24]:
train_df.head()

Unnamed: 0,label,tweet
0,0,اعترف ان بتس كانو شوي شوي يجيبو راسي لكن اليوم بالزايد 😭
1,0,توقعت اذا جات داريا بشوفهم كاملين بس لي للحين احس فيه احد ناقصهم 💔 #Avlu
2,0,#الاهلي_الهلال اكتب توقعك لنتيجة لقاء الهلال والاهلي تحت التاق 👇 #تحدي_اسرع_روقان وادخل في سحب قيمة ايفون X على…
3,0,نعمة المضادات الحيوية . تضع قطرة💧مضاد بنسلين على بكتيريا 🦠 فتنفجر 💥 و تموت . الأخيرة يبدو انها بكتيريا مقاومة فأخذ…
4,0,الدودو جايه تكمل علي 💔


In [34]:
from sklearn.model_selection import train_test_split
X = train_df.tweet.values
y = train_df.label.values

# The train val split is used by the DL approach but not classical ML
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.1, random_state=2020)

In [25]:
df_ss2030 = pd.read_csv("../input/arabic-sentiment-analysis-dataset-ss2030-dataset/Arabic Sentiment Analysis Dataset - SS2030.csv")
# Rename columns to match convention
df_ss2030 = df_ss2030.rename(columns = {"text":"tweet", "Sentiment": "label"})

In [26]:
df_ss2030.head(2)

Unnamed: 0,tweet,label
0,حقوق المرأة 💚💚💚 https://t.co/Mzf90Ta5g1,1
1,RT @___IHAVENOIDEA: حقوق المرأة في الإسلام. https://t.co/ps3qNw1CbB,1


In [27]:
df_ss2030.shape

(4252, 2)

In [28]:
df_reviews = pd.read_csv("../input/arabic-100k-reviews/ar_reviews_100k.tsv", delimiter="\t")
# Create a mapping for the labels such that we use the same convention across all datasets
label_mapping = {"Positive": 1, "Negative":0}
# Filter to only have pos and neg tweets, i.e: remove mixed tweets
df_reviews = df_reviews[df_reviews.label != "Mixed"]
df_reviews["label"] = df_reviews["label"].map(label_mapping)
# Rename columns to match convention
df_reviews = df_reviews.rename(columns = {"text":"tweet"})

In [29]:
df_reviews.head(2)

Unnamed: 0,label,tweet
0,1,ممتاز نوعا ما . النظافة والموقع والتجهيز والشاطيء. المطعم
1,1,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدولة يعشق ترابها. نحن نحب الإمارات. ومضات من فكر. نصائح لدولة تطمح بالصفوف الأولى و قائد لا يقبل إلا براحة شعبه وتوفر كل سب العيش الكريم. حكم و مواقف ونصائح لكل فرد فينا ليس بمجرد كتاب سياسي كما كنت اعتقد. يستحق القراءة مرات كثيرة


In [30]:
df_reviews.shape

(66666, 2)

In [31]:
train_df.head(3)

Unnamed: 0,label,tweet
0,0,اعترف ان بتس كانو شوي شوي يجيبو راسي لكن اليوم بالزايد 😭
1,0,توقعت اذا جات داريا بشوفهم كاملين بس لي للحين احس فيه احد ناقصهم 💔 #Avlu
2,0,#الاهلي_الهلال اكتب توقعك لنتيجة لقاء الهلال والاهلي تحت التاق 👇 #تحدي_اسرع_روقان وادخل في سحب قيمة ايفون X على…


In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=31000)
tfidff = vectorizer.fit_transform(X_test).toarray()
features = vectorizer.get_feature_names()
Xtest_df = pd.DataFrame(tfidff, columns=features)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=31000)
tfidf = vectorizer.fit_transform(X).toarray()
features = vectorizer.get_feature_names()
tfidf = pd.DataFrame(tfidf, columns=features)

In [37]:
tfidf.head(2)

Unnamed: 0,00,00am,00ص,04,07,08,10,100,11,12,...,ﻳﺼﺢ,ﻳﺼﺪﺭ,ﻳﻈﻦ,ﻳﻌﺸﻘﻮﻥ,ﻳﻘﺘﺮﺏ,ﻳﻘﺮﺃﺅﻧﺎ,ﻳﻤگن,ﻷن,ﻻمهلكة,ﻻيمسها
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# deleting the correlated features 

In [None]:
cor_matrix = tfidf.corr().abs()
print(cor_matrix)

In [None]:
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
print(upper_tri)

In [None]:
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
print();
print(to_drop)

In [None]:
len(to_drop)

In [None]:
# tfidf = tfidf.drop(tfidf.columns[to_drop], axis=1)
for col in to_drop:
    tfidf.drop(col, axis=1, inplace=True)
        


In [None]:
tfidf.head(2)

# Eliminate the zero observations and Use log probabilities

In [None]:
for i in range(tfidf.shape[0]):
    for j in range(tfidf.shape[0]):
        tfidf.loc[i:j]=tfidf.loc[i:j]+1.5

In [None]:
for i in range(tfidf.shape[0]):
    for j in range(tfidf.shape[0]):
        tfidf.loc[i:j]=np.log(tfidf1.loc[i:j])

In [42]:
# Helper functions 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
def train_model(model, data, targets):
    
    model.fit(data, targets)
    return model
def get_accuracy(trained_model,X, y):
    predicted = trained_model.predict(X)
    accuracy = np.mean(predicted == y)
    return accuracy

In [43]:
from sklearn.naive_bayes import MultinomialNB
trained_clf_multinomial_nb = train_model(MultinomialNB(), tfidf, y)
accuracy = get_accuracy(trained_clf_multinomial_nb,Xtest_df, y_test)
print(f"Test dataset accuracy with MultinomialNB: {accuracy:.2f}")

Test dataset accuracy with MultinomialNB: 0.48


# ShuffleSplit Cross Validation

In [45]:
def print_all_accuracies(dataset_name, dataset):
  accuracy = get_accuracy(trained_clf_multinomial_nb,dataset.tweet.values, dataset.label.values)
  print(f"{dataset_name} dataset accuracy with Multinomial NB: {accuracy:.2f}")



In [None]:
print_all_accuracies("SS2030", df_ss2030)
print_all_accuracies("100k Arabic Reviews", df_reviews)

# GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
hyperparameters = {"alpha":(0,0.1, 0.2, 0.3, 0.4, 0.5 ,  0.8 , 0.88, 0.9, 1 ,2, 3,4 ,5, 6), "fit_prior":(0.5,0.5)}
Grid= GridSearchCV(estimator=MultinomialNB(), param_grid=hyperparameters, cv=10, scoring="accuracy", n_jobs=-1)
train_model(Grid, X, y)

In [None]:
Grid.best_estimator_

In [None]:
trained_clf_multinomial_nb_imm = train_model(MultinomialNB(alpha=0.8, fit_prior=[0.5,0.5]), X, y)
accuracy = get_accuracy(trained_clf_multinomial_nb_imm,X_test, y_test)
print(f"Test dataset accuracy with MultinomialNB: {accuracy:.2f}")

In [None]:
from sklearn.model_selection import GridSearchCV
hyperparameters = {"alpha":(0, 0.8,0.88, 0.9, 1 ,2,3,4,5,6),}
Grid= GridSearchCV(estimator=MultinomialNB(), param_grid=hyperparameters, cv=10, scoring="accuracy", n_jobs=-1)
train_model(Grid, tfidf, y)
print("Best score: %0.3f" % Grid.best_score_)
print("Best parameters set:")
best_parameters = Grid.best_estimator_.get_params()
for param_name in sorted(hyperparameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
trained_clf_multinomial_nb_im = train_model(MultinomialNB(alpha=0.8, fit_prior=(0.5,0.5)), X, y)
accuracy = get_accuracy(trained_clf_multinomial_nb_im,Xtest_df, y_test)
print(f"Test dataset accuracy with MultinomialNB: {accuracy:.2f}")

In [None]:
def print_all_im_accuracies(dataset_name, dataset):
  accuracy = get_accuracy(trained_clf_multinomial_nb_im,dataset.tweet.values, dataset.label.values)
  print(f"{dataset_name} dataset accuracy with Multinomial NB: {accuracy:.2f}")

In [None]:
print_all_im_accuracies("SS2030", df_ss2030)
print_all_im_accuracies("100k Arabic Reviews", df_reviews)