# Arabic Tweets Sentiment Analysis

## Introduction

### Dataset

* This dataset was collected to provide Arabic sentiment corpus for the research community to investigate deep learning approaches for Arabic sentiment analysis.

* This dataset was collected in April 2019. It contains 58K Arabic tweets (47K training, 11K test) annotated with positive and negative labels. The dataset is balanced and was collected using a lexicon of positive and negative emojis.



## Exploring and downloading the Text data files

In [None]:
import pandas as pd

### Train Dataframe

In [None]:
tweets_data_negative = pd.read_csv('data/train_Arabic_tweets_negative_20190413.tsv',sep='\t',header=None, names=['sentiment','tweets'], encoding='utf-8')
tweets_data_positive = pd.read_csv('data/train_Arabic_tweets_positive_20190413.tsv',sep='\t',header=None, names=['sentiment','tweets'], encoding='utf-8')

In [None]:
tweets_data_negative.head()

In [None]:
tweets_data_positive.head()

In [None]:
tweets_data_negative.info()

In [None]:
tweets_data_positive.info()

In [None]:
df = pd.concat([tweets_data_negative,tweets_data_positive], ignore_index=True)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.to_csv('fulldf.csv')

In [None]:
df['sentiment'].unique()

In [None]:
df.sentiment.value_counts()

In [None]:
df.isna().sum()

### Test Dataframe

In [None]:
test_tweets_negative = pd.read_csv('data/test_Arabic_tweets_negative_20190413.tsv',sep='\t',header=None,  names=['sentiment','tweets'], encoding='utf-8')
test_tweets_positive = pd.read_csv('data/test_Arabic_tweets_positive_20190413.tsv',sep='\t',header=None,  names=['sentiment','tweets'], encoding='utf-8')

In [None]:
df_test = pd.concat([test_tweets_negative,test_tweets_positive], ignore_index=True)

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
df_test.sentiment.value_counts()

## Data Exploration and cleaning

In [None]:
import numpy as np
from collections import Counter
import nltk
import re as regex
import plotly
from plotly import graph_objs
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
#from time import time
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import cufflinks as cf

nltk.download('punkt')

In [None]:
sns.countplot(x='sentiment',data=df)

### cleaning the data

In [None]:
import string
import sys
import argparse

punctuatuions = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation
#nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words("arabic")

arabic_diacritics = regex.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, regex.VERBOSE)

def clean_tweets(tweet):
    """Normalise a raw Arabic tweet for sentiment modelling.

    Steps, in order:
      1. remove URLs, @usernames and digits while their marker characters
         are still present in the text;
      2. delete every character listed in ``punctuatuions``;
      3. drop the NLTK Arabic stop words (this also collapses whitespace);
      4. unify letter variants (alef forms, alef maqsura, hamza carriers,
         ta marbuta, gaf);
      5. remove diacritics and tatweel.

    Args:
        tweet: the raw tweet text.

    Returns:
        The cleaned tweet as a single string.
    """
    # URLs and usernames have to go first: the translation table below
    # deletes '@', ':' and '/', after which "@user" is indistinguishable
    # from an ordinary word and the username pattern can never match.
    tweet = regex.sub(r'http\S+\s*', ' ', tweet)
    tweet = regex.sub(r"@[^\s]+[\s]?", '', tweet)

    # Remove ASCII and Arabic-Indic digits. The previous pattern was anchored
    # with ^...$ and therefore blanked any tweet made only of Arabic letters,
    # digits and spaces instead of removing the digits.
    tweet = regex.sub(r'[0-9\u0660-\u0669]+', '', tweet)

    # Strip punctuation. This also removes '#', so hashtag words are kept as
    # plain words, and it covers '.', ':', '!', '،', '-' and '_', which used
    # to be removed a second time by separate substitutions.
    tweet = tweet.translate(str.maketrans('', '', punctuatuions))

    # Drop stop words; split()/join() also normalises runs of whitespace.
    tweet = ' '.join(word for word in tweet.split() if word not in stopwords)

    # Unify spelling variants of the same letter.
    tweet = regex.sub("[إأآا]", "ا", tweet)
    tweet = regex.sub("ى", "ي", tweet)
    tweet = regex.sub("ؤ", "ء", tweet)
    tweet = regex.sub("ئ", "ء", tweet)
    tweet = regex.sub("ة", "ه", tweet)
    tweet = regex.sub("گ", "ك", tweet)

    # Remove tashkeel (diacritics) and tatweel.
    tweet = regex.sub(arabic_diacritics, '', tweet)

    return tweet

In [None]:
df['tweets'] = df['tweets'].apply(clean_tweets)

In [None]:
df.head()

## Tokenization And Stemming

### Tokenization

In [None]:
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import  word_tokenize

tokenizer = TweetTokenizer()
df['tweets'].apply(tokenizer.tokenize)

### Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# PorterStemmer is an English suffix-stripping stemmer; the corpus is Arabic,
# so use NLTK's Arabic (ISRI) stemmer. It exposes the same .stem(word) method,
# which is all that stemming() relies on.
ps = ISRIStemmer()

In [None]:
def tokenize(text):
    """Split ``text`` into tokens using the notebook-level ``tokenizer``."""
    tokens = tokenizer.tokenize(text)
    return tokens

def stemming(words):
    """Return a new list with ``ps.stem`` applied to every item of ``words``."""
    return [ps.stem(token) for token in words]

In [None]:
df['Tokenized'] =df['tweets'].apply(tokenizer.tokenize)

In [None]:
# Stem the token lists, not the raw strings: stemming() iterates its argument,
# so passing a string would "stem" it one character at a time.
df['stemmed'] = df['Tokenized'].apply(stemming)

In [None]:
df.head()

In [None]:
df.shape

##  bag-of-words 

In [None]:
# Count how often each token occurs across all tweets.
words_freq = Counter()
for tokens in df["Tokenized"]:
    words_freq.update(tokens)

In [None]:
words_freq.most_common(10)

In [None]:
df.head()

In [None]:
stopwords

In [None]:
def word_list(processed_data):
    """Build the vocabulary of mid-frequency tokens and save it to disk.

    Counts every token in the ``Tokenized`` column, keeps the tokens whose
    total count is strictly between ``min_occurrences`` and
    ``max_occurences``, and writes them together with their counts to
    ``wordlist.csv`` in the current working directory.

    Args:
        processed_data: DataFrame with a ``Tokenized`` column that holds an
            iterable of tokens per row.

    Returns:
        The kept words as a list, most frequent first.
    """
    min_occurrences = 3
    max_occurences = 500

    words_freq = Counter()
    for tokens in processed_data["Tokenized"]:
        words_freq.update(tokens)

    # Filter once and reuse the result for both the CSV and the return value.
    kept = [(word, count) for word, count in words_freq.most_common()
            if min_occurrences < count < max_occurences]

    word_df = pd.DataFrame(kept, columns=["word", "occurrences"])
    word_df.to_csv("wordlist.csv", index_label="idx")

    return [word for word, _ in kept]

In [None]:
word_list(df)

In [None]:
words = pd.read_csv("wordlist.csv", encoding="utf8")
words.head()

## Features


In [None]:
#Length of tweets
df['Tweets_len'] = df['Tokenized'].apply(len)

In [None]:
df.head()

In [None]:
for emotion in df['sentiment'].unique():
    df[df['sentiment']==emotion]['Tweets_len'].plot(
      kind='hist',
      bins=20,
      title='Length')
    plt.xlabel(f'Text length for {emotion} emotion')
    plt.show()

In [None]:
# "Tokenized" holds lists of tokens, so measure the lists directly. Going
# through str(list) counted the brackets, quotes and commas of the list's
# repr as characters and reported one word for an empty list.
df['word_count'] = df["Tokenized"].apply(len)
df['char_count'] = df["Tokenized"].apply(lambda tokens: sum(len(word) for word in tokens))
df['sentence_count'] = df["Tokenized"].apply(lambda tokens: " ".join(tokens).count(".") + 1)
# A tweet can be empty after cleaning; report an average of 0 for it instead
# of the NaN that 0/0 would produce (NaN would break the models fitted later).
df['avg_word_length'] = (df['char_count'] / df['word_count'].where(df['word_count'] > 0)).fillna(0.0)
df['avg_sentence_lenght'] = df['word_count'] / df['sentence_count']
df.head()

In [None]:
df.describe()

In [None]:
# Average only the numeric feature columns: df also holds text and list
# columns (tweets, Tokenized, stemmed), which cannot be averaged and make
# GroupBy.mean raise on pandas >= 2.0 unless numeric_only is set.
sentiments = df.groupby('sentiment').mean(numeric_only=True)
sentiments

In [None]:
sentiments.corr()

In [None]:
sns.heatmap(sentiments.corr(),cmap='coolwarm',annot=True)

In [None]:
df.drop('sentence_count',
  axis='columns', inplace=True)

In [None]:
# numeric_only skips the text and list columns, which cannot be averaged.
sentiments = df.groupby('sentiment').mean(numeric_only=True)

In [None]:
sns.heatmap(sentiments.corr(),cmap='coolwarm',annot=True)

In [None]:
# words = pd.read_csv("wordlist.csv", encoding="utf8")
# words.head()

## After cleaning :

In [None]:
df.head()

In [None]:

# Persist only the label and the cleaned text. The selection was previously
# built but the full df (including list-valued columns) was written instead.
# index=False keeps a meaningless "Unnamed: 0" column out of the file.
cleaned_df = df[['sentiment','tweets']]
cleaned_df.to_csv('cleaned_df.csv', index=False)

## ML Modeling

In [None]:
import sklearn 
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
import random


### Logistic regression using TfidfVectorizer on tweets

In [None]:
# splitting the data into featuers and target
feature = df.tweets
target = df['sentiment']
# splitting into train and tests
X_train, X_test, Y_train, Y_test = train_test_split(feature, target, test_size =.2, random_state=100)

# make pipeline
pipe = make_pipeline(TfidfVectorizer(),
                    LogisticRegression())
# make param grid
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

# create and fit the model
model = GridSearchCV(pipe, param_grid, cv=5)
model.fit(X_train,Y_train)

# make prediction and print accuracy
prediction = model.predict(X_test)
print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

### Logistic regression using text feature

In [None]:
#Xt = df.loc[:,'Tweets_len':'avg_sentence_lenght']
Xt = df.loc[:,['avg_word_length','avg_sentence_lenght']]
y = df['sentiment']

In [None]:
Xt.head()

In [None]:

Xt_train, Xt_test, y_train, y_test = train_test_split(Xt, y, test_size = .2, random_state=42)

In [None]:
lr_model = LogisticRegression()
lr_model.fit(Xt_train,y_train)
prediction = lr_model.predict(Xt_test)
print(classification_report(y_test, prediction))

In [None]:
from sklearn.naive_bayes import MultinomialNB
sentiment_model = MultinomialNB().fit(Xt_train, y_train)
prediction = sentiment_model.predict(Xt_test)
print(classification_report(y_test, prediction))

### Logistic regression using TfidfVectorizer and BoW

#### BoW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
#get dataset bag-of-words counts as a vector
bow_transformer = CountVectorizer(analyzer=tokenize).fit(df['tweets'])

In [None]:
# BoW vector representation
messages_bow = bow_transformer.transform(df['tweets'])

#### TfidfVectorizer

In [None]:
#transform the entire bag-of-words corpus into TF-IDF corpus 
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)

In [None]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

In [None]:
# X = df.loc[:,["tweets",'avg_word_length','avg_sentence_lenght']]
X=messages_tfidf
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=42)

#### LogisticRegression

In [None]:
lr_ = LogisticRegression()
lr_.fit(X_train,y_train)
prediction = lr_.predict(X_test)
print(classification_report(y_test, prediction))

#### MultinomialNB

In [None]:
sentiment_model_ = MultinomialNB().fit(X_train, y_train)
prediction = sentiment_model_.predict(X_test)
print(classification_report(y_test, prediction))

## DL Modeling

In [None]:
# Imports
# Basics
import pandas as pd 
import numpy as np
import random
from matplotlib import pyplot as plt
%matplotlib inline

# gensim
import gensim

# keras
np.random.seed(13)
from keras import layers
from keras.models import Sequential
from keras.layers import (Dense, Embedding, Reshape, Activation, 
                          SimpleRNN, LSTM, Convolution1D, 
                          MaxPooling1D, Dropout, Bidirectional, SpatialDropout1D)
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, RMSprop


# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# The original data
data = pd.read_csv('fulldf.csv', encoding='utf-8')
df = pd.read_csv('cleaned_df.csv', encoding='utf-8')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target column as integers.
lb=LabelEncoder()
# .copy() makes data_v1 an independent frame, so the column assignment below
# is not a chained assignment on a slice of df (SettingWithCopyWarning).
data_v1 = df[['tweets','sentiment']].copy()

data_v1['sentiment'] = lb.fit_transform(df['sentiment'])

data_v1.head()

In [None]:
#Tokenizing and converting the tweets into numerical vectors.
from keras.preprocessing.text import Tokenizer

# Tweets that are empty after cleaning are read back from the CSV as NaN;
# map them to '' first so astype(str) does not turn them into the token "nan".
data_v1['tweets'] = data_v1['tweets'].fillna('').astype(str)
# Use a dedicated name: rebinding `tokenizer` would replace the TweetTokenizer
# that the tokenize() helper defined earlier in this notebook depends on.
keras_tokenizer = Tokenizer(num_words=500, split=' ')
keras_tokenizer.fit_on_texts(data_v1['tweets'].values)
X = keras_tokenizer.texts_to_sequences(data_v1['tweets'].values)
X = pad_sequences(X)

In [None]:
y = data_v1['sentiment']
print(X.shape)
print(y.shape)
y.head()


### Building model

In [None]:
#create LSTM model with keras
embedding_dim = 100
dropout = 0.5
opt = 'adam'
#clear_session()
model = Sequential()
model.add(layers.Embedding(input_dim=500, 
                           output_dim=100, 
                           input_length=X.shape[1]))
model.add(layers.Bidirectional(layers.LSTM(100, dropout=0.5, 
                                           recurrent_dropout=0.5, 
                                           return_sequences=True)))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=opt, 
              loss='binary_crossentropy', 
              metrics=['accuracy'])
model.summary()

In [None]:
#Splitting the data into training and testing
from sklearn.model_selection import train_test_split

y=pd.get_dummies(data_v1['sentiment'])
X_train, X_test, label_train, label_test = train_test_split(X, data_v1['sentiment'], test_size=0.3, random_state=42)
print("Training:", len(X_train))
print("Testing: ", len(X_test))

In [None]:
history = model.fit(X_train, label_train, epochs = 5, batch_size=32 , verbose = 'auto')

In [None]:
model.evaluate(X_test,label_test)


# Word embedding + LSTM


In [None]:
# Imports
# Basics
import pandas as pd 
import numpy as np
import random
from matplotlib import pyplot as plt
%matplotlib inline

# gensim
import gensim

# keras
np.random.seed(13)
from keras import layers
from keras.models import Sequential
from keras.layers import (Dense, Embedding, Reshape, Activation, 
                          SimpleRNN, LSTM, Convolution1D, 
                          MaxPooling1D, Dropout, Bidirectional, SpatialDropout1D)
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD, RMSprop


# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### Downloading data

In [None]:

from sklearn.preprocessing import LabelEncoder

# The original data
data = pd.read_csv('fulldf.csv', encoding='utf-8')

# Encoded the target column
lb=LabelEncoder()
df = pd.read_csv('cleaned_df.csv', encoding='utf-8')
# .copy() makes the two-column frame independent, so the assignment below is
# not a chained assignment on a slice (SettingWithCopyWarning).
df = df[['tweets','sentiment']].copy()
df['sentiment'] = lb.fit_transform(df['sentiment'])
df.head()

In [None]:
#More cleaning
import re
from nltk.tokenize import word_tokenize, TweetTokenizer

tweets_lines = list()

lines = data['tweets'].values.tolist() #convert original lines to a list

lines_ = df['tweets'].values.tolist() #convert cleaned lines to a list

# One character class covering every range remove_emoji strips; compiled once.
# NOTE(review): U+24C2-U+1F251 and U+10000-U+10FFFF are very broad. The first
# also spans CJK and the Arabic Presentation Forms blocks, not only emoji.
_EMOJI_PATTERN = re.compile(
    "[" + "".join((
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
        u"\U00002702-\U000027B0",
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",  # zero width joiner
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",  # variation selector-16
        u"\u3030",
    )) + "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with every run of characters matched by
    ``_EMOJI_PATTERN`` deleted."""
    return _EMOJI_PATTERN.sub(r'', string)

# Tokenise every cleaned tweet. Punctuation and stop words were already
# removed when the tweets were cleaned, so this loop only splits into tokens.
tt = TweetTokenizer()  # one tokenizer instance is enough for all tweets
for line in lines_:
    # Tweets that are empty after cleaning are read back from the CSV as NaN;
    # tokenise them as empty text rather than as the literal string "nan".
    text = '' if pd.isna(line) else str(line)
    tokens = tt.tokenize(text)
    tweets_lines.append(tokens)

### Train Word2vec

In [None]:
import gensim

embedding_dim = 100 #specify dimensions of embeddings

model = gensim.models.Word2Vec(sentences=tweets_lines,
                               vector_size = embedding_dim, window=5,
                               min_count=1) #list of sentances (tokens)
words = list(model.wv.index_to_key) #vocab size

print('Vocabulary size: %d' % len(words))

In [None]:
#test the model
# sample = w2v.wv["حسن"]
# print(sample.shape)
# #print(sample)
# print(w2v.wv.most_similar("حسن"))
sample = model.wv["حسن"]
print(sample.shape)
#print(sample)
print(model.wv.most_similar("حسن"))

In [None]:
#Save model in ASCII (word2vec) format
from gensim.models import Word2Vec
filename = 'w2v_embedding_word2vec_result.txt'

model.wv.save_word2vec_format(filename,binary=False)

## Load the embeddings from the file into a dictionary

In [None]:
import os
embedding_index = {} #embeddings in a dict

# `with` closes the file even if parsing a line fails.
with open('w2v_embedding_word2vec_result.txt', encoding='utf-8') as f:
    # The first line of the word2vec text format is a
    # "<vocab_size> <vector_size>" header, not a word vector.
    vector_size = int(f.readline().split()[1])

    for line in f:
        values = line.rstrip().split(' ')
        if len(values) != vector_size + 1:
            continue  # not a "<word> <v1> ... <vN>" line
        word = values[0]
        # Store real floats, not strings, so the vectors can be used
        # numerically without relying on an implicit conversion later.
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

## **Splitting the data** 

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import numpy as np

validation_split = 0.2

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(tweets_lines)

sequences = tokenizer_obj.texts_to_sequences(tweets_lines)

# to keep text constant, we will pad them with extra zeros (we will keep max 100)

max_length = 100

word_index = tokenizer_obj.word_index

tweets_pad = pad_sequences(sequences,maxlen=max_length)
sentiment = df['sentiment'].values

print(tweets_pad.shape)
print(sentiment.shape)

# indices = np.arange(tweets_pad.shape[0]) #shuffle data before spliting
# np.random.shuffle(indices)

# tweets_pad = tweets_pad[indices]
# sentiment = sentiment[indices]

# num_validation_samples = int(validation_split*tweets_pad.shape[0])

# x_val_pad = tweets_pad[:num_validation_samples]
# y_val_pad = sentiment[:num_validation_samples]

# X_train_pad = tweets_pad[num_validation_samples:]
# y_train = sentiment[num_validation_samples:]

In [None]:
# test_split = 0.1

# num_test_samples = int(test_split*X_train_pad.shape[0])
# X_test_pad = X_train_pad[:num_test_samples]
# y_test = y_train[:num_test_samples]

# X_train_pad = X_train_pad[num_test_samples:]
# y_train = y_train[num_test_samples:]

In [None]:
# print(X_train_pad.shape)
# print(y_train.shape)

# print(x_val_pad.shape)
# print(y_val_pad.shape)

# print(X_test_pad.shape)
# print(y_test.shape)

### Creating an Embedding Matrix


In [None]:
#transforming embeddings (dictioanry) into matrix as Keras accept it.

embedding_dim = 100

# Row 0 stays all-zero: word_index is 1-based, hence the +1.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so i == num_words is already out of
    # range (the previous `>` check would have let it through).
    if i >= num_words:
        continue
    embedding_vector = embedding_index.get(word)

    # Copy only complete vectors. A shorter entry (e.g. one parsed from the
    # file's header line) would otherwise be broadcast across the whole row.
    if embedding_vector is not None and len(embedding_vector) == embedding_dim:
        embedding_matrix[i] = embedding_vector

## Training 

Defining the Model

In Keras, a **Sequential model** is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.
1. Load pre-trained word embeddings into an Embedding layer

2. Adding LSTM Layer

3. Adding Dense Layer

**model.compile** is used to configure the model for training.


**model.add()** function is used to add layers to our model.

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

model = Sequential()

embedding_layer = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)

model.add(embedding_layer)
model.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2 )) #dropout to deactivate some of the neurons to make sure that it doesn't overfit model at every epoch 
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print('Summary of the built model..')

print(model.summary())

## Training the Model

Use the .fit method.

Assign the model.fit() method to a variable, which will store the Training, Validation Loss and Accuracy for each epoch. 

In [None]:
#split dataset
from sklearn.model_selection import train_test_split
labels = df['sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(tweets_pad, labels, test_size= 0.3, random_state = 24)

In [None]:
history_no_val = model.fit(X_train,Y_train,batch_size=32,epochs=5,verbose=2)

In [None]:
# The manual split that created X_train_pad, x_val_pad and y_val_pad is
# commented out above, so those names do not exist. Train on the split made
# by train_test_split and let Keras hold out 20 % of it for validation.
# np.asarray: validation_split needs array inputs, Y_train is a pandas Series.
history = model.fit(X_train, np.asarray(Y_train), batch_size=128, epochs=10,
                    validation_split=0.2, verbose=2)

In [None]:
# Evaluate on the held-out part of the train_test_split above; X_test_pad is
# only ever assigned in commented-out cells and does not exist.
score, acc = model.evaluate(X_test, Y_test, batch_size=128)

print('Test Score: ', score)
print('Test accuracy: ',acc)