## setting up the environement

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyarabic
!pip install Arabic-Stopwords

Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m92.2/126.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.15
Collecting Arabic-Stopwords
  Downloading Arabic_Stopwords-0.4.3-py3-none-any.whl (360 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.5/360.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Arabic-Stopwords
Successfully installed Arabic-Stopwords-0.4.3


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# LABR dataset

- LABR stands for Large-Scale Arabic Book Reviews.
- It's a collection of over 63,000 book reviews written in Arabic.
- Each review comes with a rating on a scale of 1 to 5 stars.

## loading libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import arabicstopwords.arabicstopwords as stp
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import pyarabic.araby as araby

## Loading dataset

The review.tsv file contains :

- rating - review id - user id - book id - review

In [6]:
# loading the tsv file
SEED = 21

dataset = pd.read_csv("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/reviews.tsv", sep = '\t', header=None, names = ["rating","review_id","user_id","book_id","review"])

dataset = dataset.sample(frac=1, random_state = SEED)

dataset.head(20)

Unnamed: 0,rating,review_id,user_id,book_id,review
12152,4,415481260,6391071,9440012,ثقافة للحياة | يوميات نائب في الأرياف لــتوفي...
29753,1,501026526,4128410,3482519,كتاب لا يستحق اكتر من ربع نحمه . روايه ضعيفة ...
9586,3,183473674,4619036,4653511,رواية مؤلمه تحكي قصة تهريب ثلاث رجال عبر خزان...
48087,3,150978864,3426623,7634660,لا أذكرها كثيرًا، لكنني أذكر أنه هذه القصص كا...
33422,3,171735375,2296621,6562799,الكتاب استعرض كم. من المعلومات شخصياً لم اكن ...
49840,5,232695834,4507446,3438113,عندما بدءت فى قراءة هذه الرواية لم استطع تركه...
26984,5,301073147,7785860,3553395,أعظ وصف ليـه هو عدم الوصف كتـاب لايمكن أن تعب...
55313,4,64603083,2551456,2370762,تحدثت جودي عن تجربة واقعية لمعاناة شخصية عاشت...
14987,3,92260435,882233,3191898,كتاب جيد يستعرض فيه الكاتب الياباني نوبواكي ن...
58434,5,454155323,14201507,13604882,كتاب لما تقرأه تحس إنك صغير أوى


In [None]:
# checking the shape of the dataset
dataset.shape

In [None]:
# checking for missing values
dataset.info()

In [None]:
# checking for missing values
dataset.describe()

In [7]:
# checking for missing values
dataset.isnull().sum()

rating       0
review_id    0
user_id      0
book_id      0
review       0
dtype: int64

In [None]:
# checking for missing values
dataset.nunique()

In [8]:
# checking for the distribution of the ratings
print(dataset['rating'].value_counts(normalize=True) * 100)

rating
5    37.589516
4    30.121568
3    19.287984
2     8.354807
1     4.646126
Name: proportion, dtype: float64


In [None]:
# plotting the distribution of the ratings
print(dataset['rating'].value_counts(normalize=True).plot(kind='bar'))

## Preprocessing

In [9]:
# dropping the review_id, user_id and book_id columns
dataset = dataset.drop(['review_id','user_id','book_id'], axis=1)

In [10]:
# dropping the duplicates and keeping the first occurence
dataset = dataset.drop_duplicates(subset='review', keep='first')

In [12]:
# removing whitespaces
import re
pattern = r'\s+|\n+'
dataset["review"] = dataset["review"].apply(lambda document: re.sub(pattern, ' ', document))

In [13]:
# removing punctuations

pattern = r'[^\w\s\u0600-\u06FF]+|ﷺ|۩|⓵|؟|۞|ﷻ'
dataset["review"] = dataset["review"].apply(lambda document: re.sub(pattern, '', document))

In [14]:
# removing consecutive characters in arabic

pattern = r'(.)\1+'
dataset["review"] = dataset["review"].apply(lambda document: re.sub(pattern, r'\1', document))

In [15]:
# removing stop words

stop_words = set(stopwords.words('arabic'))
stop_words.update(stp.stopwords_list())
dataset["review"] = dataset["review"].apply(lambda document: ' '.join([word for word in document.split() if word not in stop_words]))

In [16]:
# removing arabic diactrics

dataset["review"] = dataset["review"].apply(lambda document: araby.strip_tashkeel(document))

In [17]:
# removing numbers

dataset["review"] = dataset["review"].apply(lambda document: ''.join([i for i in document if not i.isdigit()]))

In [18]:
# removing english alphabets

dataset["review"] = dataset["review"].apply(lambda document: re.sub(r'[a-zA-Z]+', '', document))

In [20]:
# tokenizing the reviews using nltk

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

dataset["review"] = dataset["review"].apply(lambda document: word_tokenize(document))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [21]:
# stemming the reviews using nltk

stemmer = ISRIStemmer()
dataset["review"] = dataset["review"].apply(lambda document: [stemmer.stem(word) for word in document])

In [22]:
# checking the first 20 rows of the dataset

dataset.head(20)

Unnamed: 0,rating,review
12152,4,"[ثقف, لحا, يوم, نئب, ريف, ـتوفيق, حكم, قرء, لم..."
29753,1,"[كتب, سحق, كتر, ربع, نحم, ريه, ضعف, بنء, سوخ]"
9586,3,"[روي, ؤلم, تحك, قصة, هرب, رجل, عبر, خزن, ماء, ..."
48087,3,"[ذكر, كثيرا،, ذكر, لقص, كانت, شوق, جدا, وقت, ذ..."
33422,3,"[كتب, عرض, علم, شخص, اكن, درك, وقت, كانت, كمل,..."
49840,5,"[بدء, فى, قرء, روي, سطع, ترك, نهي, سلب, ماركيز..."
26984,5,"[اعظ, وصف, ليـ, عدم, وصف, كتـاب, ليم, عبر, كلم]"
55313,4,"[حدث, جود, جرب, وقع, لمع, شخص, عشت, فصل, رحل, ..."
14987,3,"[كتب, جيد, عرض, كتب, يبا, وبو, نظر, تجه, عرب, ..."
58434,5,"[كتب, قرأ, تحس, صغر, اوى]"


## Text representation

In [23]:
# saving the cleaned dataset

dataset.to_csv("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/cleaned_reviews.tsv", sep = '\t', index=False)

In [24]:
# loading the cleaned dataset

dataset = pd.read_csv("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/cleaned_reviews.tsv", sep = '\t')

In [30]:
# bag of words
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

X_train , X_test , y_train , y_test = train_test_split(dataset['review'], dataset['rating'], test_size = 0.2, random_state = SEED)

In [32]:
# creating the bag of words model

cv = CountVectorizer(max_features=5000)
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

## Performance evaluation

In [33]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# training the model
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# making predictions
y_pred = gnb.predict(X_test)

# calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")


The accuracy of the model is: 0.1288899983358296


In [None]:
# logistic regression

from sklearn.linear_model import LogisticRegression

# training the model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# making predictions
y_pred = lr.predict(X_test)

# calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is: {accuracy}")

In [None]:
# SVM 



scaler = StandardScaler()
svm = SVC()

scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

componenets = [100, 200, 500]
scores = []
for n_components in componenets:
    pca = PCA(n_components = n_components)
    X_train_SVM = pca.fit_transform(scaled_X_train)
    X_test_SVM  = pca.transform(scaled_X_test)

    svm.fit(X_train_SVM, y_train)
    y_pred = svm.predict(X_test_SVM)
    accuracy = accuracy_score(y_test, y_pred)
    scores.append(accuracy)

plt.plot(componenets, scores)
plt.xlabel('# of componenets')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# training the model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# making predictions
y_pred = rf.predict(X_test)

# calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is: {accuracy}")

## TF-IDF

In [None]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

X_train , X_test , y_train , y_test = train_test_split(dataset['review'], dataset['rating'], test_size = 0.2, random_state = SEED)

# creating the TF-IDF model 

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

### Performance Evaluation

In [None]:
# logistic regression

from sklearn.linear_model import LogisticRegression

# training the model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# making predictions
y_pred = lr.predict(X_test)

# calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")

In [None]:
# naive bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# training the model
gnb = MultinomialNB()
gnb.fit(X_train, y_train)

# making predictions
y_pred = gnb.predict(X_test)

# calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"The accuracy of the model is: {accuracy}")

In [None]:
# SVM 
scaler = StandardScaler()
pca = PCA(n_components = 200)
svm = SVC()

X_train_SVM = pca.fit_transform(scaler.fit_transform(X_train.copy()))
X_test_SVM = pca.transform(scaler.transform(X_test.copy()))

y_train_SVM = y_train
y_test_SVM = y_test

svm.fit(X_train_SVM, y_train_SVM)
y_pred = svm.predict(X_test_SVM)
accuracy = accuracy_score(y_test_SVM, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Random Forest 

rf = RandomForestClassifier()

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest accuracy: {accuracy * 100:.2f}%")

## LDA

In [None]:
# LDA 
X_train, X_test, y_train, y_test = train_test_split(data["review"], data["rating"], test_size = 0.2, random_state = SEED, stratify = data["rating"])

vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

lda = LatentDirichletAllocation(n_components = 170, random_state = SEED)
lda.fit(X_train)
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)

## LSA

In [None]:
lsa = TruncatedSVD(n_components = 10, random_state = SEED)
X_train_lsa = lsa.fit_transform(X_train)
X_test_lsa = lsa.transform(X_test)
model = LogisticRegression(max_iter = 1000)
model.fit(X_train_lsa, y_train)
y_pred = model.predict(X_test_lsa)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

## Bag of Concepts

## Word Embeddings

In [None]:
model = gensim.models.Word2Vec.load("./aravec/tweets_cbow_300")
word_vecs = {}
for tweet in data["tweet"]:
    for word in tweet.split(" "):
        try:
            word_vecs[word] = model.wv[word]
        except Exception:
            pass

## Clustering the words embeddings


In [None]:
NUM_CONCEPTS = 80

model = KMeans(n_clusters = NUM_CONCEPTS)
X = list(word_vecs.values())
model.fit(X)
concepts = model.predict(X)

## Concept Extraction

In [None]:
NUM_DOCS = data.shape[0]

# construct a word to concept mapping
word_concept = {}
for index, word in enumerate(word_vecs.keys()):
    word_concept[word] = concepts[index]
print(word_concept)

In [None]:
# construct a concept to document count mapping
concept_docs = defaultdict(int)
for doc in data["tweet"]:
    doc_concepts = set()
    for word in doc.split(" "):
        try:
            doc_concepts.add(word_concept[word])
        except Exception:
            pass
    for concept in doc_concepts:
        concept_docs[concept] += 1
print(concept_docs)

In [None]:
def cf_idf(document: str):
    """ Returns the CD-IDF representataion of a document """
    res = [0 for _ in range(NUM_CONCEPTS)]
    concepts_counts = defaultdict(int)
    for word in document.split(" "):
        try:
            concepts_counts[word_concept[word]] += 1
        except:
            pass
    n_k = sum(concepts_counts.values()) # number of concepts present in the document (duplicates are considered!)
    for concept in range(NUM_CONCEPTS):
        if concepts_counts[concept] != 0:
            res[concept] = (concepts_counts[concept] / n_k) * log(NUM_DOCS / (1 + concept_docs[concept]))
    return res

In [None]:
X = [cf_idf(tweet) for tweet in data["tweet"]]
y = data["class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = SEED, stratify = y)

### Performance Evaluation

In [None]:
# Naive Bayes

model = GaussianNB()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Logistic Regression

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# SVM 
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")