## Setting up the environment

In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/MyDrive paths used
# for the dataset and the saved models in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Arabic text-processing packages imported further down
# (pyarabic.araby and arabicstopwords). Versions are not pinned here.
!pip install pyarabic
!pip install Arabic-Stopwords

In [None]:
# Fetch the NLTK resources used by this notebook.
# nltk is imported here because this cell runs before the main import cell.
import nltk

# Arabic stop-word list read later through stopwords.words('arabic').
nltk.download('stopwords')
# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases look for
# 'punkt_tab', older ones for 'punkt', so both are requested; an identifier that
# the installed release does not know is reported by nltk.download and skipped.
nltk.download('punkt')
nltk.download('punkt_tab')

# LABR dataset

- LABR stands for Large-Scale Arabic Book Reviews.
- It's a collection of over 63,000 book reviews written in Arabic.
- Each review comes with a rating on a scale of 1 to 5 stars.

## loading libraries

In [None]:
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import arabicstopwords.arabicstopwords as stp
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import pyarabic.araby as araby
from sklearn.model_selection import train_test_split

## Loading dataset

The `reviews.tsv` file contains the following tab-separated columns, in this order:

- rating - review id - user id - book id - review

In [None]:
# Read the raw LABR reviews (tab-separated, no header row) and shuffle them.
SEED = 21

raw_reviews_path = "/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/reviews.tsv"
column_names = ["rating", "review_id", "user_id", "book_id", "review"]

dataset = pd.read_csv(raw_reviews_path, sep='\t', header=None, names=column_names)

# frac=1 returns every row in random order; SEED makes the order repeatable.
dataset = dataset.sample(frac=1, random_state=SEED)

dataset.head(20)

In [None]:
# checking the shape of the dataset: (number of rows, number of columns)
dataset.shape

In [None]:
# column dtypes and non-null counts (a first look at missing values)
dataset.info()

In [None]:
# summary statistics (count, mean, std, quartiles, ...) of the columns
dataset.describe()

In [None]:
# number of missing values in each column
dataset.isnull().sum()

In [None]:
# number of distinct values in each column
dataset.nunique()

In [None]:
# Share of each rating value, expressed as a percentage of all reviews.
rating_share = dataset['rating'].value_counts(normalize=True)
print(rating_share * 100)

In [None]:
# plotting the distribution of the ratings
# The plot call is not wrapped in print(): printing it only echoes the Axes repr.
# sort_index() orders the bars by rating value instead of by frequency, and the
# axes are labelled so the figure can be read on its own.
rating_share = dataset['rating'].value_counts(normalize=True).sort_index()
ax = rating_share.plot(kind='bar')
ax.set_title('Distribution of ratings')
ax.set_xlabel('rating')
ax.set_ylabel('share of reviews')
plt.show()

## Preprocessing

In [None]:
# Keep only the rating and the review text by dropping the identifier columns.
id_columns = ['review_id', 'user_id', 'book_id']
dataset = dataset.drop(columns=id_columns)

In [None]:
# Remove rows whose review text repeats; keep='first' retains the first occurrence.
dedup_keys = ['review']
dataset = dataset.drop_duplicates(subset=dedup_keys, keep='first')

In [None]:
# Replace every run of whitespace (spaces, tabs, newlines) with a single space.

pattern = r'\s+|\n+'


def _collapse_whitespace(text):
    """Return ``text`` with each match of ``pattern`` replaced by one space."""
    return re.sub(pattern, ' ', text)


dataset["review"] = dataset["review"].apply(_collapse_whitespace)

In [None]:
# Strip punctuation and decorative symbols.
# The first alternative deletes runs of characters that are neither word
# characters, whitespace, nor inside U+0600-U+06FF; the remaining alternatives
# list individual symbols that are deleted explicitly.

pattern = r'[^\w\s\u0600-\u06FF]+|ﷺ|۩|⓵|؟|۞|ﷻ'


def _strip_punctuation(text):
    """Return ``text`` with every match of ``pattern`` deleted."""
    return re.sub(pattern, '', text)


dataset["review"] = dataset["review"].apply(_strip_punctuation)

In [None]:
# Collapsing elongated characters (a character repeated for emphasis).
# Only runs of three or more identical characters are reduced to a single one.
# Ordinary Arabic spelling contains legitimate doubled letters (for example
# "الله" or "ممتاز"), which a blanket `(.)\1+` rule would turn into other words.

pattern = r'(.)\1{2,}'
dataset["review"] = dataset["review"].apply(lambda document: re.sub(pattern, r'\1', document))

In [None]:
# Drop Arabic stop words, using the union of the NLTK list and the
# arabicstopwords list.

stop_words = set(stopwords.words('arabic')) | set(stp.stopwords_list())


def _drop_stop_words(text):
    """Rebuild ``text`` from its whitespace-separated words that are not stop words."""
    kept = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept)


dataset["review"] = dataset["review"].apply(_drop_stop_words)

In [None]:
# Strip Arabic diacritics (tashkeel) from every review.

dataset["review"] = dataset["review"].apply(araby.strip_tashkeel)

In [None]:
# Delete every character for which str.isdigit() is true.

def _drop_digits(text):
    """Return ``text`` without its digit characters."""
    return ''.join(filter(lambda ch: not ch.isdigit(), text))


dataset["review"] = dataset["review"].apply(_drop_digits)

In [None]:
# Delete runs of ASCII Latin letters (a-z, A-Z).

latin_letters = r'[a-zA-Z]+'
dataset["review"] = dataset["review"].apply(lambda text: re.sub(latin_letters, '', text))

In [None]:
# Split every review into a list of tokens with NLTK's word_tokenize.

import nltk
from nltk.tokenize import word_tokenize

dataset["review"] = dataset["review"].apply(word_tokenize)

In [None]:
# Reduce every token to its stem with NLTK's ISRI stemmer.

stemmer = ISRIStemmer()


def _stem_tokens(tokens):
    """Return the list of stems for a list of tokens."""
    return list(map(stemmer.stem, tokens))


dataset["review"] = dataset["review"].apply(_stem_tokens)

In [None]:
# checking the first 20 rows of the dataset after the preprocessing steps

dataset.head(20)

## Text representation

In [None]:
# saving the cleaned dataset
# After tokenising and stemming, the review column holds Python lists. Writing
# them directly would store the list repr (brackets, quotes, commas) in the file,
# so each list is joined back into a space-separated string first. Values that
# are already strings are written unchanged.

cleaned = dataset.copy()
cleaned["review"] = cleaned["review"].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)
# An empty field is read back as NaN by read_csv, so rows left without any text
# are not written.
cleaned = cleaned[cleaned["review"].str.strip() != ""]

cleaned.to_csv("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/cleaned_reviews.tsv", sep = '\t', index=False)

In [None]:
# Read the cleaned reviews back from Drive (tab-separated, first row is the header).

cleaned_reviews_path = "/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/cleaned_reviews.tsv"
dataset = pd.read_csv(cleaned_reviews_path, sep='\t')

In [None]:
# Bag of words: hold out 20% of the rows as the test split.
# X_train / X_test are still DataFrames here (rating and review columns).

from sklearn.feature_extraction.text import CountVectorizer

split_frames = train_test_split(dataset, random_state=SEED, test_size=0.2)
X_train, X_test = split_frames

In [None]:
# Fit the vocabulary (capped at 5000 terms) on the training reviews only, then
# encode both splits as dense count matrices.

cv = CountVectorizer(max_features=5000)
train_counts = cv.fit_transform(X_train['review'])
test_counts = cv.transform(X_test['review'])
X_train = train_counts.toarray()
X_test = test_counts.toarray()

In [None]:
# creating the target variable
# X_train / X_test are NumPy arrays at this point and have no `.index`, so the
# labels are taken from a split of the rating column instead. With the same
# number of rows, test_size and random_state, train_test_split selects the same
# rows as the split of `dataset` used for the features, which keeps the labels
# aligned with the feature matrices.

y_train, y_test = train_test_split(dataset['rating'], test_size=0.2, random_state=SEED)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [None]:
# Persist the fitted vectorizer and the train/test arrays to Drive.

import pickle

with open("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/models/bag_of_words_model.pkl", "wb") as model_file:
    pickle.dump(cv, model_file)

# Each target path is paired with the array written to it.
array_targets = {
    "/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/X_train.npy": X_train,
    "/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/X_test.npy": X_test,
    "/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/y_train.npy": y_train,
    "/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/y_test.npy": y_test,
}
for target_path, array in array_targets.items():
    np.save(target_path, array)

In [None]:
# Restore the fitted bag-of-words vectorizer from Drive.
# pickle.load can execute arbitrary code, so only open files this notebook wrote.

with open("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/models/bag_of_words_model.pkl", "rb") as model_file:
    cv = pickle.load(model_file)

In [None]:
# Reload the saved feature matrices and label vectors from Drive.

X_train, X_test = (
    np.load("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/X_train.npy"),
    np.load("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/X_test.npy"),
)

y_train, y_test = (
    np.load("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/y_train.npy"),
    np.load("/content/drive/MyDrive/Parcours Academique/ENSAM/PFA/datasets/LABR/y_test.npy"),
)

## Performance evaluation

In [None]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# training the model
# MultinomialNB is the class imported above and the variant meant for word-count
# features. GaussianNB was never imported, so instantiating it raised NameError.
# The variable keeps the name `gnb` used by the rest of the notebook.
gnb = MultinomialNB()

gnb.fit(X_train, y_train)

# making predictions

y_pred = gnb.predict(X_test)

# calculating the accuracy

accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")


In [None]:
# Logistic regression on the current feature matrices.

from sklearn.linear_model import LogisticRegression

# Build and fit in one step; fit() returns the estimator itself.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict ratings for the held-out rows and score them against the true labels.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")

In [None]:
# Support vector classifier with scikit-learn's default settings.

from sklearn.svm import SVC

# Build and fit in one step; fit() returns the estimator itself.
svm = SVC().fit(X_train, y_train)

# Predict ratings for the held-out rows and score them against the true labels.
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# training the model
# random_state is fixed to the notebook-wide SEED: a random forest draws random
# bootstrap samples and feature subsets, so without a seed the reported accuracy
# changes from run to run.

rf = RandomForestClassifier(random_state=SEED)

rf.fit(X_train, y_train)

# making predictions

y_pred = rf.predict(X_test)

# calculating the accuracy

accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")

## TF-IDF

In [None]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Split once and keep the frames under their own names, so that the labels can be
# read from the very same rows as the features.
train_frame, test_frame = train_test_split(dataset, test_size=0.2, random_state=SEED)

# Labels are set explicitly here instead of relying on y_train / y_test left over
# from the bag-of-words cells.
y_train = train_frame['rating'].to_numpy()
y_test = test_frame['rating'].to_numpy()

# creating the tf-idf model
# The vocabulary and IDF weights are fitted on the training reviews only; the test
# reviews are transformed with that fitted model.

tfidf = TfidfVectorizer(max_features=5000)

X_train = tfidf.fit_transform(train_frame['review']).toarray()

X_test = tfidf.transform(test_frame['review']).toarray()

In [None]:
# Logistic regression on the TF-IDF feature matrices.

from sklearn.linear_model import LogisticRegression

# Build and fit in one step; fit() returns the estimator itself.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict ratings for the held-out rows and score them against the true labels.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")

In [None]:
# Multinomial Naive Bayes on the TF-IDF feature matrices.

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Build and fit in one step; fit() returns the estimator itself.
gnb = MultinomialNB().fit(X_train, y_train)

# Predict ratings for the held-out rows and score them against the true labels.
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"The accuracy of the model is: {accuracy}")