<a href="https://colab.research.google.com/github/Shahad-Mohammed/NLP/blob/main/Sentment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import nltk as nltk
import re

from nltk.tokenize import word_tokenize
nltk.download('punkt')

from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.stem.snowball import SnowballStemmer

from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
pd.set_option('display.max_colwidth', 500)   # show up to 500 characters per cell so long reviews are not truncated in the output

# Load and Explore Data

In [None]:

# Location of the reviews CSV (the original note calls them "talbat" reviews).
REVIEWS_URL = "https://drive.google.com/u/0/uc?id=1DrtPc0NLOv7hvIxlyCuFiQZIudSeeFy4&export=download"

# Read only the first 32073 rows; the positional slice 1:-1 drops the first
# and the last column of the file.
reviews_df = pd.read_csv(REVIEWS_URL, nrows=32073).iloc[:, 1:-1]

# Give the two remaining columns descriptive names.
reviews_df.columns = ['description', 'rating']

In [None]:
# Preview the first rows to check the renamed columns.
reviews_df.head()

Unnamed: 0,description,rating
0,رائع,1
1,برنامج رائع جدا يساعد على تلبيه الاحتياجات بشكل اسرع,1
2,التطبيق لا يغتح دائما بيعطيني لا يوجد اتصال بالشبكة..مع انه النت عندي تمام شو الحل??,-1
3,لماذا لا يمكننا طلب من ماكدونالدز؟,-1
4,البرنامج بيظهر كل المطاعم و مغلقه مع انها بتكون فاتحه بقاله كده اكتر من شهر,-1


In [None]:
# Column dtypes and non-null counts; a lower non-null count reveals missing values.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32073 entries, 0 to 32072
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  32072 non-null  object
 1   rating       32073 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 501.3+ KB


In [None]:
# Number of reviews per rating value.
reviews_df.rating.value_counts()

 1    19894
-1    10697
 0     1482
Name: rating, dtype: int64

# Remove NaN Values

In [None]:
# Count rows for each combination of null / non-null flags across the columns.
reviews_df.isnull().value_counts()

description  rating
False        False     32072
True         False         1
dtype: int64

In [None]:
# Remove rows that contain a null value.
# Rebinding (instead of inplace=True) leaves the previously displayed frame
# untouched and keeps this cell safe to re-run.
reviews_df = reviews_df.dropna()

# Remove Neutral Ratings

In [None]:
# Keep only the rows whose rating is not 0, then show the remaining class counts.
is_not_neutral = reviews_df['rating'].ne(0)
reviews_df = reviews_df[is_not_neutral]

reviews_df.rating.value_counts()

 1    19894
-1    10696
Name: rating, dtype: int64

# Data Cleaning

In [None]:
# Character class used by clean_text: every character it matches is replaced by a space.
# It lists Latin and Arabic punctuation, Arabic diacritic marks and the Arabic-Indic
# digits range.
# NOTE(review): the unescaped `+-£` inside the class is parsed as the range
# U+002B–U+00A3, so ASCII digits and Latin letters are matched as well.
# Presumably only the literal '+', '-' and '£' were intended — confirm before
# changing, because the cleaned text shown below already has digits stripped by it.
punctuation_re = '[?؟!٪,،@#$%&*€+-£_~\“̯/=><.\۰):؛}{÷%("\'ًٌٍَُِّْ٠-٩]'

# Matches a run of one or more emoji / pictographic characters.
# Every supplementary-plane code point counts as emoji; inside the BMP only the
# symbol blocks listed below do. The BMP part is deliberately narrow: a broad
# range such as U+24C2–U+1F251 also covers CJK and the Arabic Presentation Forms
# (U+FB50–U+FEFF), which would make Arabic ligatures like 'ﻻ' count as emoji.
emoji_re = re.compile("["
        "\U00010000-\U0010FFFF"  # all supplementary planes (emoticons, pictographs, transport, flags, ...)
        "\u2600-\u2B55"          # miscellaneous symbols, dingbats, arrows, stars
        "\u24C2"                 # circled Latin capital letter M
        "\u25AA-\u25FE"          # geometric shapes
        "\u200d"                 # zero-width joiner used inside emoji sequences
        "\u23cf"                 # eject symbol
        "\u23e9"                 # fast-forward symbol
        "\u231a"                 # watch
        "\ufe0f"                 # variation selector-16 (emoji presentation)
        "\u3030"                 # wavy dash
        "\u303D"                 # part alternation mark
        "\u3297\u3299"           # circled ideographs "congratulation" / "secret"
        "]+", flags=re.UNICODE)


# NLTK's Arabic stop-word list with a handful of negation / exception words removed
# from it (presumably because they carry sentiment — confirm with the author).
stopwordsArabic = set(stopwords.words('arabic')) - {'لا', 'ما', 'إلا', 'ليس', 'لن', 'لم', 'دون', 'غير', 'لست'}
# set.update() iterates over each argument, so passing bare strings would add their
# individual characters. Passing one list adds the words themselves.
stopwordsArabic.update(['جدا', 'الله', 'والله', 'فقط', 'صراح', 'انا', 'او'])
# Snowball stemmer configured for Arabic; used by clean_text.
stemmer = SnowballStemmer("arabic")

In [None]:
def clean_text(text):
  """Normalise one raw review and return its stemmed tokens joined by single spaces.

  Steps, in order:
    1. every character matched by `punctuation_re` becomes a space;
    2. a run of the same character in U+0600–U+06FF is collapsed to one occurrence;
    3. Latin letters and '?' become spaces;
    4. the text is tokenised with NLTK and every token is stemmed with `stemmer`.

  No step here targets emoji, so they pass through unless another rule removes them.
  """
  cleaned = re.sub(punctuation_re, ' ', text)
  cleaned = re.sub(r'([\u0600-\u06FF])\1+', r'\1', cleaned)
  cleaned = re.sub(r'[a-zA-Z?]', ' ', cleaned)

  stems = map(stemmer.stem, nltk.word_tokenize(cleaned))
  return ' '.join(stems)

In [None]:
# Store the cleaned, stemmed form of every review in a new column and preview it.
reviews_df['clean_text'] = reviews_df['description'].apply(clean_text)
reviews_df.head()

Unnamed: 0,description,rating,clean_text
0,رائع,1,رايع
1,برنامج رائع جدا يساعد على تلبيه الاحتياجات بشكل اسرع,1,رنامج رايع جدا يساعد على تلب احتياج شكل اسرع
2,التطبيق لا يغتح دائما بيعطيني لا يوجد اتصال بالشبكة..مع انه النت عندي تمام شو الحل??,-1,تطبيق لا يغتح دايم يعط لا يوجد اتصال شبك مع انه النت عند تمام شو الحل
3,لماذا لا يمكننا طلب من ماكدونالدز؟,-1,لماذ لا يمك طلب من ماكدونالدز
4,البرنامج بيظهر كل المطاعم و مغلقه مع انها بتكون فاتحه بقاله كده اكتر من شهر,-1,برنامج يظهر كل مطاعم و مغلق مع انه تكو فاتح قال كده اكتر من شهر


# Remove Empty Reviews (Rows)

In [None]:
# Class counts before the empty-review filter, for comparison with the next cell.
reviews_df.rating.value_counts()

 1    19894
-1    10696
Name: rating, dtype: int64

In [None]:
# clean_text returns a string, so a review with nothing left after cleaning is ''
# (or whitespace only). Comparing against '[]' never matches and keeps such rows.
has_text = reviews_df['clean_text'].str.strip().ne('')
reviews_df = reviews_df[has_text]
reviews_df.rating.value_counts()

 1    19894
-1    10696
Name: rating, dtype: int64

# Make Balanced Data

In [None]:
# Order rows by rating (ascending), which places the rating == 1 rows last.
reviews_df = reviews_df.sort_values('rating')


In [None]:
# Drop the last 9000 rows by position. This assumes the frame is still sorted by
# rating, so the removed rows all come from the larger rating == 1 class.
reviews_df = reviews_df.iloc[:-9000]
reviews_df.rating.value_counts()

 1    10894
-1    10696
Name: rating, dtype: int64

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of keeping it
# as a column. Positional alignment matters later, when feature frames built with a
# fresh default index are combined with columns of this frame.
reviews_df = reviews_df.reset_index(drop=True)
reviews_df

Unnamed: 0,description,rating,clean_text
0,كان عندي خصم 50 جنية و لما جيت اطلب مخصمليش حاجة و في ضريبة 25 جنية انا كنت عامل حسابي أن بعد الخصم هيبقي 140 جنية بس مكنش في خصم و التطبيق كان قائلي أن في خصم مخصمليش حاجة و كمان في ضريبة 25 جنية و توصيل 10 جنية طلع الحساب 220,-1,كان عند خصم جن و لما جيت اطلب مخصمليش حاج و في ضريب جن انا كنت عامل حساب أن بعد خصم هيبق جن بس مكنش في خصم و تطبيق كان قايل أن في خصم مخصمليش حاج و كما في ضريب جن و توصيل جن طلع حساب
1,تطبيق كان افضل ومن جديد التوصيل بطيء جدا,-1,تطبيق كان افضل ومن جديد توصيل طيء جدا
2,سي جداً,-1,سي جدا
3,تطبيق غير مفيد لسكان الشروق لانه لا يوجد فيه غير ٤ سوبرماركت ودائما مشغولين,-1,تطبيق غير مفيد لسكا شروق لان لا يوجد فيه غير سوبرمار دايم مشغول
4,برنامج يقرف صراحه ربنا,-1,رنامج يقرف صراح ربن
...,...,...,...
21585,البرنامج جيد جدا ممتاز هو يوصل الطلب في معاد مظبوط,1,برنامج جيد جدا متاز هو يوصل طلب في معاد مظبوط
21586,I love you pink,1,
21587,جيد جدا.. لكن في مشكلة لايظهر لي الاخيارات محدودة مثلا سامح مول وكريم مول وسيفوي غير موجود وانا بحاجاهم في هذا التطبيق لايظهر لي الا كم مطعم وخمس محلات حلويات فقط ومحمصين فقط لاغير ارجو حل المشكلة ضروري جدا,1,جيد جدا لكن في مشكل ايظهر لي اخيار محدود مثل سامح مول كريم مول سيف غير موجود وان حاج في هذا تطبيق ايظهر لي الا كم مطعم خمس محل حلو فقط محمص فقط اغير ارج حل مشكل ضرور جدا
21588,حلو 🗣️,1,حلو 🗣️


# Extract Features

## Using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Fit a TF-IDF vocabulary on the cleaned reviews and transform them in one pass.
tdidf_vect = TfidfVectorizer()
tdidf = tdidf_vect.fit_transform(reviews_df['clean_text'])

# Dense frame with one column per vocabulary term (toarray() materialises the
# whole sparse matrix in memory).
features_tdidf = pd.DataFrame(
    tdidf.toarray(),
    columns=tdidf_vect.get_feature_names_out(),
)
features_tdidf

Unnamed: 0,²¹³,²ش,ßãšس,ùů,čğīğ,ηξ,ιδέα,να,рщй,ьииь,...,ﻟﻴﺶ,ﻣﺎﻛﻮ,ﻵحق,ﻷكل,ﻷنه,ﻻزم,ﻻكن,ﻻيتم,ﻻيظهر,ﻻيوجد
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# count_vect= CountVectorizer()
# count = count_vect.fit_transform(reviews_df['clean_text'])


# features_count_vect = pd.DataFrame(count.toarray())
# features_count_vect.columns = count_vect.get_feature_names_out()
# features_count_vect

## Other Features


### Types of Emojis

In [None]:
def extractEmoji(text):
  """Return the distinct emoji characters of `text` as one space-separated string.

  A character counts as an emoji when `emoji_re` matches it. The characters are
  sorted so the result is deterministic; text without emoji yields ''.
  (Returning str(set(...)) would give strings like "set()" or "{'x'}", whose only
  word-like token is "set".)
  """
  distinct = sorted({char for char in text if emoji_re.match(char)})
  return ' '.join(distinct)

reviews_df['type_of_emoji'] = reviews_df['description'].apply(extractEmoji)

# analyzer=str.split turns every whitespace-separated emoji into its own token.
# The default token pattern only keeps runs of two or more word characters and
# would therefore discard every emoji.
# NOTE: fit_transform raises ValueError (empty vocabulary) if no review has an emoji.
tdidf_e = TfidfVectorizer(analyzer=str.split)
tdidf = tdidf_e.fit_transform(reviews_df['type_of_emoji'])
number_of_emoji = pd.DataFrame(tdidf.toarray())
number_of_emoji.columns = tdidf_e.get_feature_names_out()

### Number of Words

In [None]:
# split() without an argument splits on any run of whitespace, so repeated spaces,
# tabs and newlines do not produce empty "words" that inflate the count.
reviews_df['number_of_words'] = reviews_df['description'].apply(lambda x: len(x.split()))

### Number of Punctuation

In [None]:
def count(text):
  """Return how many '!', '?' and '؟' characters occur in `text`.

  Note: a later cell defines another function that is also named `count`.
  """
  marks = ('!', '?', '؟')
  return sum(text.count(mark) for mark in marks)

# Per-review number of '!', '?' and '؟' characters, computed on the raw description.
reviews_df['number_of_?!'] = reviews_df['description'].apply(count)

# print(reviews_df['number_of_?!'].value_counts())

# reviews_df[reviews_df['rating'] == -1].describe()

# reviews_df.head(1000)

### Number of Emojis

In [None]:
def count(x):
  """Return the number of single characters in `x` that `emoji_re` matches.

  Note: this reuses the name `count` of the punctuation counter defined in an
  earlier cell and replaces it from here on.
  """
  return len([ch for ch in x if emoji_re.match(ch)])

# Per-review emoji count, computed on the raw description; preview the new column.
reviews_df['Number_of_emoji'] = reviews_df['description'].apply(count)
reviews_df.head()

# reviews_df[reviews_df['count_emoji'] > 0].rating.value_counts()

Unnamed: 0,description,rating,clean_text,type_of_emoji,number_of_words,number_of_?!,Number_of_emoji
0,كان عندي خصم 50 جنية و لما جيت اطلب مخصمليش حاجة و في ضريبة 25 جنية انا كنت عامل حسابي أن بعد الخصم هيبقي 140 جنية بس مكنش في خصم و التطبيق كان قائلي أن في خصم مخصمليش حاجة و كمان في ضريبة 25 جنية و توصيل 10 جنية طلع الحساب 220,-1,كان عند خصم جن و لما جيت اطلب مخصمليش حاج و في ضريب جن انا كنت عامل حساب أن بعد خصم هيبق جن بس مكنش في خصم و تطبيق كان قايل أن في خصم مخصمليش حاج و كما في ضريب جن و توصيل جن طلع حساب,set(),52,0,0
1,تطبيق كان افضل ومن جديد التوصيل بطيء جدا,-1,تطبيق كان افضل ومن جديد توصيل طيء جدا,set(),8,0,0
2,سي جداً,-1,سي جدا,set(),2,0,0
3,تطبيق غير مفيد لسكان الشروق لانه لا يوجد فيه غير ٤ سوبرماركت ودائما مشغولين,-1,تطبيق غير مفيد لسكا شروق لان لا يوجد فيه غير سوبرمار دايم مشغول,set(),14,0,0
4,برنامج يقرف صراحه ربنا,-1,رنامج يقرف صراح ربن,set(),4,0,0


In [None]:
# [reviews_df['number_of_words'], reviews_df['number_of_?!'], reviews_df['Number_of_emoji'],

# Machine Learning Model


## Final Features

In [None]:
# Target vector: the rating column.
labels = reviews_df['rating']
# Extra feature sets that are currently left out of the concat list:
# reviews_df['number_of_words'], reviews_df['number_of_?!'], number_of_emoji
# Only the TF-IDF frame is used; concat with axis=1 joins inputs column-wise.
features = pd.concat( [features_tdidf], axis=1)
features.shape

(21590, 13666)

## Split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. The fixed random_state makes the split —
# and therefore every score reported below — reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(16192, 13666) (5398, 13666) (16192,) (5398,)


## Selection Method

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# n_jobs=-1 trains the trees on all available cores; random_state fixes the
# forest's randomness so the reported accuracy is repeatable.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(x_train, y_train)
# score() returns the mean accuracy on the held-out set.
print(model.score(x_test, y_test))

0.8525379770285291


### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# fit() returns the estimator itself, so construction and training can be chained.
NBmodel = GaussianNB().fit(x_train, y_train)
# Mean accuracy on the held-out set.
print(NBmodel.score(x_test, y_test))

0.6798814375694702


In [None]:
from sklearn.naive_bayes import MultinomialNB
# Rebinds NBmodel to a multinomial naive Bayes classifier trained on the same split.
NBmodel = MultinomialNB().fit(x_train, y_train)
# Mean accuracy on the held-out set.
print(NBmodel.score(x_test, y_test))

0.8547610226009633


### SVC

In [None]:
from sklearn import model_selection, naive_bayes, svm

# Linear-kernel support vector classifier. `degree` applies only to the 'poly'
# kernel and `gamma` only to 'rbf', 'poly' and 'sigmoid'; both are ignored by a
# linear kernel, so they are omitted to avoid implying they were tuned.
SVM_model = svm.SVC(C=1.0, kernel='linear')
SVM_model.fit(x_train, y_train)
# Mean accuracy on the held-out set.
print(SVM_model.score(x_test, y_test))

0.867728788440163
