In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; it is read later via nltk.corpus.stopwords.
# The call returns True and is effectively a no-op when the package is already
# up to date (see the cell output).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Location of the AJGT spreadsheet (columns: ID, Feed, Sentiment).
# Kept in one named constant so the notebook can be pointed at another copy
# of the file without hunting through the code.
DATA_PATH = "/content/AJGT.xlsx"

data = pd.read_excel(DATA_PATH)

# First rows, in file order.
print(data.head())
# Random preview; seeded so a full re-run shows the same five rows.
print(data.sample(5, random_state=0))

   ID                                               Feed Sentiment
0   1   اربد فيها جامعات اكثر من عمان ... وفيها قد عم...  Positive
1   2   الحلو انكم بتحكوا على اساس انو الاردن ما فيه ...  Negative
2   3                            كله رائع بجد ربنا يكرمك  Positive
3   4                                 لسانك قذر يا قمامه  Negative
4   5  ​انا داشره وغير متزوجه ولدي علاقات مشبوه واحشش...  Negative
        ID                                               Feed Sentiment
1185  1186                                كن مع الله لا تبالي  Positive
197    198  الدين عباره عن اخلاق وحسن التعامل مع الناس وته...  Positive
1395  1396  معايا زميل زميل يلعن المسخره كلهم مايستحون ذووولي  Negative
120    121  اكيد طبعا حب ربنا احسن من حب اي حد وانا فعلا ع...  Positive
1400  1401                                      معلومه ممتازه  Positive


In [None]:
# Characters deleted from every tweet: Arabic / typographic punctuation followed
# by the ASCII punctuation set.  Some characters appear more than once; that is
# harmless because preprocess() only uses this string to build a deletion table.
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# string.punctuation = !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

# Arabic stop words from NLTK.
# stopwords.words() with no argument concatenates the lists of *every* language
# in the corpus, so the language is named explicitly.  A set gives O(1)
# membership tests for the per-token lookup done in preprocess().
stop_words = set(stopwords.words('arabic'))

# Tashkeel (short-vowel marks) plus tatweel.  Written with explicit escapes
# because the marks are combining characters that are easy to lose or mangle
# when they are embedded literally in source text.
arabic_diacritics = re.compile(
    "["
    "\u0651"  # Shadda
    "\u064E"  # Fatha
    "\u064B"  # Tanwin Fath
    "\u064F"  # Damma
    "\u064C"  # Tanwin Damm
    "\u0650"  # Kasra
    "\u064D"  # Tanwin Kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
    "]"
)

# Letter normalisation applied to every kept token: alef variants -> bare alef,
# alef maksura -> yeh, hamza carriers -> hamza, teh marbuta -> heh, gaf -> kaf.
# One translation table replaces six sequential regex passes.
_LETTER_NORMALIZATION = str.maketrans({
    'إ': 'ا',
    'أ': 'ا',
    'آ': 'ا',
    'ى': 'ي',
    'ؤ': 'ء',
    'ئ': 'ء',
    'ة': 'ه',
    'گ': 'ك',
})


def preprocess(text):
    '''
    Clean one Arabic text sample for vectorisation.

    Steps, in order:
      1. delete every character listed in ``punctuations``;
      2. delete tashkeel / tatweel (``arabic_diacritics``);
      3. split on whitespace and drop stop words;
      4. normalise letter variants in the remaining tokens.

    text: the raw sample. A missing value (None / NaN) is treated as an empty
          string and any other non-string value is converted with ``str``.

    Returns the cleaned tokens joined by single spaces (possibly '').
    '''

    if not isinstance(text, str):
        # Spreadsheet cells that are empty or purely numeric do not arrive as
        # str; without this guard .translate() raises AttributeError.
        text = '' if pd.isna(text) else str(text)

    # remove punctuations (every listed character maps to None)
    translator = str.maketrans('', '', punctuations)
    text = text.translate(translator)

    # remove Tashkeel
    text = re.sub(arabic_diacritics, '', text)

    kept = []
    for word in text.split():
        normalized = word.translate(_LETTER_NORMALIZATION)
        # Test the token both as written and after normalisation: checking only
        # the normalised form misses any stop word whose listed spelling
        # contains a letter that normalisation rewrites.
        if word in stop_words or normalized in stop_words:
            continue
        kept.append(normalized)

    return ' '.join(kept)
  
  
# Replace each raw tweet with its cleaned form, then preview the first rows.
cleaned_feed = data['Feed'].map(preprocess)
data['Feed'] = cleaned_feed
print(data.head())

   ID                                               Feed Sentiment
0   1  اربد جامعات اكثر عمان وفيها عمان ونص لعيبه الم...  Positive
1   2   الحلو انكم بتحكوا علي اساس انو الاردن فساد سرقات  Negative
2   3                            كله راءع بجد ربنا يكرمك  Positive
3   4                                    لسانك قذر قمامه  Negative
4   5  ​انا داشره وغير متزوجه ولدي علاقات مشبوه واحشش...  Negative


In [None]:
# splitting the data into target and feature
feature = data.Feed  # preprocessed tweet text
target = data.Sentiment  # sentiment label (the rows previewed above are 'Positive' / 'Negative')

# Fixed seed so the train/test split, and therefore the reported accuracy,
# is the same on every Restart & Run All.
RANDOM_STATE = 42

# splitting into train and tests; stratify keeps the class proportions of
# `target` the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature,
    target,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=target,
)

# The pipeline chains the vectorizer and the classifier: the TF-IDF matrix
# produced by the first step is fed straight into logistic regression, and
# both steps are fitted together on whatever data the pipeline is given.
pipe = make_pipeline(TfidfVectorizer(),
                     LogisticRegression())

# TfidfVectorizer: weights each word by how significant it is (rare words
# score higher).
# (tweet_no, word_index) --> how much is it significant.

print(pipe)

# param_grid for the hyperparameter C: inverse regularization strength.
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

# Grid Search CV: cross-validates every value in param_grid on the training
# data and keeps the best one.
model = GridSearchCV(pipe, param_grid)

model.fit(X_train, Y_train)


prediction = model.predict(X_test)

print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression', LogisticRegression())])
Accuracy score is 0.85
