# Import Important Libraries

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# Upload Datasets

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
Train_df = pd.read_csv('/content/drive/MyDrive/nlp_project/data_cleaning/preprocessed_train.csv')
Train_df.drop(columns='Unnamed: 0', inplace=True)
Train_df

Unnamed: 0,text,label
0,فين الذكور يا رجالة,LB
1,هه انت تشجع فنادي مش حاضرهم شايلين شامبيون اسي...,LY
2,فكرة حلوة و ممكن رسالة ع الموبايل,EG
3,ياا يالميدان كنت فين من زماان يارب كملها على خ...,EG
4,هما اعلامي التوك شو راحوا فين كله اعادة او مش ...,EG
...,...,...
118175,والله تذكرت يوم مات الحسن التاني كان من اول ال...,MA
118176,ممكن بس حد يدلني هو محماا علي ده ازاي حصل علي ...,EG
118177,ساكت ليه يا مجدي قوول وفضفض,EG
118178,سامعة الصراخ اللي طالع من جوه متخافيش دول صحاب...,EG


In [7]:
Test_df = pd.read_csv('/content/drive/MyDrive/nlp_project/data_cleaning/preprocessed_test.csv')
Test_df.drop(columns='Unnamed: 0', inplace=True)
Test_df

Unnamed: 0,text,label
0,احنا بيقنا الصبح استاذ مجدي يومك بيضحك,EG
1,يا مشحبطيني يا اني,LB
2,زي النهارده السادات كان يشعر بالحرب مع مبارك و...,EG
3,عطاهم عصيير في كاس كبيير,MA
4,ولا ما سافل وحقير الا انتم عايزين الراجل يتسجن...,EG
...,...,...
29540,بعلمك فيه أزمة بعد كام ساعة بيتفقوا شوو هالمسخ...,LB
29541,حاجات ممكن تغيب عنك بسيطة وسهلة معناها لكن باز...,LY
29542,وبعدين ليا هلبة وقت ممشيتش لهون,LY
29543,فنان هايل وكوميدي من الدرجة الاولي وطبيعي جدا ...,EG


# TF-IDF

In [9]:
print(Train_df['text'].isna().sum())
print(Test_df['text'].isna().sum())

157
37


In [13]:
print("Rows with NaN values in Train_df:")
Train_df[Train_df['text'].isna()]

Rows with NaN values in Train_df:


Unnamed: 0,text,label
152,,EG
695,,SD
1095,,EG
1657,,EG
4094,,LB
...,...,...
113092,,LY
113781,,EG
114711,,EG
115627,,EG


In [14]:
print("Rows with NaN values in Test_df:")
Test_df[Test_df['text'].isna()]

Rows with NaN values in Test_df:


Unnamed: 0,text,label
1204,,EG
1614,,LY
3004,,EG
3924,,EG
5118,,EG
5771,,LY
6363,,SD
6783,,MA
6824,,EG
7415,,EG


In [15]:
# Drop rows with NaN values in the 'text' column of Train_df
Train_df = Train_df.dropna(subset=['text'])

# Drop rows with NaN values in the 'text' column of Test_df
Test_df = Test_df.dropna(subset=['text'])

In [16]:
tfidf = TfidfVectorizer(use_idf=True)
X_train_counts = tfidf.fit_transform(Train_df['text'])
X_test_counts = tfidf.transform(Test_df['text'])

# ML Models

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('logreg', LogisticRegression())  # Logistic Regression model
])

param_grid = {
    'tfidf__max_df': [0.9, 0.95],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # unigrams or bigrams
    'logreg__solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'logreg__C': [0.1, 1.0, 10.0]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(Train_df['text'], Train_df['label'])

# Print best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'logreg__C': 10.0, 'logreg__solver': 'newton-cg', 'tfidf__max_df': 0.9, 'tfidf__ngram_range': (1, 2)}
Best cross-validation score: 0.83


In [17]:
clf = LogisticRegression(random_state=42).fit(X_train_counts, Train_df['label'])
y_pred=clf.predict(X_test_counts)
print(confusion_matrix(Test_df['label'],y_pred))
print(classification_report(Test_df['label'],y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[10744   171   473    26    93]
 [  438  4611   399    37    35]
 [  870   259  6008    79    74]
 [  330   138   378  1413    46]
 [  698   159   414    35  1580]]
              precision    recall  f1-score   support

          EG       0.82      0.93      0.87     11507
          LB       0.86      0.84      0.85      5520
          LY       0.78      0.82      0.80      7290
          MA       0.89      0.61      0.73      2305
          SD       0.86      0.55      0.67      2886

    accuracy                           0.83     29508
   macro avg       0.84      0.75      0.78     29508
weighted avg       0.83      0.83      0.82     29508



In [23]:
clf_balance = LogisticRegression(random_state=42,class_weight='balanced',solver='newton-cg',C=10).fit(X_train_counts, Train_df['label'])
y_pred=clf_balance.predict(X_test_counts)
print(confusion_matrix(Test_df['label'],y_pred))
print(classification_report(Test_df['label'],y_pred))

[[10113   275   560   160   399]
 [  262  4703   318   107   130]
 [  526   313  5993   234   224]
 [  163   130   218  1701    93]
 [  393   142   253    70  2028]]
              precision    recall  f1-score   support

          EG       0.88      0.88      0.88     11507
          LB       0.85      0.85      0.85      5520
          LY       0.82      0.82      0.82      7290
          MA       0.75      0.74      0.74      2305
          SD       0.71      0.70      0.70      2886

    accuracy                           0.83     29508
   macro avg       0.80      0.80      0.80     29508
weighted avg       0.83      0.83      0.83     29508



In [None]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree= dtree.fit(X_train_counts,Train_df['label'])
predictions = dtree.predict(X_test_counts)
print(confusion_matrix(Test_df['label'], predictions))
print(classification_report(Test_df['label'], predictions))

[[8869  602 1339  202  515]
 [ 868 3596  745  137  177]
 [1548  721 4432  309  290]
 [ 419  207  534 1005  143]
 [ 937  274  470   99 1107]]
              precision    recall  f1-score   support

          EG       0.70      0.77      0.73     11527
          LB       0.67      0.65      0.66      5523
          LY       0.59      0.61      0.60      7300
          MA       0.57      0.44      0.50      2308
          SD       0.50      0.38      0.43      2887

    accuracy                           0.64     29545
   macro avg       0.61      0.57      0.58     29545
weighted avg       0.64      0.64      0.64     29545



In [None]:
from sklearn.ensemble import StackingClassifier

level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('dtree', DecisionTreeClassifier()))
level1 =LogisticRegression()
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=3)
model.fit(X_train_counts,Train_df['label'])
yhat = model.predict(X_test_counts)
print(confusion_matrix(Test_df['label'],yhat))

print(classification_report(Test_df['label'],yhat))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[[10517   221   532    68   189]
 [  377  4604   375    72    95]
 [  722   287  5940   195   156]
 [  265    98   292  1584    69]
 [  521   165   353    69  1779]]
              precision    recall  f1-score   support

          EG       0.85      0.91      0.88     11527
          LB       0.86      0.83      0.84      5523
          LY       0.79      0.81      0.80      7300
          MA       0.80      0.69      0.74      2308
          SD       0.78      0.62      0.69      2887

    accuracy                           0.83     29545
   macro avg       0.81      0.77      0.79     29545
weighted avg       0.83      0.83      0.82     29545



# save the best model

In [26]:
joblib.dump(clf_balance, 'logistic_regression_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# reload the model and test it

In [27]:
%%capture
!pip install tnkeeh

In [28]:
import tnkeeh as tn
import re

def predict_label(text):

    # text preprocessing
    cleander = tn.Tnkeeh(remove_diacritics=True,
                     remove_html_elements=True,
                     remove_twitter_meta=True,
                     remove_links=True,
                     remove_english=True,
                     remove_repeated_chars=True,
                     remove_long_words=True,
                     normalize=True
                     )

    text = cleander.clean_raw_text(text)
    text = text[0]

    text = text.replace(r'[0-9٠-٩]', '')
    text = text.replace("؟", "")
    text = text.replace("@", "")
    text = text.replace("_", "")
    text = text.replace("-", "")

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    arabic_punctuation_pattern = r'[^\w\s\u0621-\u063A\u0641-\u064A]'
    text = re.sub(arabic_punctuation_pattern,'',text)

    text = re.sub(r'\s+', ' ', text).strip()

    # Load the model and the vectorizer
    clf_balance = joblib.load('logistic_regression_model.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')

    # Transform the input text
    text_transformed = tfidf.transform([text])

    # Predict the label
    predicted_label = clf_balance.predict(text_transformed)

    return predicted_label[0]

# Example prediction
text = "يازول"
predicted_label = predict_label(text)
print(f"The predicted label for '{text}' is: {predicted_label}")

The predicted label for 'يازول' is: SD
