In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from pyarabic.araby import strip_tashkeel, strip_tatweel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('Emotional-Tone-Dataset.csv')
df

Unnamed: 0,ID,TWEET,LABEL
0,1,الاوليمبياد الجايه هكون لسه ف الكليه ..,none
1,2,عجز الموازنه وصل ل93.7 % من الناتج المحلي يعني...,anger
2,3,كتنا نيله ف حظنا الهباب xD,sadness
3,4,جميعنا نريد تحقيق اهدافنا لكن تونس تالقت في حر...,joy
4,5,الاوليمبياد نظامها مختلف .. ومواعيد المونديال ...,none
...,...,...,...
10060,10061,2222: يلا يا جماعه حفله عمرو دياب خلصت نريح شو...,sadness
10061,10062,Mohamed5: اييييه دااا 😲😲 اوزيييل❤,surprise
10062,10063,عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد 😃,none
10063,10064,وعليك قبلنا يانجم النجوم ياعندليب الحب والاحساس,joy


# **PART ONE:**
## Text Preprocessing

In [3]:
stop_words = set(stopwords.words('arabic'))

def normalize_repeated_letters(text):

    return re.sub(r'(.)\1{2,}', r'\1', text)

def preprocess(text):
    
    text = normalize_repeated_letters(text)

    text = re.sub(r'[إأآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ة', 'ه', text)

    
    text = strip_tashkeel(strip_tatweel(text))


    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)


    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(tokens)

    
df['clean_text'] = df[' TWEET'].astype(str).apply(preprocess)

In [4]:
df['clean_text']

0                       الاوليمبياد الجايه هكون لسه الكليه
1        عجز الموازنه وصل الناتج المحلي يعني لسه اقل نف...
2                                    كتنا نيله حظنا الهباب
3        جميعنا نريد تحقيق اهدافنا تونس تالقت حراسه المرمي
4        الاوليمبياد نظامها مختلف ومواعيد المونديال مكا...
                               ...                        
10060    يلا جماعه حفله عمرو دياب خلصت نريح شويه ونبدا ...
10061                                            ايه اوزيل
10062        عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد
10063      وعليك قبلنا يانجم النجوم ياعندليب الحب والاحساس
10064    يطلع ننهم سيء ووضيع خساسه العالم تجمعت الايران...
Name: clean_text, Length: 10065, dtype: object

## Label Encoding and Data Splitting

In [5]:
le = LabelEncoder()
df['label'] = le.fit_transform(df[' LABEL'])

X = df['clean_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
df['label']

0        4
1        0
2        5
3        2
4        4
        ..
10060    5
10061    6
10062    4
10063    2
10064    0
Name: label, Length: 10065, dtype: int32

In [7]:
'''
#We treated the none label as NEUTRAL/NO sentiment
0 -> anger
1 -> fear
2 -> joy
3 -> love
4 -> neutral (none)
5 -> sadness
6 -> surprise
7 -> sympathy
'''
from sklearn.preprocessing import LabelEncoder


labels = df[' LABEL']

for code, label in enumerate(le.classes_):
    print(f"{code} -> {label}")

0 -> anger
1 -> fear
2 -> joy
3 -> love
4 -> none
5 -> sadness
6 -> surprise
7 -> sympathy


# **PART TWO:**
## Text Representations
### **1) TF-IDF**

In [8]:
tfidf = TfidfVectorizer()
X_tfidf_train = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)

### **GridSearchCV for SVM**

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear']
}
grid = GridSearchCV(SVC(), param_grid, scoring='f1_weighted', cv=3)
grid.fit(X_tfidf_train, y_train)

print("Best Params:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)

Best Params: {'C': 1, 'kernel': 'linear'}
Best F1 Score: 0.6275371105671033


In [10]:
'''print(X_tfidf_train.toarray())  # Print training TF-IDF matrix
print(X_tfidf_test.toarray())   # Print test TF-IDF matrix
'''

print(X_tfidf_train.shape)  
print(tfidf.get_feature_names_out()[20:50]) 

(8052, 28312)
['ابا' 'اباء' 'اباح' 'اباده' 'اباظه' 'ابالغ' 'ابالي' 'ابتداء' 'ابتداءي'
 'ابتدايي' 'ابتدوا' 'ابتدي' 'ابتديت' 'ابتديتوا' 'ابتدينا' 'ابتزاز'
 'ابتسام' 'ابتسامتك' 'ابتسامه' 'ابتسسم' 'ابتسم' 'ابتسمت' 'ابتسمنا'
 'ابتسمي' 'ابتعدي' 'ابتكارات' 'ابتلاء' 'ابتلانا' 'ابتهاج' 'ابتهاجا']


### **2) Word2Vec (Using AraVec)**

In [8]:
import zipfile
from gensim.models import Word2Vec

# Unzip the file
with zipfile.ZipFile('tweet_cbow_300.zip', 'r') as zip_ref:
    zip_ref.extractall('tweet_cbow_300')

model = Word2Vec.load('tweet_cbow_300/tweets_cbow_300')

def get_w2v_avg(texts, model, dim=300):
    import numpy as np
    vectors = []
    for text in texts:
        tokens = text.split()
        vecs = [model.wv[w] for w in tokens if w in model.wv]
        if vecs:
            vectors.append(np.mean(vecs, axis=0))
        else:
            vectors.append(np.zeros(dim))
    return np.array(vectors)

X_w2v_train = get_w2v_avg(X_train, model)
X_w2v_test = get_w2v_avg(X_test, model)

In [12]:
vector = model.wv['الاوليمبياد']
vector

array([ 4.35434431e-01, -1.51483759e-01,  3.37284058e-01, -3.55382979e-01,
       -4.92361069e-01, -2.73032784e-01,  1.21263318e-01, -3.40679824e-01,
       -6.83230311e-02, -1.00465231e-01, -5.51082909e-01, -8.98964144e-03,
       -1.08159799e-02, -4.55033660e-01, -8.59628201e-01,  4.73581962e-02,
        6.12171181e-02, -2.88522780e-01, -1.80102289e-01, -1.89193189e-01,
       -3.72576155e-02,  5.61842263e-01,  1.00205541e+00,  4.72833127e-01,
       -2.90700436e-01,  2.40467682e-01,  3.10135603e-01,  3.69511023e-02,
       -1.52749002e-01, -2.69441158e-01,  1.42771110e-01,  3.89136195e-01,
        2.73712358e-04,  1.88248217e-01, -1.22218125e-01, -1.39235482e-01,
       -4.63970929e-01, -1.37733489e-01, -4.95946817e-02,  2.15869486e-01,
        4.50125962e-01, -3.48630105e-03, -1.53986916e-01,  2.13411510e-01,
       -8.41119215e-02,  2.67761022e-01,  6.97255507e-02,  1.48830786e-01,
        1.70076355e-01, -4.53415126e-01, -3.15578997e-01, -3.15471798e-01,
       -1.07801192e-01, -

### **3) BoW**

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
bow = CountVectorizer()
X_bow_train = bow.fit_transform(X_train)
X_bow_test = bow.transform(X_test)

In [14]:
# Total frequency of each word across all tweets (corpus-wide)

row = 84

nonzero_indices = X_bow_train[row].nonzero()[1]

features = bow.get_feature_names_out()
counts = X_bow_train[row, nonzero_indices].toarray().flatten()

for feat, count in zip(features[nonzero_indices], counts):
    print(f"{feat}: {count}")

الله: 1
دول: 1
النظام: 2
سوريا: 1
فالبدايه: 1
شعب: 3
يريد: 2
اصلاح: 1
اسقاط: 1
يدفع: 1
ثمن: 1
صراعات: 1
واقوام: 1
عونهم: 1


# **PART THREE:**

In [15]:
#TF-IDF
'''
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

models = {
    "Naive Bayes": MultinomialNB(),
    "SVM (RBF Kernel)": SVC(kernel='rbf'), # rbf: Radial Basis Function Kernal / Gaussian Kernal (not linearly separable)
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

for name, model in models.items():
    print(f"\n{name} with TF-IDF")
    model.fit(X_tfidf_train, y_train)
    y_pred = model.predict(X_tfidf_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))


#SVM Based on the report
'''
# Using Grid-Search tuning

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

models = {
    "Naive Bayes": MultinomialNB(),
    "SVM (Linear Kernel, GridSearchCV)": SVC(C=1, kernel='linear'), #C is the regularization parameter. 
                                                                    #It controls how much you want to avoid misclassifying each training example.
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

for name, model in models.items():
    print(f"\n{name} with TF-IDF")
    model.fit(X_tfidf_train, y_train)
    y_pred = model.predict(X_tfidf_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))


Naive Bayes with TF-IDF
              precision    recall  f1-score   support

       anger       0.59      0.66      0.63       276
        fear       0.78      0.89      0.83       259
         joy       0.59      0.29      0.39       268
        love       0.66      0.67      0.67       250
        none       0.42      0.94      0.58       307
     sadness       0.62      0.22      0.33       258
    surprise       0.85      0.17      0.28       201
    sympathy       0.81      0.82      0.81       194

    accuracy                           0.60      2013
   macro avg       0.67      0.58      0.57      2013
weighted avg       0.65      0.60      0.57      2013


SVM (Linear Kernel, GridSearchCV) with TF-IDF
              precision    recall  f1-score   support

       anger       0.52      0.71      0.60       276
        fear       0.99      0.88      0.93       259
         joy       0.53      0.40      0.46       268
        love       0.66      0.61      0.63       250
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
gnb = GaussianNB()
gnb.fit(X_w2v_train, y_train)
y_pred = gnb.predict(X_w2v_test)
# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0     0.5134    0.6268    0.5644       276
           1     0.6420    0.6371    0.6395       259
           2     0.4267    0.2388    0.3062       268
           3     0.5443    0.6880    0.6078       250
           4     0.4562    0.7459    0.5661       307
           5     0.2418    0.0853    0.1261       258
           6     0.3270    0.2587    0.2889       201
           7     0.6119    0.6340    0.6228       194

    accuracy                         0.4968      2013
   macro avg     0.4704    0.4893    0.4652      2013
weighted avg     0.4696    0.4968    0.4673      2013

Confusion Matrix:


NameError: name 'confusion_matrix' is not defined

In [16]:
#Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000), # We tried iter [1000-5000] but still the same result
    "SVM (RBF Kernel)": SVC(kernel='rbf'),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

for name, model in models.items():
    print(f"\n{name} with Word2Vec")
    model.fit(X_w2v_train, y_train)
    y_pred = model.predict(X_w2v_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))


#Random Forest based on the report


Logistic Regression with Word2Vec
              precision    recall  f1-score   support

       anger       0.63      0.71      0.67       276
        fear       0.89      0.88      0.88       259
         joy       0.55      0.49      0.52       268
        love       0.68      0.64      0.66       250
        none       0.62      0.72      0.66       307
     sadness       0.50      0.45      0.47       258
    surprise       0.44      0.42      0.43       201
    sympathy       0.81      0.78      0.80       194

    accuracy                           0.64      2013
   macro avg       0.64      0.64      0.64      2013
weighted avg       0.64      0.64      0.64      2013


SVM (RBF Kernel) with Word2Vec
              precision    recall  f1-score   support

       anger       0.63      0.74      0.68       276
        fear       0.95      0.82      0.88       259
         joy       0.53      0.52      0.52       268
        love       0.75      0.68      0.71       250
        non

In [17]:
#BoW

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

models = {
    "Naive Bayes": MultinomialNB(),
    "SVM (RBF Kernel)": SVC(kernel='rbf'),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

for name, model in models.items():
    print(f"\n{name} with BoW")
    model.fit(X_bow_train, y_train)
    y_pred = model.predict(X_bow_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))


#Random Forest based on the report


Naive Bayes with BoW
              precision    recall  f1-score   support

       anger       0.64      0.67      0.65       276
        fear       0.78      0.90      0.83       259
         joy       0.60      0.30      0.40       268
        love       0.66      0.70      0.68       250
        none       0.47      0.91      0.62       307
     sadness       0.51      0.26      0.34       258
    surprise       0.71      0.26      0.38       201
    sympathy       0.73      0.87      0.80       194

    accuracy                           0.62      2013
   macro avg       0.64      0.61      0.59      2013
weighted avg       0.63      0.62      0.59      2013


SVM (RBF Kernel) with BoW
              precision    recall  f1-score   support

       anger       0.56      0.60      0.58       276
        fear       0.99      0.86      0.92       259
         joy       0.36      0.50      0.42       268
        love       0.77      0.51      0.61       250
        none       0.57      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **PART FOUR:**

In [21]:
'''from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the labels for multi-class output
lb = LabelBinarizer()  # y_train contains labels
y_train_oh = lb.fit_transform(y_train)
y_test_oh = lb.transform(y_test)

# Use Word2Vec averaged vectors as input
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(300,)))  # 300 = Word2Vec vector size
model.add(Dense(len(le.classes_), activation='softmax'))      # Output layer

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_w2v_train, y_train_oh, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate
y_pred_ffnn = model.predict(X_w2v_test).argmax(axis=1)
from sklearn.metrics import classification_report
print("\nFeed-Forward NN (Keras) Evaluation:")
print(classification_report(y_test, y_pred_ffnn, target_names=le.classes_))'''

##_____________________________________________________________________________________________________________________________##

'''from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the labels for multi-class output
lb = LabelBinarizer()
y_train_oh = lb.fit_transform(y_train)
y_test_oh = lb.transform(y_test)

# Define the model with multiple hidden layers and dropout for regularization
model = Sequential()
model.add(Dense(512, activation='ReLU', input_shape=(300,)))
model.add(Dropout(0.3))
model.add(Dense(256, activation='ReLU'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='ReLU'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='ReLU'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='ReLU'))
model.add(Dropout(0.2))
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  # default learning rate ~0.001

model.fit(X_w2v_train, y_train_oh, epochs=15, batch_size=32, validation_split=0.1)

# Evaluate
y_pred_ffnn = model.predict(X_w2v_test).argmax(axis=1)
from sklearn.metrics import classification_report
print("\nFeed-Forward NN (Keras) Evaluation:")
print(classification_report(y_test, y_pred_ffnn, target_names=le.classes_))'''

##_____________________________________________________________________________________________________________________________##

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import class_weight
import numpy as np

# One-hot encode labels
lb = LabelBinarizer()
y_train_oh = lb.fit_transform(y_train)
y_test_oh = lb.transform(y_test)

# Optional: compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights))

# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(300,)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(128))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(64))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(len(lb.classes_), activation='softmax'))

# Compile
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Add early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train
model.fit(
    X_w2v_train, y_train_oh,
    validation_split=0.1,
    epochs=30,
    batch_size=32,
    callbacks=[early_stop],
    class_weight=class_weights
)

# Evaluate
y_pred_ffnn = model.predict(X_w2v_test).argmax(axis=1)

from sklearn.metrics import classification_report
print("\nFeed-Forward NN (Keras) Evaluation:")
print(classification_report(y_test, y_pred_ffnn, target_names=[str(cls) for cls in lb.classes_]))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30

Feed-Forward NN (Keras) Evaluation:
              precision    recall  f1-score   support

           0       0.65      0.69      0.67       276
           1       0.93      0.88      0.91       259
           2       0.57      0.45      0.50       268
           3       0.73      0.69      0.71       250
           4       0.56      0.82      0.66       307
           5       0.56      0.45      0.50       258
           6       0.48      0.41      0.44       201
           7       0.82      0.82      0.82       194

    accuracy                           0.66      2013
   macro avg       0.66      0.65      0.65      2013
weighted avg       0.66      0.66      0.65      2013



# **PART FIVE:**

In [31]:
'''import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
max_seq_len = 100
X_pad = pad_sequences(X_seq, maxlen=max_seq_len)

# 2. Prepare embedding matrix from Word2Vec
word_index = tokenizer.word_index
num_words = min(5000, len(word_index) + 1)
embedding_dim = w2v_model.vector_size  # usually 300

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
    else:
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

# 3. One-hot encode labels
lb = LabelBinarizer()
y_oh = lb.fit_transform(y)

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_oh, test_size=0.2, random_state=42)

# 5. Build LSTM model
model = Sequential()
model.add(Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_seq_len,
    trainable=False  # freeze embeddings To preserve the original semantic information from high-quality 
                    # pre-trained embeddings like Word2Vec, GloVe, or AraVec & reduce training time 
                    # and prevent overfitting, especially with small datasets.
))

model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(len(lb.classes_), activation='softmax'))

# 6. Compile
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 7. Callbacks for early stopping and saving best model
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_lstm_model.h5', save_best_only=True)
]

# 8. Train
model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks
)

# 9. Predict & evaluate
y_pred = model.predict(X_test).argmax(axis=1)
y_true = y_test.argmax(axis=1)

print("\nLSTM Evaluation:")
print(classification_report(y_true, y_pred, target_names=[str(c) for c in lb.classes_]))

'''
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

# 1. Load your pre-trained Word2Vec model
w2v_model = Word2Vec.load('tweet_cbow_300/tweets_cbow_300')

# 2. Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
max_seq_len = 100
X_pad = pad_sequences(X_seq, maxlen=max_seq_len)

# 3. Prepare embedding matrix using w2v_model
word_index = tokenizer.word_index
num_words = min(5000, len(word_index) + 1)
embedding_dim = w2v_model.vector_size  # 300 for your CBOW model

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
    else:
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

# 4. One-hot encode labels
lb = LabelBinarizer()
y_oh = lb.fit_transform(y)

# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_oh, test_size=0.2, random_state=42)

# 6. Build LSTM model
model = Sequential()
model.add(Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_seq_len,
    trainable=False 
))
model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(len(lb.classes_), activation='softmax'))

# 7. Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 8. Setup callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_lstm_model.h5', save_best_only=True)
]

# 9. Train model
model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks
)

# 10. Predict and evaluate
y_pred = model.predict(X_test).argmax(axis=1)
y_true = y_test.argmax(axis=1)

print("\nLSTM-Improved Evaluation:")
print(classification_report(y_true, y_pred, target_names=[str(c) for c in lb.classes_]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

LSTM-Improved Evaluation:
              precision    recall  f1-score   support

           0       0.61      0.75      0.67       276
           1       0.96      0.92      0.94       259
           2       0.51      0.47      0.49       268
           3       0.71      0.65      0.68       250
           4       0.65      0.77      0.71       307
           5       0.44      0.41      0.42       258
           6       0.47      0.35      0.40       201
           7       0.85      0.85      0.85       194

    accuracy                           0.65      2013
   macro avg       0.65      0.65      0.65      2013
weighted avg       0.65      0.65      0.65      2013



# BiLSTM

In [32]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import zipfile
from gensim.models import Word2Vec

# Load Pre-trained Word2Vec Model
with zipfile.ZipFile('tweet_cbow_300.zip', 'r') as zip_ref:
    zip_ref.extractall('tweet_cbow_300')

w2v_model = Word2Vec.load('tweet_cbow_300/tweets_cbow_300')
embedding_dim = 300

# Tokenize and Pad Sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=150)

# Build Embedding Matrix
word_index = tokenizer.word_index
num_words = min(10000, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Encode Labels
lb = LabelBinarizer()
y_oh = lb.fit_transform(y)

# Train-Test Split
X_train_pad, X_test_pad, y_train_oh, y_test_oh = train_test_split(X_pad, y_oh, test_size=0.2, random_state=42)

# Define BiLSTM Model
model = Sequential()
model.add(Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=150,
    trainable=False  
))
model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(lb.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Training with Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_bilstm_model.h5', save_best_only=True)
]

model.fit(
    X_train_pad, y_train_oh,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    callbacks=callbacks
)

# Evaluation
y_pred = model.predict(X_test_pad).argmax(axis=1)
y_true = y_test_oh.argmax(axis=1)

print("\nBiLSTM Model Evaluation:")
print(classification_report(y_true, y_pred, target_names=[str(cls) for cls in lb.classes_]))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

BiLSTM Model Evaluation:
              precision    recall  f1-score   support

           0       0.62      0.72      0.67       276
           1       0.94      0.89      0.92       259
           2       0.53      0.44      0.48       268
           3       0.70      0.66      0.68       250
           4       0.59      0.85      0.70       307
           5       0.50      0.40      0.45       258
           6       0.50      0.35      0.41       201
           7       0.84      0.85      0.84       194

    accuracy                           0.65      2013
   macro avg       0.65      0.65      0.64      2013
weighted avg       0.65      0.65      0.64      2013



# **AraBERT**

In [29]:
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from transformers import InputExample, InputFeatures
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd

# Load tokenizer and model
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(set(y))  # Replace `y` with your label list

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Tokenization
def convert_examples(texts, labels):
    return [
        InputExample(guid=str(i), text_a=text, label=label)
        for i, (text, label) in enumerate(zip(texts, labels))
    ]

def convert_to_tf_dataset(examples, tokenizer, max_length=128):
    features = []

    for e in examples:
        inputs = tokenizer.encode_plus(
            e.text_a,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="tf"
        )
        input_ids = inputs["input_ids"][0]
        attention_mask = inputs["attention_mask"][0]
        features.append(({
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }, e.label))

    def gen():
        for f in features:
            yield f

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=(128,), dtype=tf.int32),
                "attention_mask": tf.TensorSpec(shape=(128,), dtype=tf.int32)
            },
            tf.TensorSpec(shape=(), dtype=tf.int64)
        )
    )

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

train_examples = convert_examples(X_train, y_train)
test_examples = convert_examples(X_test, y_test)

train_dataset = convert_to_tf_dataset(train_examples, tokenizer).shuffle(100).batch(16)
test_dataset = convert_to_tf_dataset(test_examples, tokenizer).batch(16)

# Load model
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Compile
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train
model.fit(train_dataset, epochs=5, validation_data=test_dataset)

# Evaluate
y_pred_probs = model.predict(test_dataset).logits
y_pred = tf.argmax(y_pred_probs, axis=1).numpy()
print(classification_report(y_test, y_pred, target_names=le.classes_))


  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceCla

Epoch 1/5


AttributeError: in user code:

    File "C:\Users\ragha\anaconda3\envs\MyEnv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\ragha\anaconda3\envs\MyEnv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\ragha\anaconda3\envs\MyEnv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\ragha\anaconda3\envs\MyEnv\lib\site-packages\transformers\modeling_tf_utils.py", line 1630, in train_step
        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)

    AttributeError: module 'keras.utils' has no attribute 'unpack_x_y_sample_weight'
