In [7]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import shap
import lime.lime_text
from gensim.models import Word2Vec
from transformers import BertTokenizer, TFBertModel
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
print(" All required libraries installed succesfully!!!!...")

 All required libraries installed succesfully!!!!...


In [11]:
# Load datasets
labeled_data = pd.read_csv('labeled_data.csv')
emoji_sentiment = pd.read_excel('emoji_sentiment.xlsx')
print("All datasets loaded sucessfully")

All datasets loaded sucessfully


In [13]:
# Data Preprocessing
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

In [15]:
labeled_data['clean_text'] = labeled_data['tweet'].apply(clean_text)  # Fixed column name

In [17]:
# Convert labels to numerical values
label_encoder = LabelEncoder()
labeled_data['label'] = label_encoder.fit_transform(labeled_data['class'])

In [19]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(labeled_data['clean_text'], labeled_data['label'], test_size=0.2, random_state=42)

In [21]:
# Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [23]:
# Word Embeddings (Word2Vec)
tokenized_texts = [text.split() for text in X_train]
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

In [25]:
# BERT Embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [27]:
def get_bert_embeddings(texts):
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="tf")
    outputs = bert_model(tokens['input_ids'])
    return outputs.last_hidden_state[:, 0, :]

In [35]:
X_train_bert = get_bert_embeddings(X_train.tolist())


ResourceExhaustedError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).

{{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[19826,53,768] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu [Op:Mul] name: 

Call arguments received by layer 'LayerNorm' (type LayerNormalization):
  • inputs=tf.Tensor(shape=(19826, 53, 768), dtype=float32)

In [None]:
X_test_bert = get_bert_embeddings(X_test.tolist())

In [None]:
# Balancing Data with SMOTE and Undersampling
smote = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_tfidf, y_train)
X_train_bal, y_train_bal = rus.fit_resample(X_train_bal, y_train_bal)
print("Balanced dataset shape:", Counter(y_train_bal))

In [None]:
# Model Development (Hybrid CNN-RNN)
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=X_train_tfidf.shape[1]),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train Model
model.fit(X_train_bal.toarray(), y_train_bal, epochs=10, batch_size=32, validation_data=(X_test_tfidf.toarray(), y_test))


In [None]:
# Model Evaluation
eval_results = model.evaluate(X_test_tfidf.toarray(), y_test)
print(f"Test Accuracy: {eval_results[1]}")

In [None]:
# Explainability (SHAP & LIME)
explainer = shap.Explainer(model, X_test_tfidf.toarray())
shap_values = explainer(X_test_tfidf.toarray())
shap.summary_plot(shap_values, X_test_tfidf.toarray())

In [None]:
lime_explainer = lime.lime_text.LimeTextExplainer(class_names=label_encoder.classes_)

In [None]:
def explain_instance(text_instance):
    explanation = lime_explainer.explain_instance(text_instance, model.predict)
    explanation.show_in_notebook()


In [None]:
print("Model successfully trained and evaluated.")

In [11]:
print(labeled_data.columns)

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')
