In [1]:
import pandas as pd
from nltk.corpus import stopwords
import string

In [2]:
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
punctuations = string.punctuation

[nltk_data] Downloading package stopwords to /home/tony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preprocess(text):
    text = text.lower() 
    text = text.replace(punctuations, " ") 
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = " ".join(words)
    return text

In [4]:
def load_data(filename):
    df = pd.read_csv(filename, sep='\t', names=['text', 'label'])
    df['text'] = df['text'].apply(preprocess)
    return df['text'].values, df['label'].values

In [5]:
def load_train_data():
    X_train, y_train = load_data('NLP_ass_train.tsv')
    return X_train, y_train

In [6]:
X_train, y_train = load_train_data()

In [7]:
X_train

array(['cannot continue calling feminists rights womxn arent addressed yes sexual offences public list trans lesbian bisexual queer womxn able enter information reporting sheet gender forum',
       'nawt yall niggers ignoring',
       '<user> bit confused coz chinese ppl access twitter thn ching chong using think pakistani 🤔 🤔 🤔',
       ...,
       'macht der moslem wenn der zion gegen seinen propheten hetzt machst du wenn die roten ratten gegen deinen toten opa hetzen',
       'awful look world demographics asians fucking everywhere another betrayal white genocide real',
       'jewish globalist elite imported million muslims multiculturalize weaken america right violence lgbt black street criminals sometimes lgbt random muslim grow think start seeing organized attacks'],
      dtype=object)

In [8]:
import numpy as np

# from gensim.models import Word2Vec
import gensim.models.keyedvectors as word2vec

from tensorflow.keras import Sequential
from keras.layers import Dense, Dropout

2025-08-31 17:32:26.974115: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-08-31 17:32:27.071025: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-31 17:32:28.452961: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [9]:
# Load word2vec model
# word2vec_model = Word2Vec.load("GoogleNews-vectors-negative300.bin")
word2vec_model = word2vec.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [10]:
# Function to convert text to word2vec vectors by averaging word vectors
def text_to_word2vec(text):
    words = text.split()
    word_vectors = []
    for word in words:
        if word in word2vec_model:
            word_vectors.append(word2vec_model[word])

    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else: 
        return np.zeros(300) # vector of zeros for oov words

In [11]:
# Generate word2vec vectors for all texts in training data 
X_train_vectors = []
for text in X_train:
    X_train_vectors.append(text_to_word2vec(text)) 

In [12]:
X_train_vectors = np.array(X_train_vectors)

In [13]:
X_train_vectors[1]

array([-1.03108726e-01,  1.88954677e-02,  1.76269531e-01,  1.09049477e-01,
        1.17187500e-02, -4.64680977e-02, -1.45833328e-01, -5.02522774e-02,
        5.78613281e-02,  1.80013016e-01, -3.40169258e-02, -2.08007812e-01,
       -3.47391772e-03,  5.34261055e-02, -1.73502609e-01, -6.51041651e-03,
        3.38541679e-02, -8.49609375e-02,  1.10677080e-02, -1.39648438e-01,
        2.42024735e-01, -1.33666992e-02,  2.05078125e-01, -2.49674484e-01,
       -3.12500000e-02,  7.50325546e-02, -1.52384445e-01, -2.91341152e-02,
        4.99674492e-02, -7.46256486e-02, -2.31119785e-02, -1.12304688e-02,
       -3.16406250e-01, -1.24918623e-02, -1.51529953e-01,  3.79231758e-02,
        1.03841148e-01,  3.66210938e-04,  1.21744789e-01,  1.95312500e-01,
        1.18448891e-01, -5.84309883e-02,  3.25846344e-01, -8.63240585e-02,
        3.09244785e-02, -1.29313156e-01, -1.40462235e-01, -2.54231781e-01,
       -1.63330078e-01,  1.27604172e-01, -8.97623673e-02,  2.08007812e-01,
       -2.52278652e-02,  

In [14]:
y_train

array(['normal', 'normal', 'hatespeech', ..., 'normal', 'hatespeech',
       'offensive'], dtype=object)

In [15]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Fit and transform the labels
y_train_encoded = le.fit_transform(y_train)

In [16]:
y_train_encoded[:10]

array([1, 1, 0, 0, 2, 0, 0, 1, 1, 1])

In [17]:
from keras.utils import to_categorical

# Convert labels to one-hot vectors
y_train_one_hot = to_categorical(y_train_encoded)

In [18]:
y_train_one_hot[:10]

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [19]:
from keras.layers import Dropout, BatchNormalization

# Initialize the model
model = Sequential()

# Add first dense layer with 256 neurons and 'relu' activation function
model.add(Dense(256, activation='relu', input_dim=X_train_vectors.shape[1]))

# Add batch normalization layer
model.add(BatchNormalization())

# Add dropout layer
model.add(Dropout(0.5))

# Add second dense layer with 128 neurons and 'relu' activation function
model.add(Dense(128, activation='relu'))

# Add batch normalization layer
model.add(BatchNormalization())

# Add dropout layer
model.add(Dropout(0.5))

# Add third dense layer with 64 neurons and 'relu' activation function
model.add(Dense(64, activation='relu'))

# Add batch normalization layer
model.add(BatchNormalization())

# Add dropout layer
model.add(Dropout(0.5))

# Add output layer with 3 neurons (for 3 classes) and 'softmax' activation function
model.add(Dense(3, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-08-31 17:32:50.930169: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2025-08-31 17:32:50.930259: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:171] verbose logging is disabled. Rerun with verbose logging (usually --v=1 or --vmodule=cuda_diagnostics=1) to get more diagnostic output from this module
2025-08-31 17:32:50.930267: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:176] retrieving CUDA diagnostic information for host: fedora
2025-08-31 17:32:50.930271: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:183] hostname: fedora
2025-08-31 17:32:50.930613: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:190] libcuda reported version is: 575.64.5
2025-08-31 17:32:50.930640: I external/local_xla/x

In [20]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'], run_eagerly=True)

In [21]:
# Define the input shape
input_shape = (None, X_train_vectors.shape[1])

# Build the model
model.build(input_shape)

# Train the model
model.fit(X_train_vectors, y_train_one_hot, 
          epochs=100, 
          batch_size=64,
          validation_split=0.2)

Epoch 1/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 66ms/step - accuracy: 0.4017 - loss: 1.5540 - val_accuracy: 0.3864 - val_loss: 1.1544
Epoch 2/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 64ms/step - accuracy: 0.5219 - loss: 1.0680 - val_accuracy: 0.4153 - val_loss: 1.1236
Epoch 3/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 63ms/step - accuracy: 0.5730 - loss: 0.9321 - val_accuracy: 0.4257 - val_loss: 1.0957
Epoch 4/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 64ms/step - accuracy: 0.6069 - loss: 0.8916 - val_accuracy: 0.4482 - val_loss: 1.0618
Epoch 5/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 64ms/step - accuracy: 0.6207 - loss: 0.8596 - val_accuracy: 0.4657 - val_loss: 1.0625
Epoch 6/100
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 68ms/step - accuracy: 0.6259 - loss: 0.8524 - val_accuracy: 0.4722 - val_loss: 1.0501
Epoch 7/10

<keras.src.callbacks.history.History at 0x7f4afd9722e0>

In [22]:
X_val, y_val = load_data(filename="NLP_ass_valid.tsv")

In [23]:
# Generate word2vec vectors for all texts in validation data 
X_val_vectors = []
for text in X_val:
    X_val_vectors.append(text_to_word2vec(text)) 

X_val_vectors = np.array(X_val_vectors)

In [24]:
y_val_encoded = le.fit_transform(y_val)

In [25]:
from keras.utils import to_categorical

# Convert labels to one-hot vectors
y_val_one_hot = to_categorical(y_val_encoded)

In [26]:
y_val_one_hot

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [27]:
# Evaluate on validation data
loss, accuracy = model.evaluate(X_val_vectors, y_val_one_hot)
print("Validation Accuracy:", accuracy)

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6374 - loss: 1.1060
Validation Accuracy: 0.5874089598655701


In [28]:
# Make predictions
y_pred_prob = model.predict(X_val_vectors)

# Convert probabilities into class labels
y_pred = np.argmax(y_pred_prob, axis=1)

from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_val_encoded, y_pred)

print(f'Validation Accuracy: {accuracy * 100:.2f}%') 


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Validation Accuracy: 58.74%


In [29]:
X_test, y_test = load_data("NLP_ass_test.tsv")

In [30]:
# Generate word2vec vectors for all texts in validation data 
X_test_vectors = []
for text in X_test:
    X_test_vectors.append(text_to_word2vec(text)) 

X_test_vectors = np.array(X_test_vectors)

In [31]:
y_test_encoded = le.fit_transform(y_test)
from keras.utils import to_categorical

# Convert labels to one-hot vectors
y_test_one_hot = to_categorical(y_test_encoded)

In [32]:
# Make predictions
y_pred_prob = model.predict(X_test_vectors)

# Convert probabilities into class labels
y_pred = np.argmax(y_pred_prob, axis=1)

from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)

print(f'Validation Accuracy: {accuracy * 100:.2f}%') 

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Validation Accuracy: 59.30%
