# Techdome Solutions Private Limited
## AI Engineer Assignment

Assignment Title: Natural Language Processing for Sentiment Analysis


## Data Collection

In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalAveragePooling1D

# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/Techdome_Solutions_Assignment/hate.csv")
df.head()

Unnamed: 0,serial_number,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N


In [2]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Write a function to preprocess the text data for sentiment analysis
def preprocess_text(text):
    # Remove symbols including '<'
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Remove occurrences of '<<'
    text = text.replace('<<', '')
    # Remove integers
    text = re.sub(r'\b\d+\b', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and convert to lowercase
    tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return ' '.join(tokens)

df['processed_comment'] = df['comment'].apply(preprocess_text)


In [4]:
# Check the shape
df.shape

(41144, 4)

In [5]:
# Remove rows with label 'O'
df = df[df['label'] != 'O']

In [6]:
df["label"].value_counts()

N    22158
P    18950
Name: label, dtype: int64

In [7]:
df.head()

Unnamed: 0,serial_number,comment,label,processed_comment
0,1,dalits are lowlives,N,dalits lowlives
1,2,gay people are a burden to our society,N,gay people burden society
2,3,Arabs are not welcome,N,arabs welcome
3,4,I'm not saying we should actually eliminate he...,N,im saying actually eliminate heebs wish natura...
4,5,bananas are for black people,N,bananas black people


## Data Preprocessing


In [8]:
# Assuming 'comment' is the text data and 'label' is the sentiment label
X = df['processed_comment']
y = df['label']

# Convert labels to numerical format
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


## Model Selection, Model Training

In [9]:
# Define the model
model = Sequential()
model.add(Embedding(max_words, 32, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(16,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train, epochs=20, batch_size=64, validation_data = (X_test_pad, y_test))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7a0168126380>

## Evaluation

In [11]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Accuracy: {accuracy}')


Accuracy: 0.6350036263465881


## Test the trained model on some text examples

In [12]:
new_texts = [
    "I absolutely love this product! It's amazing.",
    "The service was terrible, and I'm very disappointed.",
    "This movie is fantastic. I can't wait to see it again!",
    "The food at that restaurant was awful. I won't be going back.",
    "I'm so happy with my new phone. It works perfectly.",
    "The customer support was unhelpful and rude. I had a bad experience.",
    "This book is a masterpiece. I highly recommend it to everyone.",
    "The software is full of bugs. It's frustrating to use.",
    "I had a great time at the concert. The performance was outstanding.",
    "The delivery was delayed, and the product arrived damaged. Very disappointed."
]

# Tokenize and pad the new text
new_texts_seq = tokenizer.texts_to_sequences(new_texts)
new_texts_pad = pad_sequences(new_texts_seq, maxlen=max_len)

# Make predictions
new_predictions_prob = model.predict(new_texts_pad)
new_predictions = (new_predictions_prob > 0.5).astype(int)

# Decode predictions to original labels
new_predictions_labels = label_encoder.inverse_transform(new_predictions.flatten())

# Print the results
# Print the results
for text, label in zip(new_texts, new_predictions_labels):
    sentiment = "Positive" if label == 'P' else "Negative"
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")


Text: I absolutely love this product! It's amazing.
Predicted Sentiment: Positive

Text: The service was terrible, and I'm very disappointed.
Predicted Sentiment: Positive

Text: This movie is fantastic. I can't wait to see it again!
Predicted Sentiment: Positive

Text: The food at that restaurant was awful. I won't be going back.
Predicted Sentiment: Positive

Text: I'm so happy with my new phone. It works perfectly.
Predicted Sentiment: Positive

Text: The customer support was unhelpful and rude. I had a bad experience.
Predicted Sentiment: Positive

Text: This book is a masterpiece. I highly recommend it to everyone.
Predicted Sentiment: Positive

Text: The software is full of bugs. It's frustrating to use.
Predicted Sentiment: Positive

Text: I had a great time at the concert. The performance was outstanding.
Predicted Sentiment: Positive

Text: The delivery was delayed, and the product arrived damaged. Very disappointed.
Predicted Sentiment: Negative



In [13]:
# Save the model architecture and weights
model.save('sentiment_analysis_model.h5')

  saving_api.save_model(


In [14]:
# Save the tokenizer for Future Use
import pickle

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)