This notebook applies Natural Language Processing (NLP) to sentiment analysis of Amazon reviews: the text is cleaned and lemmatized, vectorized with TF-IDF, and then classified with Logistic Regression, a linear SVM, and a neural network.

In [None]:
import pandas as pd
import re
import nltk
from num2words import num2words

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder


# Fetch every NLTK resource the preprocessing cells rely on.
# "punkt_tab" and "averaged_perceptron_tagger_eng" are the resource names that
# NLTK >= 3.9 looks up for word_tokenize / pos_tag; the older names are kept so
# the notebook still runs on earlier NLTK releases.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "omw-1.4",
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Load dataset

In [None]:
# Read the train and test CSV files into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Amazon_train.csv", "/content/Amazon_test.csv")
)

In [None]:
train_df.head(5)


Unnamed: 0,id,text,polarity
0,0,Please note: I am giving a 1-star rating to th...,neg
1,1,I hate this new sidekick even more than the do...,neg
2,2,This was a great movie. I like the way it fol...,neg
3,3,I gave this 2 stars only because of the nice c...,neg
4,4,"George Clooney was a joke, and while Val Kilme...",neg


***Preprocessing data***

In [None]:
# Lowercase the review text in both splits.
for frame in (train_df, test_df):
    frame["text"] = frame["text"].str.lower()



In [None]:

# Spell out standalone integers as English words.
def numbers_to_words(text):
    """Replace each standalone run of digits in ``text`` with its num2words spelling.

    Only digit runs delimited by word boundaries are matched, so digits glued
    to letters (for example "1080p") are left unchanged.
    """
    def spell_out(match):
        return num2words(int(match.group()))

    return re.sub(r"\b\d+\b", spell_out, text)

# Convert digits to words in the text column of both splits.
for frame in (train_df, test_df):
    frame["text"] = frame["text"].apply(numbers_to_words)


In [None]:
# Strip HTML tags and punctuation, then normalize whitespace.
def clean_text(text):
    """Return ``text`` reduced to lowercase letters, digits and single spaces.

    The substitutions run in order: HTML-like tags become a space, every
    character other than a-z, 0-9 or whitespace becomes a space, and runs of
    whitespace collapse to one space. Leading/trailing whitespace is trimmed.
    """
    substitutions = (
        (r"<[^>]+>", " "),
        (r"[^a-z0-9\s]", " "),
        (r"\s+", " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean the text column of both splits.
for frame in (train_df, test_df):
    frame["text"] = frame["text"].apply(clean_text)




In [None]:
# Split each cleaned review into a list of word tokens.
for frame in (train_df, test_df):
    frame["tokens"] = frame["text"].apply(word_tokenize)

In [None]:
train_df.head(5)

Unnamed: 0,id,text,polarity,tokens
0,0,please note i am giving a one star rating to t...,neg,"[please, note, i, am, giving, a, one, star, ra..."
1,1,i hate this new sidekick even more than the do...,neg,"[i, hate, this, new, sidekick, even, more, tha..."
2,2,this was a great movie i like the way it folde...,neg,"[this, was, a, great, movie, i, like, the, way..."
3,3,i gave this two stars only because of the nice...,neg,"[i, gave, this, two, stars, only, because, of,..."
4,4,george clooney was a joke and while val kilmer...,neg,"[george, clooney, was, a, joke, and, while, va..."


In [None]:
# Drop English stop words from every token list.
stop_words = set(stopwords.words("english"))


def remove_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    return [token for token in tokens if token not in stop_words]


train_df["tokens"] = train_df["tokens"].apply(remove_stop_words)
test_df["tokens"] = test_df["tokens"].apply(remove_stop_words)


In [None]:
# Tag every remaining token with its part of speech.
for frame in (train_df, test_df):
    frame["pos"] = frame["tokens"].apply(nltk.pos_tag)


In [None]:
# Translate a POS tag into the WordNet constant passed to the lemmatizer.
def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    Tags starting with "J", "V", "N" or "R" map to ADJ, VERB, NOUN and ADV
    respectively; any other tag falls back to NOUN.
    """
    first_letter_to_pos = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

In [None]:
# Lemmatize each token using its POS tag.
lemmatizer = WordNetLemmatizer()


def lemmatize_with_pos(pos_tokens):
    """Lemmatize ``(word, tag)`` pairs and return the lemmas as a list."""
    lemmas = []
    for word, tag in pos_tokens:
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas


# Overwrite the token lists with their lemmatized form in both splits.
for frame in (train_df, test_df):
    frame["tokens"] = frame["pos"].apply(lemmatize_with_pos)

In [None]:
train_df.head(5)

Unnamed: 0,id,text,polarity,tokens,pos
0,0,please note i am giving a one star rating to t...,neg,"[please, note, give, one, star, rating, overal...","[(please, VB), (note, NN), (giving, VBG), (one..."
1,1,i hate this new sidekick even more than the do...,neg,"[hate, new, sidekick, even, donkey, face, firs...","[(hate, VB), (new, JJ), (sidekick, JJ), (even,..."
2,2,this was a great movie i like the way it folde...,neg,"[great, movie, like, way, fold, romantic, yet,...","[(great, JJ), (movie, NN), (like, IN), (way, N..."
3,3,i gave this two stars only because of the nice...,neg,"[give, two, star, nice, cinematography, never,...","[(gave, VBD), (two, CD), (stars, NNS), (nice, ..."
4,4,george clooney was a joke and while val kilmer...,neg,"[george, clooney, joke, val, kilmer, obviously...","[(george, NN), (clooney, NN), (joke, NN), (val..."


In [None]:
# Join the lemmas back into one space-separated string per review.
for frame in (train_df, test_df):
    frame["clean_text"] = frame["tokens"].apply(" ".join)

**TF-IDF vectorization**


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only training rows that have a label and non-blank cleaned text.
has_label = train_df.dropna(subset=["polarity"])
non_blank = has_label["clean_text"].str.strip() != ""
train_valid = has_label[non_blank]

# Fit TF-IDF on the training rows only (unigrams and bigrams, min_df=2),
# then reuse the fitted vocabulary to transform the test split.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
X_train = tfidf.fit_transform(train_valid["clean_text"])
X_test = tfidf.transform(test_df["clean_text"])


In [None]:

# Show the TF-IDF weights of the first five training rows, one column per feature.
feature_names = tfidf.get_feature_names_out()
preview_rows = X_train[:5].toarray()
df_vectors = pd.DataFrame(preview_rows, columns=feature_names)
print(df_vectors)


   101st  101st airborne  1080i  1080i 1080p  1080p  1080p blu  1080p high  \
0    0.0             0.0    0.0          0.0    0.0        0.0         0.0   
1    0.0             0.0    0.0          0.0    0.0        0.0         0.0   
2    0.0             0.0    0.0          0.0    0.0        0.0         0.0   
3    0.0             0.0    0.0          0.0    0.0        0.0         0.0   
4    0.0             0.0    0.0          0.0    0.0        0.0         0.0   

   10th  10x  11th  ...  zombie well  zombie zombie  zombies  zone  \
0   0.0  0.0   0.0  ...          0.0            0.0      0.0   0.0   
1   0.0  0.0   0.0  ...          0.0            0.0      0.0   0.0   
2   0.0  0.0   0.0  ...          0.0            0.0      0.0   0.0   
3   0.0  0.0   0.0  ...          0.0            0.0      0.0   0.0   
4   0.0  0.0   0.0  ...          0.0            0.0      0.0   0.0   

   zone episode  zoo  zoom  zorro  zulu  zuniga  
0           0.0  0.0   0.0    0.0   0.0     0.0  
1         

***Logistic Regression***

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
# The metric helpers used below were not imported anywhere before this cell,
# so it raised NameError when the notebook was run top to bottom.
from sklearn.metrics import accuracy_score, classification_report, precision_score

# Encode the string labels as integers
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_valid["polarity"])

# Train Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Score the model on the same rows it was trained on; these numbers are
# training metrics, not a held-out estimate.
y_train_pred = model.predict(X_train)

accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred)
print("Train Accuracy:", accuracy)
print("Train Precision:", precision)
print(classification_report(y_train, y_train_pred))



Train Accuracy: 0.9503333333333334
Train Precision: 0.9427819796119697
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      3009
           1       0.94      0.96      0.95      2991

    accuracy                           0.95      6000
   macro avg       0.95      0.95      0.95      6000
weighted avg       0.95      0.95      0.95      6000



In [None]:
# Predict test-set labels and map them back to the original label strings.
y_test_pred = model.predict(X_test)
predicted_labels = encoder.inverse_transform(y_test_pred)
test_df["predicted_polarity"] = predicted_labels

# Preview the first ten predictions next to their cleaned text.
preview = test_df[["clean_text", "predicted_polarity"]].head(10)
print(preview)

                                          clean_text predicted_polarity
0  feed broadcast corporation cancel show even sl...                pos
1  six month tenchi wander forest disappeared ryo...                pos
2  total recall enjoyable arnold schwarzenegger a...                pos
3  daughter family happy receive movie one movie ...                pos
4  good thing excellent time capsule music form s...                pos
5  unusual sliding banister p l travers abject wo...                pos
6  could take minute movie simply brainless could...                neg
7  love movie year think one first anime saw howe...                pos
8  thanks god bless family friends etc provide mu...                pos
9  else say movie um sequel previous alien encoun...                neg


***SVM***

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Train a linear SVM on the same TF-IDF features and encoded labels
svm_model = LinearSVC(max_iter=5000, random_state=42)
svm_model.fit(X_train, y_train)

# Score on the training rows (training metrics, not a held-out estimate)
y_train_pred_svm = svm_model.predict(X_train)
accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
print("SVM Train Accuracy:", accuracy_svm)
print(classification_report(y_train, y_train_pred_svm))

# Predict on test set and map the integer predictions back to label strings
y_test_pred_svm = svm_model.predict(X_test)
test_df["predicted_polarity_svm"] = encoder.inverse_transform(y_test_pred_svm)
# Show the SVM column; the original printed "predicted_polarity", which holds
# the Logistic Regression predictions.
print(test_df[["clean_text", "predicted_polarity_svm"]].head(10))

SVM Train Accuracy: 0.9998333333333334
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3009
           1       1.00      1.00      1.00      2991

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000

                                          clean_text predicted_polarity
0  feed broadcast corporation cancel show even sl...                pos
1  six month tenchi wander forest disappeared ryo...                pos
2  total recall enjoyable arnold schwarzenegger a...                pos
3  daughter family happy receive movie one movie ...                pos
4  good thing excellent time capsule music form s...                pos
5  unusual sliding banister p l travers abject wo...                pos
6  could take minute movie simply brainless could...                neg
7  love movie year think one first anime saw howe...      

***Neural Network***

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow RNGs so weight initialization, dropout
# and batch shuffling repeat across runs (the other models pin random_state=42).
tf.keras.utils.set_random_seed(42)

# Convert sparse matrix to dense for Keras
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Feed-forward network: two hidden ReLU layers with dropout and a sigmoid output.
# The input width is declared with an explicit Input layer; passing
# input_shape= to the first Dense layer makes Keras emit a UserWarning.
nn_model = Sequential([
    Input(shape=(X_train_dense.shape[1],)),
    Dense(256, activation="relu"),
    Dropout(0.5),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),  # Binary classification
])

nn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

nn_model.fit(X_train_dense, y_train, epochs=5, batch_size=32, verbose=1)

# Accuracy on the training rows (training metric, not a held-out estimate)
train_loss, train_acc = nn_model.evaluate(X_train_dense, y_train, verbose=0)
print("NN Train Accuracy:", train_acc)

# Threshold the sigmoid outputs at 0.5, flatten to 1-D integer labels, and map
# them back to the original label strings.
y_test_pred_nn = (nn_model.predict(X_test_dense) > 0.5).astype(int).flatten()
test_df["predicted_polarity_nn"] = encoder.inverse_transform(y_test_pred_nn)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 387ms/step - accuracy: 0.7103 - loss: 0.5945
Epoch 2/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 343ms/step - accuracy: 0.9639 - loss: 0.1096
Epoch 3/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 318ms/step - accuracy: 0.9973 - loss: 0.0137
Epoch 4/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 287ms/step - accuracy: 0.9998 - loss: 0.0025
Epoch 5/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 282ms/step - accuracy: 1.0000 - loss: 7.6456e-04
NN Train Accuracy: 1.0
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step


In [None]:

# Write the trained network to disk, then read it back into a second object.
from tensorflow.keras.models import load_model

MODEL_PATH = "nn_model.h5"
nn_model.save(MODEL_PATH)
nn_model_loaded = load_model(MODEL_PATH)


