<a href="https://colab.research.google.com/github/OmarK920/NLP-Projects-/blob/main/Amazon_Reviews_for_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# File Reading and loading

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import bz2
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
!unzip /content/Amazonreviews.zip

Archive:  /content/Amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [None]:
# Open both compressed splits for binary reading; each line is decoded to text
# later, inside load_extract.
train_file = bz2.open('/content/train.ft.txt.bz2', 'rb')
test_file = bz2.open('/content/test.ft.txt.bz2', 'rb')

In [None]:
def load_extract(file):
    """Split every line of a labelled review file into a label and a text.

    Each non-blank line is decoded as UTF-8. The character at index 9 is read
    as a digit and shifted down by one to form the label; everything from
    index 10 onwards, stripped of surrounding whitespace, is the text.
    NOTE(review): the fixed offsets presumably correspond to a
    ``__label__<digit> `` prefix -- confirm against the data files.

    Args:
        file: iterable yielding ``bytes`` lines (e.g. an open ``bz2.BZ2File``).

    Returns:
        tuple ``(labels, texts)`` where ``labels`` is a NumPy integer array and
        ``texts`` is a list of ``str`` in the same order.

    Raises:
        ValueError: if the character at index 9 of a line is not a digit.
        IndexError: if a non-blank line is shorter than 10 characters.
    """
    texts, labels = [], []
    for line in file:
        x = line.decode('utf-8')  # decode binary to string
        if not x.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # review; previously they crashed on x[9] with IndexError.
            continue
        labels.append(int(x[9]) - 1)  # label digit, shifted to start at 0
        texts.append(x[10:].strip())  # everything after the label digit
    print('Done !')
    return np.array(labels), texts

In [None]:
# load_extract reads each handle to the end, so the files are not needed
# afterwards. The `with` block closes both handles even if parsing fails, and
# re-running this cell without reopening the files now raises an error instead
# of silently returning empty results from the exhausted handles.
with train_file, test_file:
    train_labels, train_texts = load_extract(train_file)
    test_labels, test_texts = load_extract(test_file)

Done !
Done !


In [None]:
# Spot-check the first parsed training review; load_extract has already dropped
# the leading label characters and stripped surrounding whitespace.
train_texts[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

# New Section
In this section I will build an LSTM model for sentiment prediction, but first the data has to be preprocessed.

# Preprocessing

- Replaces digits with '0'.

- Removes URLs and links.

- Converts all characters to lowercase.

- Tokenizes the text into words.

- Removes English stopwords.

- Lemmatizes each word.

- Joins the words back into a string.

In [None]:
import nltk

# Download the NLTK resources used by clean_texts below:
#   punkt     -> tokenizer models used by word_tokenize
#   stopwords -> word lists behind stopwords.words('english')
#   wordnet   -> dictionary data used by WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def clean_texts(texts):
    """Normalize raw review strings before vectorization.

    For each input string, in this order:
      1. every digit is replaced with '0';
      2. if the text contains 'www.', 'http:', 'https:' or '.com', each run of
         non-space characters ending in a dot plus three lowercase letters is
         replaced with a space (link removal);
      3. every character that is not an ASCII letter becomes a space
         (this also drops the '0' placeholders from step 1);
      4. the text is lowercased and tokenized;
      5. English stopwords are removed;
      6. each remaining token is lemmatized;
      7. tokens are joined back with single spaces.

    Args:
        texts: iterable of ``str``.

    Returns:
        list of cleaned ``str``, in the same order as ``texts``.
    """
    # A set makes each stopword lookup O(1); the list returned by NLTK was
    # previously scanned linearly for every token of every review.
    stwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    temp_texts = []

    for raw_text in texts:
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        text = re.sub(r'\d', '0', raw_text)  # replace every digit with 0

        # NOTE(review): this check and the pattern below are case-sensitive and
        # run before lowercasing, so upper-case links are not matched here.
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:  # remove links and urls
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", " ", text)

        text = re.sub('[^a-zA-Z]', ' ', text)  # anything that is not a letter becomes whitespace
        text = text.lower()
        tokens = word_tokenize(text)  # tokenize the text
        tokens = [word for word in tokens if word not in stwords]  # remove stopwords
        tokens = [lemmatizer.lemmatize(word) for word in tokens]  # lemmatization

        temp_texts.append(' '.join(tokens))

    print('--100%--Done!')
    return temp_texts


In [None]:
# Clean both splits with the same pipeline. The raw lists are overwritten in
# place, so from this cell onwards train_texts / test_texts hold the cleaned
# strings rather than the original reviews.
print('Processing Training data')
train_texts = clean_texts(train_texts)
print('\nProcessing Test data')
test_texts = clean_texts(test_texts)

Processing Training data
--100%--Done!

Processing Test data
--100%--Done!


# Modeling

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the cleaned training reviews (train_texts / train_labels).
# X_test / y_test here are a split of the training set, not test_texts / test_labels.
# random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)


**LSTM**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
# Tokenizer and pad_sequences are used below but were not imported anywhere in
# the notebook, so this cell failed with NameError on a fresh kernel.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tunable settings for this cell.
MAX_LEN = 100       # tokens kept per review; longer reviews are cut at the end
EMBEDDING_DIM = 16  # size of each learned word vector
LSTM_UNITS = 16     # hidden units in the single LSTM layer
N_EPOCHS = 3        # passes over the training data

# Tokenize and pad sequences.
# The vocabulary is fit on X_train only: train_texts also contains the held-out
# X_test rows, so fitting on it would leak held-out words into the vocabulary.
# Words unseen during fitting map to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN, padding='post', truncating='post')

# Build a minimal LSTM classifier: embedding -> LSTM -> sigmoid output.
# input_dim is len(word_index) + 1 because index 0 is used for padding.
model_lstm_simplest = Sequential()
model_lstm_simplest.add(Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM, input_length=MAX_LEN))
model_lstm_simplest.add(LSTM(LSTM_UNITS))
model_lstm_simplest.add(Dense(1, activation='sigmoid'))

model_lstm_simplest.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; validation_split carves a further 20% off X_train_pad for
# the per-epoch validation metrics.
history_lstm_simplest = model_lstm_simplest.fit(
    X_train_pad, y_train,
    epochs=N_EPOCHS,
    validation_split=0.2,
    verbose=1
)

# Access accuracy history during training
accuracy_lstm_simplest = history_lstm_simplest.history['accuracy']
val_accuracy_lstm_simplest = history_lstm_simplest.history['val_accuracy']

# Print accuracy at each epoch. enumerate follows however many epochs actually
# ran instead of assuming exactly three.
for epoch, (acc, val_acc) in enumerate(zip(accuracy_lstm_simplest, val_accuracy_lstm_simplest), start=1):
    print(f"Epoch {epoch} - Training Accuracy: {acc:.4f} - Validation Accuracy: {val_acc:.4f}")


Epoch 1/3
Epoch 2/3
 4829/72000 [=>............................] - ETA: 6:29:09 - loss: 0.2635 - accuracy: 0.8944

In the next section I will use a logistic regression model to classify the reviews.

# Text Vectorization

In [None]:
# Learn the bag-of-words vocabulary from the cleaned training reviews.
print('Fitting data...')
count_vect = CountVectorizer().fit(train_texts)
print('fit complete !')

# Encode both splits with that same fitted vocabulary.
print('tranforming training set...')
train_texts_vec = count_vect.transform(train_texts)

print('tranforming test set...')
test_texts_vec = count_vect.transform(test_texts)

Fitting data...
fit complete !
tranforming training set...
tranforming test set...


# Model

Logistic Regression Model for text classification

In [None]:
# Fit logistic regression on the count-vectorized training reviews.
# max_iter=150 raises the solver's iteration cap above scikit-learn's default of 100.
lr_model = LogisticRegression(n_jobs=-1, max_iter=150)
lr_model.fit(train_texts_vec, train_labels)

In [None]:
# Predict a label for every vectorized review in the test split.
pred_lr = lr_model.predict(test_texts_vec)


In [None]:
# Fraction of test reviews whose predicted label matches test_labels.
print('Accuracy:', accuracy_score(test_labels, pred_lr))


Accuracy: 0.900615


In [None]:
# Use one index for both the text and its label. Previously the text came from
# index 265 but the "actual label" from index 100, so the comparison printed
# below was between unrelated reviews.
sample_idx = 265
sample = test_texts[sample_idx]
print(sample)

# transform expects an iterable of documents, hence the one-element list.
sample_vec = count_vect.transform([sample])
pred = lr_model.predict(sample_vec)
print('\npredicted label:',pred[0])
print('actual label:', test_labels[sample_idx])

big deal thing interesting album mia accent seriously beats singing layered tired clich vocals album anyone else would top final verdict bad music

predicted label: 0
actual label: 0
