In [52]:
!pip install keras
!pip install tensorflow



In [45]:
import numpy as np
import pandas as pd
import re
import pickle
import os
import nltk
from nltk.corpus import stopwords
from textblob import Word
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.activations import softmax # Import softmax

In [46]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # For multilingual WordNet support
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
pip install -U textblob



In [48]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [49]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
file_path = '/content/test.ft.txt'  # Update this with the correct file path
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Process the data to extract labels and reviews
data = []
for line in lines:
    # Match the pattern: __label__ followed by a number and review text
    match = re.match(r"(__label__\d)\s(.+)", line.strip())
    if match:
        label = match.group(1)  # Extract the label
        review = match.group(2)  # Extract the review text
        data.append([label, review])

# Convert the data into a DataFrame
df = pd.DataFrame(data, columns=['Label', 'Review'])
df = df[:150000]


In [53]:
df.head()

Unnamed: 0,Label,Review
0,__label__2,Great CD: My lovely Pat has one of the GREAT v...
1,__label__2,One of the best game music soundtracks - for a...
2,__label__1,Batteries died within a year ...: I bought thi...
3,__label__2,"works fine, but Maha Energy is better: Check o..."
4,__label__2,Great for the non-audiophile: Reviewed quite a...


In [54]:
def cleaning(df, stop_words):
    # Convert to lowercase and split
    df['Review'] = df['Review'].apply(lambda x: ' '.join(x.lower() for x in x.split()))

    # Replacing special characters
    df['Review'] = df['Review'].str.replace('[^a-zA-Z\s]', '', regex=True)  # Keep only letters and spaces

    # Replacing digits/numbers
    df['Review'] = df['Review'].str.replace(r'\d+', '', regex=True)  # Remove all digits

    # Removing stop words
    df['Review'] = df['Review'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))

    # Lemmatization
    df['Review'] = df['Review'].apply(lambda x: ' '.join([Word(word).lemmatize() for word in x.split()]))

    return df

# Apply cleaning function
df = cleaning(df, stop_words)


In [55]:
tokenizer = Tokenizer(num_words=10000, split=' ')
tokenizer.fit_on_texts(df['Review'].values)

In [56]:

file_path = 'my_sentiment_model.pkl'  # Replace with your pickle file path
file_size = os.path.getsize(file_path)

print(f"The size of '{file_path}' is: {file_size} bytes")

The size of 'my_sentiment_model.pkl' is: 602968601 bytes


In [57]:
!md5sum my_sentiment_model.pkl

96f77109f6b1c3717766e35d2a2b15e3  my_sentiment_model.pkl


In [58]:
with open('my_sentiment_model.pkl', 'rb') as file:
  model = pickle.load(file)

In [61]:
def modifyReviews(reviews_array):
    """
    Preprocesses an array of input reviews to match the format used during training.

    Args:
        reviews_array (list): A list of reviews to be preprocessed.

    Returns:
        numpy.ndarray: The preprocessed reviews as a padded sequence.
    """
    import re # Import the re module for regular expression operations

    processed_reviews = []
    for review in reviews_array:
        # Convert to lowercase and split
        review = ' '.join(review.lower() for review in review.split())

        # Replacing special characters using re.sub()
        review = re.sub('[^a-zA-Z\s]', '', review)  # Keep only letters and spaces

        # Replacing digits/numbers using re.sub()
        review = re.sub(r'\d+', '', review)  # Remove all digits

        # Removing stop words
        review = ' '.join(review for review in review.split() if review not in stop_words)

        # Lemmatization
        review = ' '.join([Word(word).lemmatize() for word in review.split()])

        processed_reviews.append(review)

    # Convert to sequence and pad
    X = tokenizer.texts_to_sequences(processed_reviews)
    X = pad_sequences(X, maxlen=103)

    return X

In [80]:
new_reviews = [
    "This product is absolutely amazing! I love it.",
    "I'm so disappointed with this purchase. It's terrible.",
    "It's okay, nothing special.",
    "The quality is great, but the price is a bit high.",
    "I would highly recommend this to anyone.",
    "Worst product ever! Don't waste your money.",
    "This is a game-changer. I'm so impressed.",
    "It's pretty good, but could be better.",
    "I'm on the fence about this one. It has its pros and cons.",
    "This is a must-have for everyone!",
    "I had such high hopes, but it let me down.",
    "It's decent, but I've seen better.",
    "This is the best thing I've ever bought.",
    "I wouldn't buy this again.",
    "It's not bad, but not great either.",
    "I'm completely satisfied with this product.",
    "This is a total rip-off.",
    "It's worth every penny.",
    "I'm so glad I made this purchase.",
    "This is a huge waste of time and money.",
    "It's a good value for the price.",
    "I'm not sure if I like it or not.",
    "This is a hidden gem.",
    "I'm indifferent to this product.",
    "This is a life-saver!",
    "It's a mixed bag.",
    "I'm pleasantly surprised by how good this is.",
    "This is a complete disaster.",
    "It's a solid product.",
    "I'm not impressed at all.",
    "This is a great addition to my collection.",
    "It's not what I expected.",
    "This is a game-changer for me.",
    "I'm hesitant to recommend this.",
    "This is a must-buy!",
    "It's a bit underwhelming.",
    "I'm blown away by how good this is.",
    "This is a complete waste of money.",
    "It's a decent product, but nothing special.",
    "I'm very happy with my purchase.",
    "This is a total disappointment.",
    "It's a good product overall.",
    "I'm not sure if I would recommend this.",
    "This is a great find!",
    "It's a bit overpriced.",
    "I'm extremely satisfied with this product.",
    "This is a terrible product.",
    "It's a good value for money.",
    "I'm not a fan of this product.",
    "This is a great product for the price.",
    "It's not worth the money.",
    "This is a fantastic product!",
    "I'm very disappointed with this product.",
    "It's an okay product.",
    "I'm not sure what to think of this product.",
    "This is a great product overall.",
    "It's not bad for the price.",
    "This is a decent product.",
    "I'm not sure if I would buy this again.",
    "This is a good product.",
    "It's not the best product.",
    "This is a great product!",
    "I'm very happy with this product.",
]

In [83]:

# Preprocess the reviews array using modifyReviews function
processed_reviews = modifyReviews(new_reviews)

# Make predictions using your model
predictions = model.predict(processed_reviews)
# Apply softmax to the predictions
predictions = softmax(predictions).numpy() # Apply softmax and convert to numpy array

print(predictions)
# Classify reviews with neutral category
neutral_range = 0.1  # Adjust this range as needed
classified_reviews = []
for i, prediction in enumerate(predictions):
    diff = abs(prediction[1] - prediction[0])  # Difference between positive and negative
    if diff <= neutral_range:
        sentiment = "Neutral"
    elif prediction[1] > prediction[0]:
        sentiment = "Good"
    else:
        sentiment = "Not Good"
    classified_reviews.append((i + 1, sentiment))

# Print classified reviews
print("Classified Reviews:")
for review_num, sentiment in classified_reviews:
    print(f"Review {review_num}: {sentiment}")



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[[0.5164011  0.48359886]
 [0.5136472  0.4863528 ]
 [0.73105836 0.26894167]
 [0.28734273 0.7126573 ]
 [0.4200413  0.57995874]
 [0.2755241  0.7244759 ]
 [0.28248587 0.71751416]
 [0.27093968 0.7290603 ]
 [0.2700614  0.7299386 ]
 [0.28513774 0.7148623 ]
 [0.29477006 0.7052298 ]
 [0.7305067  0.26949328]
 [0.28333744 0.7166626 ]
 [0.69753766 0.30246222]
 [0.29100266 0.7089973 ]
 [0.7310578  0.26894218]
 [0.7307942  0.26920587]
 [0.27424374 0.7257563 ]
 [0.7309455  0.26905447]
 [0.306577   0.69342303]
 [0.27294052 0.72705936]
 [0.291683   0.708317  ]
 [0.28566948 0.71433043]
 [0.35217217 0.64782786]
 [0.27028826 0.7297117 ]
 [0.36161563 0.6383845 ]
 [0.2693133  0.7306866 ]
 [0.27431267 0.7256872 ]
 [0.4309026  0.56909734]
 [0.28248587 0.71751416]
 [0.26917472 0.7308253 ]
 [0.6499324  0.3500676 ]
 [0.53631943 0.46368068]
 [0.31912988 0.68087006]
 [0.53631943 0.46368068]
 [0.72291344 0.27708662]
 [0.27192554 0.72807443]
 [0

In [82]:
# Rank reviews based on positive sentiment probability
ranked_reviews = sorted(
    [(i, prediction[1]) for i, prediction in enumerate(predictions)],
    key=lambda item: item[1],
    reverse=True,  # Sort in descending order (highest probability first)
)

# Print ranked reviews
print("Ranked Reviews (from most good to least):")
for rank, (index, probability) in enumerate(ranked_reviews):
    print(f"Rank {rank + 1}: Review {index + 1} (Probability: {probability:.2f})")

Ranked Reviews (from most good to least):
Rank 1: Review 62 (Probability: 0.73)
Rank 2: Review 56 (Probability: 0.73)
Rank 3: Review 44 (Probability: 0.73)
Rank 4: Review 50 (Probability: 0.73)
Rank 5: Review 42 (Probability: 0.73)
Rank 6: Review 61 (Probability: 0.73)
Rank 7: Review 31 (Probability: 0.73)
Rank 8: Review 60 (Probability: 0.73)
Rank 9: Review 27 (Probability: 0.73)
Rank 10: Review 9 (Probability: 0.73)
Rank 11: Review 25 (Probability: 0.73)
Rank 12: Review 53 (Probability: 0.73)
Rank 13: Review 8 (Probability: 0.73)
Rank 14: Review 37 (Probability: 0.73)
Rank 15: Review 21 (Probability: 0.73)
Rank 16: Review 48 (Probability: 0.73)
Rank 17: Review 18 (Probability: 0.73)
Rank 18: Review 28 (Probability: 0.73)
Rank 19: Review 6 (Probability: 0.72)
Rank 20: Review 49 (Probability: 0.72)
Rank 21: Review 7 (Probability: 0.72)
Rank 22: Review 30 (Probability: 0.72)
Rank 23: Review 13 (Probability: 0.72)
Rank 24: Review 10 (Probability: 0.71)
Rank 25: Review 23 (Probability: 0.