In [142]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [143]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [144]:
from nltk.stem import WordNetLemmatizer

In [145]:
nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [146]:
df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')


In [147]:
print(df.shape)

(20000, 2)


In [35]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  20000 non-null  object
 1   Positive    20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [148]:
df.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [36]:
def prepro(txt):
    toks = word_tokenize(txt.lower())
    sw = set(stopwords.words('english'))
    stoptoks = [t for t in toks if t not in sw]
    lm = WordNetLemmatizer()
    lemtoks = [lm.lemmatize(t) for t in stoptoks]
    outtxt = ' '.join(lemtoks)

    return outtxt


In [149]:
df['rv'] = df['reviewText'].apply(prepro)


In [150]:
df.head()

Unnamed: 0,reviewText,Positive,rv
0,This is a one of the best apps acording to a b...,1,one best apps acording bunch people agree bomb...
1,This is a pretty good version of the game for ...,1,pretty good version game free . lot different ...
2,this is a really cool game. there are a bunch ...,1,really cool game . bunch level find golden egg...
3,"This is a silly game and can be frustrating, b...",1,"silly game frustrating , lot fun definitely re..."
4,This is a terrific game on any pad. Hrs of fun...,1,terrific game pad . hr fun . grandkids love . ...


In [39]:
analyzer = SentimentIntensityAnalyzer()


In [40]:
def senti(text):
    scores = analyzer.polarity_scores(text)
    sentiment = 1 if scores['pos'] > 0 else 0
    return sentiment


In [138]:
df['sentiment'] = df['reviewText'].apply(senti)
df.head()


Unnamed: 0,reviewText,Positive,sentiment
0,this is a one of the best apps acording to a b...,1,1
1,this is a pretty good version of the game for ...,1,1
2,this is a really cool game there are a bunch o...,1,1
3,this is a silly game and can be frustrating bu...,1,1
4,this is a terrific game on any pad hrs of fun ...,1,1


In [15]:
!pip install scikit-learn




In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [45]:

sample_df = df.copy()
y_true = sample_df['Positive']
y_pred = sample_df['sentiment']
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print(f"Acc: {accuracy*100:.2f}%")
print(f"Prec: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1 Score: {f1*100:.2f}%")

# Optional: detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=['Negative', 'Positive']))


Acc: 79.95%
Prec: 81.17%
Recall: 95.93%
F1 Score: 87.93%

Classification Report:

              precision    recall  f1-score   support

    Negative       0.69      0.29      0.41      4767
    Positive       0.81      0.96      0.88     15233

    accuracy                           0.80     20000
   macro avg       0.75      0.62      0.64     20000
weighted avg       0.78      0.80      0.77     20000



In [82]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random


In [83]:
# Positive and negative keywords
pos_kw = ['good', 'amazing', 'excellent', 'fantastic', 'great', 'awesome', 'perfect', 'nice', 'love', 'happy',
          'satisfied', 'wonderful', 'best', 'superb', 'impressive', 'top', 'brilliant', 'positive', 'reliable', 'high',
          'recommend', 'favorite', 'pleasant', 'valuable', 'efficient', 'quality', 'quick', 'comfortable', 'safe', 'durable',
          'strong', 'fun', 'bright', 'fresh', 'clean', 'effective', 'easy', 'helpful', 'awesome', 'friendly', 'responsive',
          'luxury', 'perfect', 'best', 'super', 'goodness', 'pleasure', 'trusted', 'delightful', 'awesome', 'excellent']

neg_kw = ['bad', 'poor', 'terrible', 'horrible', 'worst', 'awful', 'disappointing', 'useless', 'slow', 'dirty',
          'unreliable', 'broken', 'expensive', 'negative', 'problem', 'annoying', 'frustrating', 'hate', 'messy', 'difficult',
          'weak', 'cheap', 'boring', 'sad', 'uncomfortable', 'late', 'ugly', 'faulty', 'pain', 'waste',
          'stress', 'hard', 'confusing', 'stupid', 'unhappy', 'badly', 'lazy', 'unfriendly', 'poorly', 'flawed',
          'rude', 'defective', 'slowly', 'untrustworthy', 'regret', 'hassle', 'dislike', 'terribly', 'problematic', 'inferior']

# Create reviews
pos_rev = [f"This {k} product" for k in pos_kw[:50]]
neg_rev = [f"This {k} product" for k in neg_kw[:50]]

random.shuffle(pos_rev)
random.shuffle(neg_rev)

# Combine into dataframe
df = pd.DataFrame({
    'reviewText': pos_rev + neg_rev,
    'Positive': [1]*50 + [0]*50
})

# Shuffle rows
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head(10))
print(f"Total samples: {len(df)}")


                reviewText  Positive
0      This slowly product         0
1      This flawed product         0
2         This bad product         0
3   This brilliant product         1
4     This perfect product         1
5       This clean product         1
6    This favorite product         1
7        This late product         0
8  This responsive product         1
9       This quick product         1
Total samples: 100


In [84]:
def preprocess(txt):
    txt = txt.lower()
    txt = re.sub(r'\d+', '', txt)
    txt = re.sub(r'[^\w\s]', '', txt)
    return txt

df['reviewText'] = df['reviewText'].apply(preprocess)


In [85]:
txts = df['reviewText'].values
lbls = df['Positive'].values

tok = Tokenizer(num_words=5000, lower=True)
tok.fit_on_texts(txts)

seqs = tok.texts_to_sequences(txts)
X = pad_sequences(seqs, padding='post')
y = np.array(lbls)


In [86]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [87]:
model.fit(X, y, epochs=10, batch_size=4)


Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - accuracy: 0.5608 - loss: 0.6951
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.6135 - loss: 0.6888
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8907 - loss: 0.6776
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9940 - loss: 0.6149
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 1.0000 - loss: 0.3648
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0517
Epoch 7/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0023
Epoch 8/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 1.0000 - loss: 0.0014
Epoch 9/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c7215004aa0>

In [88]:
def predict(sent):
    sent = preprocess(sent)
    seq = tok.texts_to_sequences([sent])
    padseq = pad_sequences(seq, padding='post', maxlen=X.shape[1])
    pred = model.predict(padseq)
    return "Positive" if pred>=0.5 else "Negative"


In [92]:
print("Enter sentence (type exit to quit):")
while True:
    usr = input("> ")
    if usr.lower()=='exit':
        print("Bye!")
        break
    print(f"Predicted sentiment: {predict(usr)}")


Enter sentence (type exit to quit):
> not so bad
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Predicted sentiment: Negative
> not bad
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Predicted sentiment: Negative
> i like this
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Predicted sentiment: Negative
> nice product
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Predicted sentiment: Positive
> i love this product
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Predicted sentiment: Positive
> happy me
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Predicted sentiment: Positive
> exit
Bye!


In [93]:
# Predict on all samples
y_pred = (model.predict(X) >= 0.5).astype(int).flatten()
y_true = y


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 


In [94]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1 Score: {f1*100:.2f}%")


Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1 Score: 100.00%


In [95]:
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=['Negative', 'Positive']))



Classification Report:

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        50
    Positive       1.00      1.00      1.00        50

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [130]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [132]:
df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')

df = df[['reviewText','Positive']].dropna()
print(df.head())


                                          reviewText  Positive
0  This is a one of the best apps acording to a b...         1
1  This is a pretty good version of the game for ...         1
2  this is a really cool game. there are a bunch ...         1
3  This is a silly game and can be frustrating, b...         1
4  This is a terrific game on any pad. Hrs of fun...         1


In [134]:
def preprocess(txt):
    txt = str(txt).lower()
    txt = re.sub(r'\d+', '', txt)
    txt = re.sub(r'[^\w\s]', '', txt)
    return txt
df['reviewText'] = df['reviewText'].apply(preprocess)


In [135]:
txts = df['reviewText'].values
lbls = df['Positive'].values

tok = Tokenizer(num_words=5000, lower=True)
tok.fit_on_texts(txts)
seqs = tok.texts_to_sequences(txts)
X = pad_sequences(seqs, padding='post')
y = np.array(lbls)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [136]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [137]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 205ms/step - accuracy: 0.8168 - loss: 0.4024 - val_accuracy: 0.8938 - val_loss: 0.2605
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 207ms/step - accuracy: 0.9258 - loss: 0.2020 - val_accuracy: 0.9035 - val_loss: 0.2340
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 206ms/step - accuracy: 0.9386 - loss: 0.1637 - val_accuracy: 0.9035 - val_loss: 0.2421
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 202ms/step - accuracy: 0.9502 - loss: 0.1403 - val_accuracy: 0.9010 - val_loss: 0.2929
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 213ms/step - accuracy: 0.9602 - loss: 0.1070 - val_accuracy: 0.8967 - val_loss: 0.2814


<keras.src.callbacks.history.History at 0x7c72166d90a0>

In [139]:
def predict(sent):
    sent = preprocess(sent)
    seq = tok.texts_to_sequences([sent])
    padseq = pad_sequences(seq, padding='post', maxlen=X.shape[1])
    pred = model.predict(padseq)
    return "Positive" if pred>=0.5 else "Negative"

print("Enter sentence (type exit to quit):")
while True:
    usr = input("> ")
    if usr.lower()=='exit':
        print("Bye!")
        break
    print(f"Predicted sentiment: {predict(usr)}")


Enter sentence (type exit to quit):
> i love this product
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 774ms/step
Predicted sentiment: Positive
> this is the best product i have ever seen
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Predicted sentiment: Positive
> the worst product
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
Predicted sentiment: Negative
> very  bad
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Predicted sentiment: Negative
> exit
Bye!


In [140]:
y_pred = (model.predict(X_test) >= 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Acc: {accuracy*100:.2f}%")
print(f"Prec: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1 Score: {f1*100:.2f}%")


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 46ms/step
Acc: 89.68%
Prec: 93.77%
Recall: 92.57%
F1 Score: 93.17%


In [108]:

!pip install nltk transformers




In [109]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import nltk
from nltk.tokenize import sent_tokenize

In [110]:
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [111]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [112]:
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


Device set to use cpu


In [113]:
def prepro(text):
    sentences = sent_tokenize(text)
    return sentences


In [114]:
def sentiment(text):
    sentences = prepro(text)
    res = []
    for sentence in sentences:
        sentiment = sentiment_pipeline(sentence)
        res.append((sentence, sentiment[0]['label'], sentiment[0]['score']))
    return res

In [115]:
text = "I love the new phone. The battery life is amazing! However, the camera quality could be better. Oshin. Hey look at me. Do you remember? Please buy me a chocolate. I hate you idiot."
res = sentiment(text)

In [116]:
for sentence, label, score in res:
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {label}, Score: {score}")
    print("-" * 50)


Sentence: I love the new phone.
Sentiment: 5 stars, Score: 0.8152029514312744
--------------------------------------------------
Sentence: The battery life is amazing!
Sentiment: 5 stars, Score: 0.8153871893882751
--------------------------------------------------
Sentence: However, the camera quality could be better.
Sentiment: 3 stars, Score: 0.621445894241333
--------------------------------------------------
Sentence: Oshin.
Sentiment: 3 stars, Score: 0.32453295588493347
--------------------------------------------------
Sentence: Hey look at me.
Sentiment: 5 stars, Score: 0.4124619662761688
--------------------------------------------------
Sentence: Do you remember?
Sentiment: 5 stars, Score: 0.26115894317626953
--------------------------------------------------
Sentence: Please buy me a chocolate.
Sentiment: 1 star, Score: 0.4337516129016876
--------------------------------------------------
Sentence: I hate you idiot.
Sentiment: 1 star, Score: 0.7333212494850159
---------------