## Installing and Importing packages

In [1]:
!pip install tf-keras




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import string
import re
import fasttext
import urllib.request

## Read CSV file

In [3]:
df1 =  pd.read_csv("merged_data.csv")

In [4]:
print(df1.shape)
df1.head(3)

(9165, 1)


Unnamed: 0,reviews
0,
1,
2,"OnePlus has outdone itself this time, deliveri..."


In [5]:
df1.isna().value_counts()

reviews
False      8047
True       1118
Name: count, dtype: int64

### Removing Null Values

In [6]:
df1 = df1.dropna()

In [7]:
df1.shape

(8047, 1)

## Text Preprocessing

In [8]:
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans("","",string.punctuation))
    text = re.sub("\\n"," ",text)
    return text

In [9]:
df1['preprocessed_text'] = df1['reviews'].apply(preprocess)
df1.head(2)

Unnamed: 0,reviews,preprocessed_text
2,"OnePlus has outdone itself this time, deliveri...",oneplus has outdone itself this time deliverin...
3,A powerhouse of performance and style! ✨ The s...,a powerhouse of performance and style ✨ the st...


### Removing languages other than english

In [10]:
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
urllib.request.urlretrieve(url, "lid.176.bin")

# Load model
model = fasttext.load_model("lid.176.bin")

# Test detection
print(model.predict("This is an English review."))  # Expected: __label__en
print(model.predict("Ceci est un avis en français."))  # Expected: __label__fr

(('__label__en',), array([0.95950234]))
(('__label__fr',), array([0.99683094]))


In [11]:
df1 = df1.reset_index(drop=True)

In [12]:
def remove_other_languages(text):
    if model.predict(text)[0][0] == '__label__en':
        return text
    else:
        return ''
df1['eng_reviews'] = df1['preprocessed_text'].apply(remove_other_languages)

In [13]:
df2 = pd.DataFrame()
df2 = df1[df1['eng_reviews'].str.len()>0]

In [14]:
df2.shape

(7778, 3)

In [15]:
df2 = df2.reset_index(drop=True)

## Data Labelling 
- Used transformers pre-trained pipeline for sentiment analysis for accurate labels

In [16]:
from transformers import pipeline

model = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





Device set to use cpu


In [17]:
sentiment = []
def predict_sentiment(text,id):
    try:
        sentiment.append(model(text)[0]['label'])
    except RuntimeError:
        sentiment.append('')
        print("failed for",id)
for i,text in enumerate(df2['eng_reviews']):
    predict_sentiment(text, i)



Token indices sequence length is longer than the specified maximum sequence length for this model (1024 > 512). Running this sequence through the model will result in indexing errors


failed for 2
failed for 7
failed for 31
failed for 49
failed for 92
failed for 134
failed for 148
failed for 255
failed for 315
failed for 318
failed for 345
failed for 375
failed for 394
failed for 484
failed for 492
failed for 515
failed for 525
failed for 563
failed for 644
failed for 652
failed for 678
failed for 690
failed for 698
failed for 727
failed for 811
failed for 816
failed for 854
failed for 921
failed for 944
failed for 973
failed for 982
failed for 1012
failed for 1021
failed for 1028
failed for 1035
failed for 1059
failed for 1062
failed for 1080
failed for 1136
failed for 1149
failed for 1259
failed for 1288
failed for 1291
failed for 1378
failed for 1406
failed for 1408
failed for 1418
failed for 1505
failed for 1506
failed for 1598
failed for 1697
failed for 1700
failed for 1879
failed for 1940
failed for 1950
failed for 2297
failed for 2317
failed for 2319
failed for 2357
failed for 2358
failed for 2370
failed for 2373
failed for 2403
failed for 2487
failed for 254

Some reviews exceed the model's word limit, so the model fails at those reviews.

In [18]:
print(sentiment.count('POSITIVE'))
print(sentiment.count('NEGATIVE'))
print(sentiment.count(''))

4634
2994
150


The model fails for exactly 150 reviews, further removing them from the dataset.

In [19]:
df2['sentiments'] = sentiment

In [20]:
df3 = df2[df2['sentiments']!=''] 

In [21]:
df3 = df3.reset_index(drop=True)

In [22]:
df3['sentiments'] = df3['sentiments'].apply(lambda x: 1 if x=='POSITIVE' else 0)

### Stemming and Stopword removal

In [23]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [24]:
stemmer = PorterStemmer()
def stemming(text):
    text =  re.sub(r"[^a-z]"," ", text)
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stopwords.words('english')]
    text = " ".join(text)
    return text

In [25]:
df3['stemmed_text'] = df3['eng_reviews'].apply(stemming)

In [26]:
df3.head()

Unnamed: 0,reviews,preprocessed_text,eng_reviews,sentiments,stemmed_text
0,"OnePlus has outdone itself this time, deliveri...",oneplus has outdone itself this time deliverin...,oneplus has outdone itself this time deliverin...,1,oneplu outdon time deliv smartphon truli live ...
1,A powerhouse of performance and style! ✨ The s...,a powerhouse of performance and style ✨ the st...,a powerhouse of performance and style ✨ the st...,1,powerhous perform style stun display captiv ca...
2,Surprisingly good product for the price.\nI ha...,surprisingly good product for the price i have...,surprisingly good product for the price i have...,1,surprisingli good product price gener purchas ...
3,Bought the phone a month ago. Safe to say at t...,bought the phone a month ago safe to say at th...,bought the phone a month ago safe to say at th...,0,bought phone month ago safe say price rang ban...
4,"This is my first OnePlus device, before this i...",this is my first oneplus device before this ia...,this is my first oneplus device before this ia...,1,first oneplu devic iam user poco believ oxygen...


Saving the data frame for further usage

In [None]:
df3.to_csv('stemmed_reviews.csv', index=False)

In [27]:
df3['stemmed_text'].isna().value_counts()

stemmed_text
False    7628
Name: count, dtype: int64

In [28]:
df3['stemmed_text'] = df3['stemmed_text'].fillna(" ")

In [29]:
X = df3['stemmed_text'].values
Y = df3['sentiments'].values

### Splitting data for training

In [30]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y, stratify=Y, test_size = 0.2, random_state = 42)

In [31]:
{X.shape,x_train.shape, x_test.shape}

{(1526,), (6102,), (7628,)}

In [32]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

In [33]:
x_train_seq = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=100, padding='post', truncating='post')
x_test_seq = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=100, padding='post', truncating='post')

## Building LSTM model

In [34]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=128),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.build(input_shape=(None, 100))
model.summary()

In [35]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
history = model.fit(x_train_seq, y_train, epochs=10, batch_size=32, validation_data=(x_test_seq, y_test))

Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 68ms/step - accuracy: 0.6086 - loss: 0.6680 - val_accuracy: 0.6848 - val_loss: 0.5992
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 67ms/step - accuracy: 0.6576 - loss: 0.6394 - val_accuracy: 0.6101 - val_loss: 0.6720
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 66ms/step - accuracy: 0.6099 - loss: 0.6707 - val_accuracy: 0.6094 - val_loss: 0.6659
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 67ms/step - accuracy: 0.6419 - loss: 0.6482 - val_accuracy: 0.7936 - val_loss: 0.4850
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 66ms/step - accuracy: 0.8455 - loss: 0.4017 - val_accuracy: 0.8997 - val_loss: 0.2745
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 65ms/step - accuracy: 0.9332 - loss: 0.2191 - val_accuracy: 0.9161 - val_loss: 0.2557
Epoch 7/10
[1m1

### Test Accuracy

In [37]:
loss, accuracy = model.evaluate(x_test_seq, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9186 - loss: 0.2861
Test Accuracy: 0.9201


### Predicting on Examples

In [51]:
def predict(text):
    text = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=100, padding='post', truncating='post')
    prediction = model.predict(text)
    if prediction >= 0.5:
        print("sentiment: POSITIVE")
    else:
        print("sentiment: NEGATIVE") 

In [52]:
predict("This product is good!")
predict("It's okay, but not great.")
predict("Worst experience ever. Do not buy!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
sentiment: POSITIVE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
sentiment: POSITIVE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
sentiment: NEGATIVE


## Building GRU model

In [53]:
model_GRU = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=128),
    tf.keras.layers.GRU(64, return_sequences=True),
    tf.keras.layers.GRU(32),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model_GRU.build(input_shape=(None, 100))
model_GRU.summary()

In [54]:
model_GRU.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [55]:
history_GRU = model_GRU.fit(x_train_seq, y_train, epochs=10, batch_size=32, validation_data=(x_test_seq, y_test))

Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 79ms/step - accuracy: 0.6116 - loss: 0.6709 - val_accuracy: 0.6494 - val_loss: 0.6379
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 81ms/step - accuracy: 0.6563 - loss: 0.6625 - val_accuracy: 0.6573 - val_loss: 0.6180
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.6433 - loss: 0.6576 - val_accuracy: 0.6547 - val_loss: 0.6082
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 75ms/step - accuracy: 0.7800 - loss: 0.4769 - val_accuracy: 0.8899 - val_loss: 0.2902
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 76ms/step - accuracy: 0.9454 - loss: 0.1782 - val_accuracy: 0.9037 - val_loss: 0.2700
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 77ms/step - accuracy: 0.9674 - loss: 0.1126 - val_accuracy: 0.9135 - val_loss: 0.2462
Epoch 7/10
[1m1

### Test Accuracy

In [59]:
loss, accuracy = model_GRU.evaluate(x_test_seq, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9123 - loss: 0.3235
Test Accuracy: 0.9168


### Predicting on Examples

In [60]:
def predict_gru(text):
    text = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=100, padding='post', truncating='post')
    prediction = model_GRU.predict(text)
    if prediction >= 0.5:
        print("sentiment: POSITIVE")
    else:
        print("sentiment: NEGATIVE") 

In [67]:
predict_gru("I like this product. It offers so many various colors")
predict_gru("This product is not too great but its better than most of the products")
predict_gru("Worst experience ever. Do not buy!")
predict_gru("Waste of money")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
sentiment: POSITIVE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
sentiment: POSITIVE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
sentiment: NEGATIVE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
sentiment: NEGATIVE


### End of notebook