In [16]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Flatten
from sklearn.model_selection import train_test_split

# Generate synthetic data
data = {
    "text": [
        "I love this product, it's amazing!",
        "This is the worst purchase I ever made.",
        "Absolutely fantastic experience!",
        "I hate how slow the service is.",
        "The quality is excellent, I'm very happy.",
        "Terrible! I will never buy this again.",
        "I'm so pleased with my order, great job!",
        "Horrible experience, waste of money!",
        "It's okay, nothing special.",
        "I really enjoyed using this, very nice!"
    ],
    "sentiment": [1, 0, 1, 0, 1, 0, 1, 0, 1, 1]
}

df = pd.DataFrame(data)

# Tokenization & Padding
tokenizer = Tokenizer(num_words=50, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=10, padding='post')
labels = np.array(df['sentiment'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000,output_dim= 64,input_length=10),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=2)

# Test the model
sample_text = ["The product is really good, I love it!"]
sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(sample_seq, maxlen=10, padding='post')

prediction = model.predict(sample_padded)
sentiment = "Positive" if prediction > 0.5 else "Negative"
print(f"Predicted Sentiment: {sentiment}")

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 214ms/step - accuracy: 0.1333 - loss: 0.6962 - val_accuracy: 0.5000 - val_loss: 0.6926
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.5833 - loss: 0.6926 - val_accuracy: 0.5000 - val_loss: 0.6894
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.5833 - loss: 0.6874 - val_accuracy: 0.5000 - val_loss: 0.6870
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.7333 - loss: 0.6783 - val_accuracy: 0.5000 - val_loss: 0.6833
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5000 - loss: 0.6818 - val_accuracy: 0.5000 - val_loss: 0.6792
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5000 - loss: 0.6773 - val_accuracy: 0.5000 - val_loss: 0.6733
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━

In [19]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.preprocessing import OrdinalEncoder
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from imblearn.under_sampling import RandomUnderSampler



# Load the dataset
df = pd.read_csv('data/Dataset-SA.csv')
df.head()
df.dropna(inplace=True)
data = df[['Rate','Review','Summary','Sentiment']]
data = data[data['Sentiment'] !='neutral']

stpwords = set(stopwords.words('english'))  # Convert stopwords list to a set for fast lookup
stem = PorterStemmer()  # Create an instance

def clean(i):
    i = re.sub(r'READ MORE', '', i)
    i = i.lower()  # Convert to lowercase
    words = word_tokenize(i)  # Tokenize sentence
    words = [stem.stem(word) for word in words if word not in stpwords]
    words = [word for word in words if word not in string.punctuation]
    return ' '.join(words)  # Join words back into a sentence


data['Review'] = data['Review'].apply(clean)
data['Summary'] = data['Summary'].apply(clean)
ord = OrdinalEncoder(categories=[['negative','positive']])
data['Sentiment'] = ord.fit_transform(data[['Sentiment']])


# Assuming df has 'Review', 'Summary', and 'Sentiment' columns
X = data[['Review', 'Summary']]  # Selecting both text columns
y = data['Sentiment']  # Target column (1 = Positive, 0 = Negative)

# Initialize the undersampler
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# Resample the dataset
X_resampled, y_resampled = rus.fit_resample(X, y)

# Create the new balanced DataFrame
df_balanced = pd.DataFrame({'Review': X_resampled['Review'], 
                            'Summary': X_resampled['Summary'], 
                            'Sentiment': y_resampled})

# Display the new balanced dataset
print(df_balanced.head())



df_balanced.head()
df_balanced.to_csv('data/balanced.csv',index=False)
    
df_balanced["Combine"] = df_balanced['Review'] +' '+ df_balanced['Summary']
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")  # Keep top 10,000 words
tokenizer.fit_on_texts(df_balanced["Combine"])
sequences = tokenizer.texts_to_sequences(df_balanced["Combine"])
padded_sequences = pad_sequences(sequences, maxlen=100, padding="post", truncating="post")

model = Sequential()

model.add(Embedding(10000, output_dim=64,input_length=100))
model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))


                   Review                                            Summary  \
3         useless product                                    bad product fan   
8          unsatisfactori                                         bad cooler   
32  expect better product                                        bad qualiti   
48             wast money  small wire moter capac low fan speed good flow...   
53                   nice  receiv dalay 10 day cooler ok reveiv switch damag   

    Sentiment  
3         0.0  
8         0.0  
32        0.0  
48        0.0  
53        0.0  




Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step - accuracy: 0.3750 - loss: 0.6938 - val_accuracy: 0.5000 - val_loss: 0.6917
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 392ms/step - accuracy: 0.6250 - loss: 0.6898 - val_accuracy: 0.5000 - val_loss: 0.6897
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 382ms/step - accuracy: 0.6250 - loss: 0.6863 - val_accuracy: 0.5000 - val_loss: 0.6880
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 687ms/step - accuracy: 0.6250 - loss: 0.6832 - val_accuracy: 0.5000 - val_loss: 0.6865
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 603ms/step - accuracy: 0.6250 - loss: 0.6802 - val_accuracy: 0.5000 - val_loss: 0.6848
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425ms/step - accuracy: 0.6250 - loss: 0.6768 - val_accuracy: 0.5000 - val_loss: 0.6829
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x19bf147f610>