**Simple Sentiment Analysis**

In [108]:
import pandas as pd
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Create a simple dataset
data = {
    'review': [
        'I love this movie, it is fantastic!',
        'This film was terrible and boring.',
        'Absolutely wonderful and a great experience.',
        'I did not like the movie at all.',
        'The plot was dull and uninteresting.',
        'Brilliant performance by the actors.',
        'The movie was so bad, I walked out.',
        'One of the best movies I have ever seen.',
        'It was an okay movie, nothing special.',
        'Horrible, I would not recommend it to anyone.'
    ],
    'sentiment': [
        'positive', 'negative', 'positive', 'negative', 'negative',
        'positive', 'negative', 'positive', 'neutral', 'negative'
    ]
}

df = pd.DataFrame(data)

# Encode the sentiment labels
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0, 'neutral': 0})




# Preprocess the text data
def preprocess_text(text):
    text = text.lower()
    text = re.sub('<br />', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    return text

df['review'] = df['review'].apply(preprocess_text)

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['review'])
X = tokenizer.texts_to_sequences(df['review'])
X = pad_sequences(X, maxlen=20)

# Encode labels
y = df['sentiment'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=2000, output_dim=128, input_length=20))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=2, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')

# Predict sentiment for new reviews
new_reviews = ["I loved the movie, it was fantastic!", "The movie was terrible and boring."]
new_reviews_preprocessed = [preprocess_text(review) for review in new_reviews]
new_reviews_sequences = tokenizer.texts_to_sequences(new_reviews_preprocessed)
new_reviews_padded = pad_sequences(new_reviews_sequences, maxlen=20)
predictions = model.predict(new_reviews_padded)

# Print raw predictions to debug
print("Raw predictions:", predictions)

# Convert predictions to "positive" or "negative"
predicted_labels = ['positive' if pred >= 0.5 else 'negative' for pred in predictions]

for review, sentiment in zip(new_reviews, predicted_labels):
    print(f'Review: "{review}" - Sentiment: {sentiment}')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.0
Raw predictions: [[0.51003903]
 [0.49818763]]
Review: "I loved the movie, it was fantastic!" - Sentiment: positive
Review: "The movie was terrible and boring." - Sentiment: negative


In [109]:
import pandas as pd
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Create a simple dataset

# Load the dataset
imdb = pd.read_csv('./imdb.csv', sep='\t',encoding='latin-1')
df = imdb.head(15000).copy()

# Encode the sentiment labels
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})
# Drop the 'id' column from the DataFrame
df.drop('id', axis=1, inplace=True)


In [110]:
print(df)

                                                  review  sentiment
0      With all this stuff going down at the moment w...          1
1      \The Classic War of the Worlds\" by Timothy Hi...          1
2      The film starts with a manager (Nicholas Bell)...          0
3      It must be assumed that those who praised this...          0
4      Superbly trashy and wondrously unpretentious 8...          1
...                                                  ...        ...
14995  The Last Station, director Michael Hoffman's m...          0
14996  Silly, often ridiculous romp involving the lan...          0
14997  Was this the greatest movie that I have ever s...          0
14998  We've all seen this story a hundred times. You...          0
14999  This crime thriller is sort of like a film noi...          1

[15000 rows x 2 columns]


In [112]:


# Preprocess the text data
def preprocess_text(text):
    text = text.lower()
    text = re.sub('<br />', ' ', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    return text

df['review'] = df['review'].apply(preprocess_text)

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['review'])
X = tokenizer.texts_to_sequences(df['review'])
X = pad_sequences(X, maxlen=20)

# Encode labels
y = df['sentiment'].values

In [113]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Build the LSTM model
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=20))  # Adjust input_dim and input_length accordingly
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=3, batch_size=4, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')



Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.7343999743461609


In [114]:
# Predict sentiment for new reviews
new_reviews = [ "Well done Al Gore!","I loved the movie, it was fantastic!", "terrible and boring."]
new_reviews_preprocessed = [preprocess_text(review) for review in new_reviews]
new_reviews_sequences = tokenizer.texts_to_sequences(new_reviews_preprocessed)
new_reviews_padded = pad_sequences(new_reviews_sequences, maxlen=20)
predictions = model.predict(new_reviews_padded)

# Print raw predictions to debug
print("Raw predictions:", predictions)

# Convert predictions to "positive" or "negative"
predicted_labels = ['positive' if pred >= 0.5 else 'negative' for pred in predictions]

for review, sentiment in zip(new_reviews, predicted_labels):
    print(f'Review: "{review}" - Sentiment: {sentiment}')

Raw predictions: [[0.02149437]
 [0.9439366 ]
 [0.00379151]]
Review: "Well done Al Gore!" - Sentiment: negative
Review: "I loved the movie, it was fantastic!" - Sentiment: positive
Review: "terrible and boring." - Sentiment: negative


**Sentiment Analysis using Hugging Face Example**

In [5]:
pip install transformers


Collecting transformersNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
     ---------------------------------------- 9.3/9.3 MB 10.3 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl (402 kB)
     ------------------------------------- 402.6/402.6 kB 12.3 MB/s eta 0:00:00
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp39-none-win_amd64.whl (287 kB)
     -------------------------------------- 287.9/287.9 kB 8.7 MB/s eta 0:00:00
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp39-none-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 10.9 MB/s eta 0:00:00
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)
  Downloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
     ------------------------------------- 177.6/177.6 kB 10.5 MB/s eta 0:00:00
Installing collected packages: safetensors, fsspec

In [6]:
pip install torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from transformers import pipeline

# Load the sentiment-analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis')

# Example texts
texts = [
    "I love this product! It works great.",
    "This is the worst service I have ever received.",
    "It's an average experience, nothing special."
]

# Perform sentiment analysis
results = sentiment_pipeline(texts)

# Print the results
for text, result in zip(texts, results):
    print(f"Text: {text}\nSentiment: {result['label']}, Score: {result['score']}\n")





No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Text: I love this product! It works great.
Sentiment: POSITIVE, Score: 0.9998793601989746

Text: This is the worst service I have ever received.
Sentiment: NEGATIVE, Score: 0.9997833371162415

Text: It's an average experience, nothing special.
Sentiment: NEGATIVE, Score: 0.9993513226509094

