## Sentiment Analysis using Logistic Regression

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download NLTK resources 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the IMDB dataset from CSV
dataset_path = '/Users/ritikanigam/Downloads/IMDB Dataset.csv'  
df = pd.read_csv(dataset_path)

# Preprocessing function
def preprocess_text(text):
    # Remove non-alphanumeric characters and lowercasing
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, re.I|re.A)
    text = text.lower().strip()
    
    # Tokenization using NLTK
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize tokens
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Join tokens back into a string
    preprocessed_text = ' '.join(filtered_tokens)
    
    return preprocessed_text

# Apply preprocessing to the 'review' column
df['review'] = df['review'].apply(preprocess_text)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=40000)  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Save predictions to a DataFrame
results_df = pd.DataFrame({'Review': X_test, 'True Sentiment': y_test, 'Predicted Sentiment': y_pred})

# Save DataFrame to CSV
results_df.to_csv('NLPpredictions.csv', index=False)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ritikanigam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ritikanigam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ritikanigam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.8967

Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [32]:
# Function to predict sentiment of a single sample text
def predict_sentiment(sample_text):
    # Preprocess the sample text
    preprocessed_text = preprocess_text(sample_text)
    
    # Transform the preprocessed text using the fitted TF-IDF vectorizer
    sample_tfidf = tfidf_vectorizer.transform([preprocessed_text])
    
    # Predict the sentiment using the trained model
    prediction = model.predict(sample_tfidf)
    
    return prediction[0]

# Test the model with a sample text
sample_text = "I really enjoyed this movie. The plot was thrilling and the acting was superb!"
predicted_sentiment = predict_sentiment(sample_text)
print(f"The predicted sentiment for the sample text is: {predicted_sentiment}")

The predicted sentiment for the sample text is: positive


## Sentiment Analysis using BERT

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch

# Load the IMDB dataset from CSV
dataset_path = '/Users/ritikanigam/Downloads/IMDB Dataset.csv'  
df = pd.read_csv(dataset_path)

# Split data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'].tolist(), 
    df['sentiment'].map(lambda x: 1 if x == 'positive' else 0).tolist(), 
    test_size=0.2, 
    random_state=42
)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Convert to torch Dataset
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

# Initialize BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Training arguments: configuration setting for traing the model
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.001,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=50,
    evaluation_strategy="epoch"
)

# Data collator to handle dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize Trainer object for training the model
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    data_collator=data_collator          # data collator for dynamic padding
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

# Make predictions
predictions = trainer.predict(test_dataset)
preds = torch.argmax(predictions.predictions, axis=-1)

# Save predictions to a DataFrame
results_df = pd.DataFrame({
    'Review': test_texts, 
    'True Sentiment': test_labels, 
    'Predicted Sentiment': preds.numpy()
})

# Save DataFrame to CSV
results_df.to_csv('NLPpredictions_transformers.csv', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7500 [00:00<?, ?it/s]

{'loss': 0.3739, 'grad_norm': nan, 'learning_rate': 1.25e-05, 'epoch': 0.02}


KeyboardInterrupt: 