In [9]:
!pip install pandas numpy scikit-learn nltk




[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
import nltk

# Download all required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [24]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [25]:
class SentimentAnalyzer:
    def __init__(self, max_features=5000, test_size=0.2):
        self.max_features = max_features
        self.test_size = test_size
        self.vectorizer = None
        self.model = None
        self.stop_words = set(stopwords.words('english'))

    def load_and_prepare_data(self, file_path, nrows=None):
        try:
            df = pd.read_csv(file_path, nrows=nrows)
            print(f"Loaded {len(df)} reviews")
            
            if not all(col in df.columns for col in ['review', 'sentiment']):
                raise ValueError("Dataset missing required columns 'review' and 'sentiment'")
            
            df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
            df = df.dropna()
            return df
            
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return None

    def preprocess_text(self, text):
        try:
            # Simpler preprocessing without word_tokenize
            text = str(text).lower()
            text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
            text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
            text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
            text = ' '.join(text.split())  # Remove extra whitespace
            
            # Split into words and remove stopwords
            words = text.split()
            words = [word for word in words if word not in self.stop_words and len(word) > 2]
            
            return ' '.join(words)
            
        except Exception as e:
            print(f"Error in preprocessing: {str(e)}")
            return text

    def train(self, df):
        try:
            print("Preprocessing reviews...")
            df['processed_review'] = df['review'].apply(self.preprocess_text)
            
            X_train, X_test, y_train, y_test = train_test_split(
                df['processed_review'], 
                df['sentiment'],
                test_size=self.test_size,
                random_state=42,
                stratify=df['sentiment']
            )
            
            print("Performing TF-IDF vectorization...")
            self.vectorizer = TfidfVectorizer(
                max_features=self.max_features,
                ngram_range=(1, 2),
                min_df=2,
                max_df=0.95
            )
            X_train_tfidf = self.vectorizer.fit_transform(X_train)
            X_test_tfidf = self.vectorizer.transform(X_test)
            
            print("Training model...")
            self.model = LogisticRegression(random_state=42, max_iter=1000)
            self.model.fit(X_train_tfidf, y_train)
            
            y_pred = self.model.predict(X_test_tfidf)
            print("\nModel Performance:")
            print(classification_report(y_test, y_pred))
            
        except Exception as e:
            print(f"Error in training: {str(e)}")

    def predict_sentiment(self, text, return_probability=False):
        try:
            if not self.model or not self.vectorizer:
                raise ValueError("Model not trained yet")
            
            processed_text = self.preprocess_text(text)
            text_tfidf = self.vectorizer.transform([processed_text])
            prediction = self.model.predict(text_tfidf)[0]
            probability = self.model.predict_proba(text_tfidf)[0]
            
            result = {
                'sentiment': 'Positive' if prediction == 1 else 'Negative',
                'confidence': float(probability.max())
            }
            
            if return_probability:
                result['probability'] = {
                    'negative': float(probability[0]),
                    'positive': float(probability[1])
                }
            
            return result
            
        except Exception as e:
            print(f"Error in prediction: {str(e)}")
            return None

In [26]:
# Initialize analyzer
analyzer = SentimentAnalyzer(max_features=5000)

# Load smaller subset of data first
df = analyzer.load_and_prepare_data("C:/Users/ADMIN/Downloads/archive (2)/IMDB Dataset.csv", nrows=1000)

Loaded 1000 reviews


In [27]:
if df is not None:
    analyzer.train(df)

Preprocessing reviews...
Performing TF-IDF vectorization...
Training model...

Model Performance:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       100
           1       0.83      0.82      0.82       100

    accuracy                           0.82       200
   macro avg       0.83      0.82      0.82       200
weighted avg       0.83      0.82      0.82       200



In [28]:
# Test with new reviews
test_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the plot was engaging throughout.",
    "What a terrible waste of time. The plot made no sense and the acting was wooden."
]

print("\nTesting new reviews:")
for review in test_reviews:
    result = analyzer.predict_sentiment(review, return_probability=True)
    print(f"\nReview: {review}")
    print(f"Predicted Sentiment: {result['sentiment']}")
    print(f"Confidence: {result['confidence']:.2f}")


Testing new reviews:

Review: This movie was absolutely fantastic! The acting was superb and the plot was engaging throughout.
Predicted Sentiment: Positive
Confidence: 0.57

Review: What a terrible waste of time. The plot made no sense and the acting was wooden.
Predicted Sentiment: Negative
Confidence: 0.82
