<a href="https://colab.research.google.com/github/Srihimavardhan/SENTIMENT_ANALYSIS/blob/main/CodTech_Task_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import seaborn as sns
import matplotlib.pyplot as plt
import re
import logging

In [2]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
class SentimentAnalyzer:
    def __init__(self):
        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Initialize NLTK resources
        self._initialize_nltk()

        # Initialize preprocessing tools
        self.tokenizer = RegexpTokenizer(r'\w+')  # Use RegexpTokenizer instead of word_tokenize
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.model = None

    def _initialize_nltk(self):
        """Download required NLTK resources safely"""
        try:
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('omw-1.4', quiet=True)  # Required for newer versions of NLTK
        except Exception as e:
            self.logger.error(f"Error downloading NLTK resources: {str(e)}")
            raise

    def create_sample_data(self):
        """Create a sample dataset of movie reviews"""
        reviews = [
            "This movie was fantastic! Great acting and storyline",
            "Terrible waste of time. Poor acting and boring plot",
            "Really enjoyed this film, would watch again",
            "Not worth the money, very disappointed",
            "Average movie, nothing special but okay",
            "One of the best movies I've ever seen",
            "Complete disaster, avoid at all costs",
            "Pretty good entertainment value",
            "Absolutely loved every minute of it",
            "Could have been better, somewhat disappointing"
        ]

        ratings = [5, 1, 4, 1, 3, 5, 1, 4, 5, 2]

        return pd.DataFrame({
            'text': reviews,
            'rating': ratings
        })

    def preprocess_text(self, text):
        """Clean and preprocess text data"""
        try:
            # Convert to lowercase and remove special characters
            text = str(text).lower()
            text = re.sub(r'[^a-zA-Z\s]', '', text)

            # Tokenize using RegexpTokenizer
            tokens = self.tokenizer.tokenize(text)

            # Remove stopwords and lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens
                     if token not in self.stop_words and len(token) > 2]

            return ' '.join(tokens)
        except Exception as e:
            self.logger.error(f"Error preprocessing text: {str(e)}")
            return ""

    def prepare_data(self, df):
        """Prepare data for training"""
        self.logger.info("Preprocessing text data...")

        # Create copy to avoid modifying original
        df_processed = df.copy()

        # Preprocess reviews
        df_processed['processed_text'] = df_processed['text'].apply(self.preprocess_text)

        # Convert ratings to sentiment
        df_processed['sentiment'] = df_processed['rating'].apply(
            lambda x: 'negative' if x <= 2 else 'positive' if x >= 4 else 'neutral'
        )

        return df_processed

    def train_model(self, X_train, y_train):
        """Train the sentiment analysis model"""
        try:
            self.logger.info("Training model...")

            # Create pipeline with improved parameters
            self.model = Pipeline([
                ('tfidf', TfidfVectorizer(
                    max_features=5000,
                    min_df=2,
                    max_df=0.95,
                    ngram_range=(1, 2)
                )),
                ('classifier', LogisticRegression(
                    max_iter=1000,
                    class_weight='balanced',
                    random_state=42
                ))
            ])

            self.model.fit(X_train, y_train)

        except Exception as e:
            self.logger.error(f"Error training model: {str(e)}")
            raise

    def evaluate_model(self, X_test, y_test):
        """Evaluate model performance"""
        if self.model is None:
            raise ValueError("Model has not been trained yet")

        try:
            # Make predictions
            y_pred = self.model.predict(X_test)

            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))

            # Plot confusion matrix
            self._plot_confusion_matrix(y_test, y_pred)

        except Exception as e:
            self.logger.error(f"Error evaluating model: {str(e)}")
            raise

    def _plot_confusion_matrix(self, y_true, y_pred):
        """Plot confusion matrix"""
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

    def analyze_new_text(self, texts):
        """Analyze sentiment of new texts"""
        if self.model is None:
            raise ValueError("Model has not been trained yet")

        try:
            # Ensure texts is a list
            if isinstance(texts, str):
                texts = [texts]

            # Preprocess new texts
            processed_texts = [self.preprocess_text(text) for text in texts]

            # Predict sentiments
            predictions = self.model.predict(processed_texts)
            probabilities = self.model.predict_proba(processed_texts)

            # Create results dataframe
            results = pd.DataFrame({
                'text': texts,
                'sentiment': predictions,
                'confidence': np.max(probabilities, axis=1)
            })

            return results

        except Exception as e:
            self.logger.error(f"Error analyzing new text: {str(e)}")
            raise

In [4]:
def main():
    try:
        # Initialize analyzer
        analyzer = SentimentAnalyzer()

        # Create sample data
        df = analyzer.create_sample_data()

        # Print dataset info
        print("\nDataset Info:")
        print(df.info())
        print("\nSample of raw data:")
        print(df.head())

        # Prepare data
        df_processed = analyzer.prepare_data(df)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            df_processed['processed_text'],
            df_processed['sentiment'],
            test_size=0.2,
            random_state=42,
            stratify=df_processed['sentiment']  # Ensure balanced split
        )

        # Train and evaluate model
        analyzer.train_model(X_train, y_train)
        analyzer.evaluate_model(X_test, y_test)

        # Test with new reviews
        new_reviews = [
            "This product is amazing! I absolutely love it!",
            "Terrible experience, would not recommend to anyone.",
            "It's okay, nothing special but gets the job done."
        ]

        results = analyzer.analyze_new_text(new_reviews)
        print("\nSentiment Analysis Results for New Reviews:")
        print(results.to_string(index=False))

    except Exception as e:
        logging.error(f"An error occurred in main: {str(e)}", exc_info=True)

if __name__ == "__main__":
    main()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10 non-null     object
 1   rating  10 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 292.0+ bytes
None

Sample of raw data:
                                                text  rating
0  This movie was fantastic! Great acting and sto...       5
1  Terrible waste of time. Poor acting and boring...       1
2        Really enjoyed this film, would watch again       4
3             Not worth the money, very disappointed       1
4            Average movie, nothing special but okay       3


ERROR:root:An error occurred in main: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Traceback (most recent call last):
  File "<ipython-input-4-4820913fb441>", line 19, in main
    X_train, X_test, y_train, y_test = train_test_split(
                                       ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_split.py", line 2872, in train_test_split
    train, test = next(cv.split(X=arrays[0], y=stratify))
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_split.py", line 1909, in split
    for train, test in self._iter_indices(X, y, groups):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/m