# Environment Setup

In [1]:
!pip install pandas scikit-learn nltk



In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Data Loading & Preprocessing

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import re

# Sample Data
data = {
    'review': [
        "This movie was absolutely brilliant! The acting was superb.",
        "I hated it. The plot was predictable and boring.",
        "A truly fantastic film with a powerful message.",
        "What a waste of time. I would not recommend this.",
        "The cinematography was beautiful, but the story was weak."
    ],
    'sentiment': ['positive', 'negative', 'positive', 'negative', 'negative']
}
df = pd.DataFrame(data)

# --- Preprocessing Function ---
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    tokens = text.split()
    # Remove stopwords
    clean_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(clean_tokens)

df['clean_review'] = df['review'].apply(preprocess_text)
print(df)

                                              review sentiment  \
0  This movie was absolutely brilliant! The actin...  positive   
1   I hated it. The plot was predictable and boring.  negative   
2    A truly fantastic film with a powerful message.  positive   
3  What a waste of time. I would not recommend this.  negative   
4  The cinematography was beautiful, but the stor...  negative   

                               clean_review  
0  movie absolutely brilliant acting superb  
1             hated plot predictable boring  
2     truly fantastic film powerful message  
3                waste time would recommend  
4       cinematography beautiful story weak  


# Feature Extraction & Model Training

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Define features (X) and target (y)
X = df['clean_review']
y = df['sentiment']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a machine learning pipeline
# 1. TfidfVectorizer: Converts text to TF-IDF features.
# 2. LogisticRegression: The classification model.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluation & Prediction

In [10]:
from sklearn.metrics import classification_report

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

# --- Predict on a new review ---
new_review = "The performances were incredible and the direction was masterful."
prediction = pipeline.predict([new_review])
print(f"\nNew Review: '{new_review}'")
print(f"Predicted Sentiment: {prediction[0]}")


--- Classification Report ---
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
    positive       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0


New Review: 'The performances were incredible and the direction was masterful.'
Predicted Sentiment: positive


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#Using VADAR in nltk


In [11]:
!pip install nltk



In [14]:
import nltk

# Download the sentence tokenizer model
nltk.download('punkt')
# Download the VADER lexicon for sentiment analysis
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [17]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# The Code
This script will take a piece of text, split it into sentences, and then analyze the sentiment of each one.

In [18]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

# Sample text with mixed sentiment
text = "The cinematography was beautiful, but the story was weak. The acting was decent, though the ending felt rushed and disappointing. Overall, it's an okay film, but not something I would watch again."

# 1. Initialize VADER
analyzer = SentimentIntensityAnalyzer()

# 2. Split the text into sentences
sentences = sent_tokenize(text, language='english')

print(f"--- Analyzing Text: \"{text}\" ---\n")

# 3. Analyze each sentence
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")

    # Get the polarity scores
    scores = analyzer.polarity_scores(sentence)

    # Print the detailed scores
    print(f"Scores: {scores}")

    # Determine the overall tone of the sentence
    compound_score = scores['compound']
    if compound_score >= 0.05:
        tone = "Positive"
    elif compound_score <= -0.05:
        tone = "Negative"
    else:
        tone = "Neutral"

    print(f"Tone: {tone}\n" + "-"*30)

--- Analyzing Text: "The cinematography was beautiful, but the story was weak. The acting was decent, though the ending felt rushed and disappointing. Overall, it's an okay film, but not something I would watch again." ---

Sentence 1: The cinematography was beautiful, but the story was weak.
Scores: {'neg': 0.289, 'neu': 0.526, 'pos': 0.184, 'compound': -0.34}
Tone: Negative
------------------------------
Sentence 2: The acting was decent, though the ending felt rushed and disappointing.
Scores: {'neg': 0.242, 'neu': 0.758, 'pos': 0.0, 'compound': -0.4939}
Tone: Negative
------------------------------
Sentence 3: Overall, it's an okay film, but not something I would watch again.
Scores: {'neg': 0.0, 'neu': 0.873, 'pos': 0.127, 'compound': 0.1154}
Tone: Positive
------------------------------
