<a href="https://colab.research.google.com/github/Praanesh-S/AIML_MIC_rec_round1/blob/main/IMDb_Analysis_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the IMDb reviews CSV into a DataFrame. The file name is a relative
# path, so it is resolved against the kernel's current working directory.
dataset_filename = 'IMDB Dataset.csv'
df = pd.read_csv(dataset_filename)

# Confirm the load and preview the first rows, followed by a separator line.
print(
    f"Dataset '{dataset_filename}' loaded successfully.",
    "First 5 rows of the dataset:",
    df.head(),
    "\n" + "="*50 + "\n",
    sep="\n",
)


In [None]:
def preprocess_text(text):
    """Normalize a raw review into lowercase, space-separated alphabetic words.

    Processing steps, in order:
      1. Each HTML tag is replaced by a single space.
      2. Every character that is not an ASCII letter becomes a space.
      3. The text is lowercased.
      4. Runs of whitespace collapse to one space and the ends are trimmed.

    Args:
        text: Raw review string, possibly containing HTML markup.

    Returns:
        The cleaned string; empty if `text` contains no ASCII letters.
    """
    # Substitute a space (not an empty string) for each tag so that words
    # separated only by markup, e.g. "good<br />movie", are not glued into a
    # single token. Surplus spaces are collapsed in the final step.
    text = re.sub(r'<.*?>', ' ', text)
    # Replace digits, punctuation and any other non-letter with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Case-fold so "Good" and "good" map to the same token.
    text = text.lower()
    # split() with no arguments drops leading/trailing/repeated whitespace.
    text = ' '.join(text.split())
    return text

In [None]:
# Store a normalized copy of every review next to the raw text.
df['cleaned_review'] = df['review'].apply(preprocess_text)

print("Text preprocessing complete.")
print("Example of original vs. cleaned review:")
# Use positional access (.iloc) so the example is always the first row; a plain
# `[0]` is a label lookup and raises KeyError if the index has no label 0.
print("\nOriginal:\n", df['review'].iloc[0])
print("\nCleaned:\n", df['cleaned_review'].iloc[0])
print("\n" + "="*50 + "\n")

In [None]:
# Features are the cleaned review texts.
X = df['cleaned_review']

# Convert sentiment labels ('positive'/'negative') to numerical format (1/0)
# as machine learning models require numerical input.
y = df['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map produces NaN for any value that is not a key of the mapping
# (e.g. different casing or stray whitespace). Fail loudly here instead of
# letting NaN targets reach the train/test split and the model.
if y.isna().any():
    raise ValueError(
        "Unexpected sentiment labels: "
        f"{sorted(df.loc[y.isna(), 'sentiment'].astype(str).unique())}"
    )

In [None]:
# Hold out 20% of the reviews for testing and train on the remaining 80%.
# A fixed `random_state` makes the split reproducible, and `stratify=y` keeps
# the proportion of each sentiment the same in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the resulting split sizes, followed by a separator line.
print(
    "Data preparation and splitting complete.",
    f"Training set size: {len(X_train)} samples",
    f"Test set size: {len(X_test)} samples",
    "\n" + "="*50 + "\n",
    sep="\n",
)

In [None]:
# Block 4: Vectorize the Text Data (Bag-of-Words)
# -----------------------------------------------------------------------------
# Bag-of-words encoder: turns each review into a row of token counts.
vectorizer = CountVectorizer()

# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# The fitted vocabulary has one entry per feature (matrix column).
print(
    "Text vectorization complete.",
    f"Vocabulary size: {len(vectorizer.vocabulary_)} words",
    "\n" + "="*50 + "\n",
    sep="\n",
)

In [None]:
# Block 5: Train the Logistic Regression Model
# -----------------------------------------------------------------------------
# Logistic regression classifier. `max_iter=1000` caps the number of solver
# iterations at 1000 to give the optimizer room to converge, and
# `random_state=42` fixes the seed passed to the estimator.
model = LogisticRegression(random_state=42, max_iter=1000)

# Fit on the vectorized training split, announcing start and finish.
print("Training the Logistic Regression model...")
model.fit(X_train_vec, y_train)
print(
    "Model training complete.",
    "\n" + "="*50 + "\n",
    sep="\n",
)

In [None]:
# Block 6: Evaluate the Model
# -----------------------------------------------------------------------------
# Predict sentiment for the held-out test reviews.
y_pred = model.predict(X_test_vec)

# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

# Per-class metrics; the display names follow the 0 -> Negative,
# 1 -> Positive label encoding used when the targets were built.
report = classification_report(
    y_test,
    y_pred,
    target_names=['Negative', 'Positive'],
)

# Print the final evaluation results as one framed summary.
print(
    "--- Model Evaluation Results ---",
    f"Accuracy: {accuracy:.4f} ({accuracy:.2%})",
    "\nClassification Report:",
    report,
    "--------------------------------",
    sep="\n",
)