<a href="https://colab.research.google.com/github/RADHIKA281005/23BKT0078_L37_MJ_radhika/blob/main/ai_movie_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install libraries
!pip install lime
!pip install shap

# Import all libraries
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Load the IMDB reviews CSV. The code below reads its 'review' and
# 'sentiment' columns.
DATA_FILE = 'IMDB Dataset.csv'
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    # On a fresh runtime the file has not been uploaded yet. Ask for it here
    # so that running the notebook top to bottom works, then retry the read.
    from google.colab import files
    print(f"'{DATA_FILE}' not found. Please upload it:")
    files.upload()
    df = pd.read_csv(DATA_FILE)

# The 'sentiment' column is text ("positive", "negative").
# Convert it to numbers (1, 0) for the model.
df['sentiment_numeric'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# .map() turns any label outside the dictionary into NaN; fail early with a
# clear message instead of letting the model fit fail later on NaN targets.
n_unmapped = int(df['sentiment_numeric'].isna().sum())
assert n_unmapped == 0, (
    f"{n_unmapped} rows have a sentiment other than 'positive'/'negative'"
)

# Split the data into a training set and a testing set.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment_numeric'],
    test_size=0.2,        # 20% of data will be for testing
    random_state=42       # Ensures we get the same split every time
)

print("Data loaded and split successfully!")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Data loaded and split successfully!
Training samples: 40000
Testing samples: 10000


In [3]:
from google.colab import files

# Name the data-loading cell passes to pd.read_csv; the uploaded file must
# use exactly this name.
EXPECTED_FILE = 'IMDB Dataset.csv'

print(f"Please upload your '{EXPECTED_FILE}' file:")
uploaded = files.upload()

# This will confirm the upload is complete
for fn in uploaded.keys():
  print(f"\nUser uploaded file '{fn}' successfully!")

if EXPECTED_FILE in uploaded:
  print("\nUpload complete! You can now run the next cell.")
else:
  # A differently named file would make the later read_csv call fail,
  # so point that out right away.
  print(f"\nWarning: '{EXPECTED_FILE}' was not among the uploaded files; "
        "the data-loading cell expects that exact name.")

Please upload your 'imdb.csv' file:


Saving IMDB Dataset.csv to IMDB Dataset.csv

User uploaded file 'IMDB Dataset.csv' successfully!

Upload complete! You can now run the next cell.


In [5]:
# Size of the TF-IDF vocabulary. Defined once so the vectorizer setting and
# the progress messages below cannot drift apart.
MAX_FEATURES = 500

# Create the text "vectorizer" and the model
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', max_features=MAX_FEATURES)
model = LogisticRegression(random_state=42)

# Bundle them into a "pipeline". `vectorizer` and `model` are kept as
# separate names because later cells call them directly.
pipeline = make_pipeline(vectorizer, model)

print(f"Training the model with max_features={MAX_FEATURES}...")
pipeline.fit(X_train, y_train)
print("Model trained successfully!")

# Test the model on the 'test set'
accuracy = pipeline.score(X_test, y_test)
print(f"Model Accuracy (with {MAX_FEATURES} features): {accuracy * 100:.2f}%")

Training the model with max_features=500...
Model trained successfully!
Model Accuracy (with 500 features): 84.16%


In [6]:
# Pick a review from the test set to explain.
# .iloc is zero-based, so position 12 is the 13th row of X_test.
idx_to_explain = 12
text_to_explain = X_test.iloc[idx_to_explain]
true_label = y_test.iloc[idx_to_explain]

print(f"--- Explaining Text (Index {idx_to_explain}) ---")
print(text_to_explain)
print(f"\nTrue Category (0=neg, 1=pos): {true_label}")
print(f"Model Prediction: {pipeline.predict([text_to_explain])[0]}")

# Create the LIME explainer. class_names follows the label encoding printed
# above (0=neg, 1=pos). LIME explains a prediction by randomly perturbing the
# text, so the seed is fixed to get the same explanation on every run.
explainer_lime = LimeTextExplainer(class_names=['negative', 'positive'],
                                   random_state=42)

# Get the explanation: the 10 most influential words, estimated from
# 1000 perturbed copies of the review.
exp_lime = explainer_lime.explain_instance(text_to_explain,
                                          pipeline.predict_proba,
                                          num_features=10,
                                          num_samples=1000)

print("\n--- LIME Explanation (Top 10 words) ---")
print(exp_lime.as_list())

--- Explaining Text (Index 12) ---
It's made in 2007 and the CG is bad for a movie made in 1998. At one part in the movie there is a stop motion shot of a dinosaur that actually looks good, but this just makes the extremely amateur work on the CG stuff look even worse.<br /><br />The writing, acting, directing and everything else in this movie is just terrible. This is as bad as, if not worse than Raptor Island and 100 million BC... pure crap! Again, as with the other movies, the only scary part about this movie is that it actually got made and is now being aired on the sci-fi channel.<br /><br />I still can't understand how they somehow get people who do have some acting skills to act in these movies and then somehow get them to act as terrible as everyone else in the movie.<br /><br />For those of you who are unsure, the other poster is obviously being sarcastic in his review... or he is one of the people who worked on this movie.

True Category (0=neg, 1=pos): 0
Model Prediction: 0


In [7]:
print("\n--- Running SHAP Explanation ---")

# Feature names, looked up once and reused for both the explainer and the
# results table.
feature_names = vectorizer.get_feature_names_out()

# --- We use X_train for the background text ---
# 100 sampled training reviews serve as the background; the model is called
# on dense arrays here, so the sparse TF-IDF output is converted with .toarray().
background_text = shap.sample(X_train, 100)
background_vectors_dense = vectorizer.transform(background_text).toarray()
text_vector_dense = vectorizer.transform([text_to_explain]).toarray()

# Create a modern, unified SHAP explainer
explainer = shap.Explainer(model.predict_proba, background_vectors_dense,
                           feature_names=feature_names)

# SHAP's permutation-based explainer needs at least 2 * n_features + 1 model
# evaluations. Deriving the budget from the vocabulary size (1001 for 500
# features) keeps this cell working if max_features is changed.
# NOTE(review): assumes shap.Explainer selects the permutation algorithm for
# this callable + array background — confirm against the installed shap version.
max_evals = 2 * len(feature_names) + 1

# Get the SHAP values
shap_explanation = explainer(text_vector_dense, max_evals=max_evals)

print(f"\nSHAP explanation for: 'positive' (class 1)")

# Get the values for class 1 ("positive"):
# first (only) explained row, every feature, class index 1.
shap_values_class1 = shap_explanation.values[0, :, 1]

# Create a DataFrame to hold features and their SHAP values
shap_df = pd.DataFrame({
    'feature': feature_names,
    'shap_value': shap_values_class1
})

# Create a new column for the absolute SHAP value
shap_df['abs_shap'] = shap_df['shap_value'].abs()

# Sort by the absolute value, descending
shap_df_sorted = shap_df.sort_values(by='abs_shap', ascending=False)

# Print the top 10 features, skipping features with a SHAP value of exactly 0
print("\n--- SHAP Explanation (Top 10 features) ---")
print(shap_df_sorted[shap_df_sorted['shap_value'] != 0].head(10)[['feature', 'shap_value']])


--- Running SHAP Explanation ---

SHAP explanation for: 'positive' (class 1)

--- SHAP Explanation (Top 10 features) ---
      feature  shap_value
426  terrible   -0.125411
488     worse   -0.096534
24        bad   -0.062275
79       crap   -0.040843
256     looks   -0.023357
493   writing   -0.018062
171     great   -0.016428
219      just   -0.016382
4      acting   -0.015913
265     makes    0.014418
