In [6]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# 2. Load Dataset
# If using Google Colab and uploaded the file manually
df = pd.read_csv('IMDB Dataset.csv')  # Ensure the filename matches exactly

# If using Google Drive (optional)
# from google.colab import drive
# drive.mount('/content/drive')
# df = pd.read_csv('/content/drive/My Drive/path_to_your_file/IMDB Dataset.csv')

# If working locally, ensure the file path is correct
# df = pd.read_csv('path/to/IMDB Dataset.csv')

# Display the first few rows to confirm successful loading
print(df.head())

# 3. Basic Data Cleaning
df['review'] = df['review'].str.lower()  # Convert text to lowercase

# 4. Split Data
X = df['review']
y = df['sentiment'].map({'positive': 1, 'negative': 0})  # Map labels to 1 and 0

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Vectorize Text
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# 6. Train Model
model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

# 7. Evaluate Model
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Optional: Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Accuracy: 0.889
Confusion Matrix:
 [[4327  634]
 [ 476 4563]]


In [4]:
from google.colab import files
uploaded = files.upload()


Saving IMDB Dataset.csv to IMDB Dataset.csv
