In [1]:
import pandas as pd
import re

# Load data from your database
import sqlite3
conn = sqlite3.connect("imdb_reviews.db")
reviews_df = pd.read_sql_query("SELECT * FROM imdb_reviews", conn)

# Define a cleaning function
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)
    # Remove special characters and digits
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning to the review_text column
reviews_df['cleaned_review'] = reviews_df['review_text'].apply(clean_text)

# Check for duplicates and remove them
reviews_df = reviews_df.drop_duplicates(subset=['cleaned_review'])

# Print the first few rows of cleaned data
print(reviews_df.head())


   id                                        review_text sentiment  \
0   1  I rented I AM CURIOUS-YELLOW from my video sto...  negative   
1   2  "I Am Curious: Yellow" is a risible and preten...  negative   
2   3  If only to avoid making this type of film in t...  negative   
3   4  This film was probably inspired by Godard's Ma...  negative   
4   5  Oh, brother...after hearing about this ridicul...  negative   

                                      cleaned_review  
0  i rented i am curiousyellow from my video stor...  
1  i am curious yellow is a risible and pretentio...  
2  if only to avoid making this type of film in t...  
3  this film was probably inspired by godards mas...  
4  oh brotherafter hearing about this ridiculous ...  


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(reviews_df['cleaned_review'])

# Target labels
y = reviews_df['sentiment'].apply(lambda x: 1 if x == "positive" else 0)


In [3]:
from sklearn.model_selection import train_test_split

# Train/test split (already provided)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Further split training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict on validation set
y_val_pred = model.predict(X_val)

# Calculate validation metrics
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1-Score: {val_f1:.2f}")


Validation Accuracy: 0.88
Validation F1-Score: 0.88


In [5]:
# Predict on test set
y_test_pred = model.predict(X_test)

# Generate classification report
print(classification_report(y_test, y_test_pred, target_names=["negative", "positive"]))


              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      6252
    positive       0.87      0.89      0.88      6143

    accuracy                           0.88     12395
   macro avg       0.88      0.88      0.88     12395
weighted avg       0.88      0.88      0.88     12395

