In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from bs4 import BeautifulSoup
import re
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df=pd.read_csv('C:/Users/hp/Downloads/IMDB Dataset.csv')
df.head()

# Function to clean the reviews
def clean_review(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-letters and convert to lowercase
    text = re.sub("[^a-zA-Z]", " ", text).lower()
    return text

# Apply cleaning function to the reviews
df['cleaned_review'] = df['review'].apply(clean_review)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['sentiment'], test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Predict sentiment on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8947
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Confusion Matrix:
[[4375  586]
 [ 467 4572]]


In [None]:
'''BeautifulSoup is a Python library used 
for parsing HTML and XML documents. 
In the context of the sentiment analysis model, 
BeautifulSoup is used to clean the text data by 
removing any HTML tags that might be present in the movie reviews.'''