In [5]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load the dataset from the CSV file.
# Adjust this path to wherever "IMDB Dataset.csv" lives on your machine.
df = pd.read_csv(r"C:\Users\Prachi Jariwala\Desktop\ML_Internship\Task2\IMDB Dataset.csv")

# Optional: show the first five rows as a quick sanity check.
print("Sample Data:\n", df.head())

# Step 3: Separate the input text from the target labels.
X = df['review']     # raw review text
y = df['sentiment']  # sentiment label for each review

# Step 4: Train/test split on the RAW text (80% train, 20% test).
# Splitting before vectorizing keeps the held-out reviews completely unseen:
# fitting TF-IDF on the full corpus would let test-set vocabulary and
# document frequencies leak into the features and inflate the scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: TF-IDF vectorization.
# stop_words='english' drops very common English words before weighting.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and IDF weights from the training reviews only ...
X_train = tfidf.fit_transform(X_train_text)
# ... and reuse that fitted vocabulary, unchanged, for the test reviews.
X_test = tfidf.transform(X_test_text)

# Step 6: Fit a logistic regression classifier on the training features.
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 7: Predict on the held-out reviews and report the metrics.
y_pred = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Sample Data:
                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Accuracy: 0.894

Classification Report:
               precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4961
    positive       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

