In [1]:
#This code imports the pandas library, loads a CSV file named 'IMDB Dataset.csv' into a pandas DataFrame,
#and then displays the first few rows of the dataset.
#The import pandas as pd command imports the pandas library, pd.read_csv('IMDB Dataset.csv') reads the CSV file into a DataFrame,
#and data.head() shows the first five rows of
#the data, providing a quick overview of its structure and contents.
import pandas as pd

# Load the dataset
data = pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK stopwords
nltk.download('stopwords')

# Load the English stopwords from NLTK and store them in a set for faster lookup
stop_words = set(stopwords.words('english'))

# Text cleaning function
def clean_text(text):
    # Remove HTML tags using a regular expression
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation by keeping only alphanumeric characters and spaces
    text = re.sub(r'[^\w\s]', '', text)
    # Convert all text to lowercase
    text = text.lower()
    # Remove stopwords from the text
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply the text cleaning function to the 'review' column in the DataFrame
data['cleaned_review'] = data['review'].apply(clean_text)

# Split the data into input features (X) and target labels (y)
X = data['cleaned_review']  # The cleaned reviews are the input features
y = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # Convert sentiment to binary (1 for positive, 0 for negative)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer with a maximum of 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data and transform the training data into TF-IDF features
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the testing data into TF-IDF features using the already fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

#In this code, we first clean the text data by removing HTML tags, punctuation,
#and stopwords, then we convert the text to lowercase. After cleaning, the data is split into training and testing sets.
#Finally, we convert the cleaned text data into numerical features using TF-IDF vectorization, which is then ready for machine learning model
#training and evaluation.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from sklearn.linear_model import LogisticRegression

# Initialize a Logistic Regression model
model = LogisticRegression()

# Train the Logistic Regression model using the training data
# X_train_tfidf contains the TF-IDF features of the training set
# y_train contains the corresponding labels (0 for negative, 1 for positive)
model.fit(X_train_tfidf, y_train)

#In this code, a Logistic Regression model is first created and then trained on the TF-IDF vectorized training data.
#The model learns the relationship between the features (X_train_tfidf) and the labels (y_train), which it will use later to predict sentiments on new, unseen data.

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the test set
# Use the trained model to predict sentiments on the test set
# X_test_tfidf contains the TF-IDF features of the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
# Calculate the accuracy: the proportion of correctly predicted labels out of all predictions
accuracy = accuracy_score(y_test, y_pred)
# Calculate the precision: the proportion of true positive predictions out of all positive predictions made by the model
precision = precision_score(y_test, y_pred)
# Calculate the recall: the proportion of true positive predictions out of all actual positive labels
recall = recall_score(y_test, y_pred)
# Calculate the F1 Score: the harmonic mean of precision and recall, providing a balance between the two
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

#In this code, predictions are made on the test set using the trained Logistic Regression model.
#The model's performance is then evaluated using four metrics: accuracy, precision, recall, and F1 score.
#These metrics provide a comprehensive view of how well the model is performing, and the results are printed with four decimal places for precision.








Accuracy: 0.8880
Precision: 0.8785
Recall: 0.9026
F1 Score: 0.8904


In [7]:
# Function to predict sentiment of a new review
def predict_sentiment(review):
    # Clean the input review using the previously defined text cleaning function
    cleaned_review = clean_text(review)

    # Vectorize the cleaned review using the same TF-IDF vectorizer that was used for training
    vectorized_review = vectorizer.transform([cleaned_review])

    # Use the trained model to predict the sentiment of the vectorized review
    prediction = model.predict(vectorized_review)

    # Return 'positive' if the prediction is 1, otherwise return 'negative'
    return 'positive' if prediction[0] == 1 else 'negative'

# Example usage
new_review = "The movie was fantastic and I enjoyed every moment of it."

# Predict and print the sentiment of the new review
print(f'Sentiment: {predict_sentiment(new_review)}')

#This code defines a function, predict_sentiment, that takes a new review as input and predicts whether its sentiment is positive or negative.
#The review is first cleaned, then vectorized using the same TF-IDF vectorizer used during training, and finally, the sentiment is predicted using
#the trained Logistic Regression model. The function returns 'positive' if the prediction is positive (1), and 'negative' if it is negative (0).
#An example usage of the function is provided, where a new review is passed to the function, and its sentiment is printed.

Sentiment: positive
