# Import Required Libraries
Import the necessary libraries, including pandas, NumPy, scikit-learn, gensim, wordcloud, and matplotlib.

In [None]:
pip install --upgrade scipy gensim

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import gensim
from gensim import corpora
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load and Explore Dataset
Load the IMDB_Reviews.csv dataset and explore its structure and contents.

In [None]:
# Load and Explore Dataset

# Load the dataset
df = pd.read_csv('IMDB_Reviews.csv')

# A notebook cell only renders its LAST bare expression, so every intermediate
# summary is passed to display() explicitly; otherwise its result is discarded.

# Display the first few rows of the dataset
display(df.head())

# Display the structure of the dataset (info() prints its own report)
df.info()

# Display basic statistics of the dataset
display(df.describe())

# Check for missing values
display(df.isnull().sum())

# Display the distribution of sentiments
display(df['sentiment'].value_counts())

# Randomly sample 5,000 reviews for the assignment tasks
# (fixed random_state so the same rows are drawn on every run)
df_sample = df.sample(n=5000, random_state=42)

# Display the first few rows of the sampled dataset
df_sample.head()

# Random Sampling of Data
Select a random sample of 5,000 movie reviews from the dataset for the assignment tasks.

In [None]:
# Draw a reproducible random subset of 5,000 reviews; the fixed seed means
# the same rows are selected each time this cell runs.
df_sample = df.sample(
    random_state=42,
    n=5000,
)

# Preview the sampled reviews
df_sample.head()

# Text Preprocessing
Preprocess the text data by tokenizing it, removing stop words and punctuation, and lemmatizing the remaining tokens.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK data files.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads in recent
# NLTK releases; downloading only the older 'punkt' leaves those versions
# failing with a LookupError. 'omw-1.4' is requested as well because some NLTK
# releases need it for WordNet lookups (NOTE(review): confirm it is required
# for the installed version). Requesting a resource an older NLTK does not
# know about just reports an error message; it does not raise.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Initialize stop words, lemmatizer, and punctuation
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation = set(string.punctuation)

# Function to preprocess text
def preprocess_text(text):
    """Turn one raw review into a space-joined string of lemmatized tokens.

    The text has HTML tags removed, is lowercased and tokenized with NLTK's
    ``word_tokenize``; stop words and punctuation-only tokens are dropped and
    the remaining tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw review text. Any non-string value (for example a NaN from a
            missing cell) is treated as an empty review.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or an
        empty string when nothing survives.
    """
    import re  # local import so this block does not rely on a top-level import

    if not isinstance(text, str):
        return ''

    # Replace HTML tags with a space so the words on either side stay apart.
    # Presumably the reviews contain markup such as "<br />" -- confirm against
    # the data. The pattern requires a letter right after "<" (or "</"), so
    # comparisons like "a < b" are left alone.
    text = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words and punctuation, and perform lemmatization.
    # word_tokenize emits multi-character punctuation tokens ("``", "''",
    # "...", "--") that a plain membership test against single characters
    # misses, so a token is dropped when EVERY character is punctuation.
    kept = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
        and not all(ch in punctuation for ch in word)
    ]
    return ' '.join(kept)

# Clean every sampled review and store the result in a new column
raw_reviews = df_sample['review']
df_sample['cleaned_review'] = raw_reviews.map(preprocess_text)

# Preview the sample together with its cleaned text
df_sample.head()

# Task 1: Topic Modeling


# Train LDA Model
Train an LDA model with at least 10 topics using the preprocessed text data.

In [None]:
# Task 1: Topic Modeling

# Tokenize each cleaned review once and reuse the token lists for both the
# dictionary and the bag-of-words corpus.
tokenized_reviews = [text.split() for text in df_sample['cleaned_review']]

# Create a dictionary and corpus for the LDA model
dictionary = corpora.Dictionary(tokenized_reviews)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_reviews]

# Train the LDA model with 10 topics. random_state is fixed so the learned
# topics are the same on every run.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
# The former loop over `topics` was removed: `topics` is only created by the
# topic-printing cell that follows, so on a fresh kernel it raised NameError.

# Print Topics and Keywords
Print out the list of topics and at least 20 keywords for each topic.

In [None]:
# Fetch the topics with 20 keywords each, then print each
# (topic id, keyword string) pair on its own line.
topics = lda_model.print_topics(num_words=20)
for topic_id, keyword_string in topics:
    print((topic_id, keyword_string))

# Visualize Topics with Word Cloud
Pick at least 5 topics from the LDA model and visualize each topic with a word cloud.

In [None]:
# Render a word cloud for each of the first five LDA topics, using the top 20
# keywords of the topic and their weights as the word frequencies.
for topic_idx in range(5):
    keyword_weights = dict(lda_model.show_topic(topic_idx, 20))
    cloud = WordCloud().generate_from_frequencies(keyword_weights)

    plt.figure()
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f'Topic {topic_idx+1}')
    plt.show()

# Task 2: Sentiment Analysis using TF-IDF


In [None]:
# Task 2: Sentiment Analysis using TF-IDF

# 'cleaned_review' was already produced by the preprocessing step, so it is
# reused here instead of re-running the (slow) cleaning over every review.
reviews = df_sample['cleaned_review']
labels = df_sample['sentiment']

# Split the review texts into training and testing sets (80%/20%) BEFORE
# building TF-IDF, so the vocabulary and IDF weights are learned from the
# training reviews only and nothing from the test set leaks into the features.
train_reviews, test_reviews, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)

# Build TF-IDF representation from the review text
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_reviews)
X_test = tfidf_vectorizer.transform(test_reviews)

# Full feature matrix under the original name `X`, kept for the later cell
# that splits `X`. With the same test_size and random_state that split picks
# the same rows, so its training part matches the reviews used for fitting.
X = tfidf_vectorizer.transform(reviews)

# Train a machine learning model (Naive Bayes) on the training set to predict sentiment of review text
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model and print the classification report
print(classification_report(y_test, y_pred))

# Build TF-IDF Representation
Build a TF-IDF representation from the review text.

In [None]:
# Task 2: Sentiment Analysis using TF-IDF

# 'cleaned_review' was already produced by the preprocessing step, so it is
# reused here instead of re-running the (slow) cleaning over every review.
reviews = df_sample['cleaned_review']
labels = df_sample['sentiment']

# Split the review texts into training and testing sets (80%/20%) BEFORE
# building TF-IDF, so the vocabulary and IDF weights are learned from the
# training reviews only and nothing from the test set leaks into the features.
train_reviews, test_reviews, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)

# Build TF-IDF representation from the review text
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_reviews)
X_test = tfidf_vectorizer.transform(test_reviews)

# Full feature matrix under the original name `X`, kept for the later cell
# that splits `X`. With the same test_size and random_state that split picks
# the same rows, so its training part matches the reviews used for fitting.
X = tfidf_vectorizer.transform(reviews)

# Train a machine learning model (Naive Bayes) on the training set to predict sentiment of review text
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model and print the classification report
print(classification_report(y_test, y_pred))

# Split Data into Training and Testing Sets
Split the data into training and testing sets with an 80%/20% ratio.

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_sample['sentiment'],
    random_state=42,
    test_size=0.2,
)

# Train Machine Learning Model
Train a machine learning model on the training set to predict the sentiment of review text.

In [None]:
# Fit a multinomial Naive Bayes sentiment classifier on the training features
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

# Predict a sentiment label for every held-out review
y_pred = model.predict(X_test)

# Report per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))

# Evaluate Model Performance
Make predictions and evaluate the model on the testing set. Print the classification report to show the prediction metrics.

In [None]:
# Predict a sentiment label for every held-out review
y_pred = model.predict(X_test)

# Report per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred))