# Assignment Solution - NLP - **Group 121**

## Step 1: Install & Import Required Libraries

In [22]:
#Importing core data libraries
import pandas as pd# For loading and working with tabular data (like DataFrames)
import numpy as np# For numerical operations and array manipulations

#TF-IDF and similarity
from sklearn.feature_extraction.text import TfidfVectorizer  # To convert text into numerical features
from sklearn.metrics.pairwise import cosine_similarity       # To measure similarity between vectors
from sklearn.decomposition import PCA                        # For reducing feature dimensions (used in plotting)

#Visualization libraries
import matplotlib.pyplot as plt    # For creating plots and visualizations

#Natural Language Toolkit (NLTK) for text processing
import nltk
from nltk.tokenize import word_tokenize                    # To split sentences into individual words
from nltk.corpus import stopwords                          # To remove common stop words like "is", "the", etc.
from nltk.stem import PorterStemmer, WordNetLemmatizer     # To reduce words to their root form

#For handling punctuation
import string

#Suppress unnecessary warning messages
import warnings
warnings.filterwarnings("ignore")

#Utility for wrapping long text for better readability in print
import textwrap

#Statistical tools for language modeling
from collections import defaultdict, Counter               # For efficient counting of unigrams and bigrams
import math                                                # For log and exponential calculations

#Install and import external library used to adjust text labels in plots
!pip install adjustText --quiet                            # Install only if not already available
from adjustText import adjust_text                         # For avoiding label overlap in PCA plots


In [23]:
# Widen pandas' display limits so long review texts and wide frames are shown
# with less truncation when a DataFrame is rendered as cell output.
display_settings = {
    'display.max_rows': 100,
    'display.max_columns': 50,
    'display.max_colwidth': 200,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)


## Step 2: Download NLTK Resources

In [24]:
# Download the NLTK data packages this notebook relies on.
# nltk.download() signals failure by returning False instead of raising, so every
# status is checked here; previously only the last call's result was visible.
nltk_resources = ('punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')
failed_downloads = [name for name in nltk_resources if not nltk.download(name)]

if failed_downloads:
    # print() rather than warnings.warn(): the import cell installs
    # warnings.filterwarnings("ignore"), which would hide a warning.
    # Deliberately not raising: a package may already be installed locally.
    print(f"WARNING: NLTK package(s) could not be downloaded: {', '.join(failed_downloads)}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Step 3: Upload and Load Dataset

In [25]:
# Load the review dataset from the Colab working directory into a DataFrame
# and show its (rows, columns) shape as the cell output.
from google.colab import files

IMDB_CSV_PATH = '/content/IMDB.csv'  # location of the input file inside the Colab runtime
df = pd.read_csv(IMDB_CSV_PATH)
df.shape


(620, 2)

In [26]:
# Show the first two rows of the loaded data as a quick sanity check
df.head(2)


Unnamed: 0,ID,review
0,1,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me..."
1,2,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire p..."


## Part I: Bigram Language Model - Sentence Comparison

In [27]:
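# TODO: The bigram language model implementation is missing from this cell; the code
# was truncated in the original input and must be restored (please provide the
# complete code for this section).
# NOTE(review): the PCA cell that follows uses X_tfidf, vocab and tfidf_matrix, none of
# which is defined in the cells above - presumably their definitions were lost with
# the truncated code; confirm before running the notebook top to bottom.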


In [None]:
# Plot the highest-scoring TF-IDF words as points in 2-D using PCA.
# Relies on X_tfidf, vocab and tfidf_matrix from earlier cells.
# NOTE(review): top_indices are positions along axis 1 of X_tfidf (they come from
# mean(axis=0) and are used to index vocab), yet they index the FIRST axis of
# tfidf_matrix below. This presumably means tfidf_matrix holds one row per
# vocabulary word - it is defined outside this view, so confirm.

# Number of words to plot; the progress message and the title follow this value
top_n = 30

#Notify that PCA plot generation is starting
print(f"\nPlotting PCA for Top {top_n} Most Important Words (by TF-IDF score)...")

# Step 1: Compute average TF-IDF score for each word across all documents
# np.asarray(...).ravel() flattens the result of mean(axis=0) into a 1-D array
avg_tfidf = np.asarray(X_tfidf.mean(axis=0)).ravel()

# Step 2: Select the top_n words with the highest average TF-IDF scores
top_indices = avg_tfidf.argsort()[::-1][:top_n]   # argsort is ascending, so reverse before slicing
selected_words = [vocab[i] for i in top_indices]  # Get corresponding word list
selected_vectors = tfidf_matrix[top_indices]      # Get corresponding TF-IDF vectors

# Step 3: Apply PCA to reduce TF-IDF vectors from high-dimension to 2D
pca = PCA(n_components=2)
reduced = pca.fit_transform(selected_vectors)  # Each word now has 2 coordinates

# Step 4: Create a scatter plot of these 2D points (explicit figure/axes interface)
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(reduced[:, 0], reduced[:, 1], color='skyblue', edgecolor='k', s=120)

# Step 5: Add labels (words) next to each point
texts = [
    ax.text(x_coord, y_coord, word, fontsize=10)
    for word, (x_coord, y_coord) in zip(selected_words, reduced)
]

# Step 6: Adjust text labels to prevent overlap and add arrows if needed
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='gray', lw=1))

# Step 7: Final touches – titles, labels, and layout
ax.set_title(f"Top {top_n} TF-IDF Words Visualized via PCA", fontsize=16)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.grid(True)
fig.tight_layout()
plt.show()
