<a href="https://colab.research.google.com/github/SrijaGuduru/SrijaGuduru/blob/main/%3C2203A51743%3E_%3C25%3E_Lab09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Install necessary libraries
!pip install nltk gensim scikit-learn matplotlib pandas

# Import required libraries
import os
import numpy as np
import pandas as pd
import re
import string
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive

# Download NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Mount Google Drive (for Colab users)
drive.mount('/content/drive', force_remount=True)

# Define the dataset path
dataset_path = "/content/drive/My Drive/Colab Notebooks/IRS Project/AMAZON_FASHION_5_part0.csv"

# Load dataset
try:
    df = pd.read_csv(dataset_path, encoding='latin1', delimiter="\t", names=['asin', 'reviewText', 'overall', 'category', 'summary'])
    print("✅ Dataset loaded successfully!")
except Exception as e:
    print("❌ Error loading dataset:", e)

# Ensure 'reviewText' is a string before processing
df['reviewText'] = df['reviewText'].astype(str).str.lower()

# Tokenize text
tokenized_texts = [word_tokenize(sentence) for sentence in df["reviewText"]]

# Ensure valid tokenized data
if len(tokenized_texts) == 0:
    raise ValueError("❌ Tokenized text data is empty. Check dataset preprocessing!")

# Train CBOW Model
cbow_model = Word2Vec(sentences=tokenized_texts, vector_size=50, window=5, min_count=2, sg=0)

# Train Skip-gram Model
skipgram_model = Word2Vec(sentences=tokenized_texts, vector_size=50, window=5, min_count=2, sg=1)

# Function to visualize Word2Vec embeddings
def visualize_embeddings(model):
    words = list(model.wv.index_to_key)[:100]  # Limit to first 100 words
    if len(words) < 2:
        print("❌ Not enough words to visualize.")
        return

    vectors = np.array([model.wv[word] for word in words])

    # Set perplexity dynamically (min: 5, max: 30)
    perplexity_value = max(5, min(30, len(words) - 1))

    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_value)
    reduced_vectors = tsne.fit_transform(vectors)

    plt.figure(figsize=(10, 6))
    for word, coord in zip(words, reduced_vectors):
        plt.scatter(coord[0], coord[1])
        plt.annotate(word, (coord[0], coord[1]), fontsize=9)

    plt.title("Word Embeddings Visualization")
    plt.show()

# Visualize embeddings
visualize_embeddings(cbow_model)
visualize_embeddings(skipgram_model)

# Train a custom Word2Vec model with tuned hyperparameters
custom_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=3, min_count=2, sg=1, epochs=20)

# Save and load the model
custom_model.save("custom_word2vec.model")
loaded_model = Word2Vec.load("custom_word2vec.model")

# Test most similar words
try:
    print("Similar to 'fantastic':", loaded_model.wv.most_similar("fantastic"))
except KeyError:
    print("Word 'fantastic' not in vocabulary!")

# Word analogy example
try:
    print("Word analogy (king + woman - man):", loaded_model.wv.most_similar(positive=['king', 'woman'], negative=['man']))
except KeyError:
    print("One or more words missing in vocabulary!")

# Function to convert sentences to vectors
def get_sentence_vector(sentence, model):
    words = word_tokenize(sentence.lower())
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Convert dataset text into vectors
X = np.array([get_sentence_vector(sent, loaded_model) for sent in df["reviewText"]])

# Prepare classification labels (1 = Positive, 0 = Negative based on rating)
y = df['overall'].apply(lambda x: 1 if x > 3 else 0).to_numpy()

# Ensure compatible shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier (Random Forest)
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Evaluate classifier
y_pred = classifier.predict(X_test)
print("✅ Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Mounted at /content/drive
✅ Dataset loaded successfully!
❌ Not enough words to visualize.
❌ Not enough words to visualize.
Word 'fantastic' not in vocabulary!
One or more words missing in vocabulary!
Shape of X: (3320, 100)
Shape of y: (3320,)
✅ Model Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       664

    accuracy                           1.00       664
   macro avg       1.00      1.00      1.00       664
weighted avg       1.00      1.00      1.00       664

