# Importing Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import numpy as np
import plotly.express as px

In [2]:
# Tokenizer data used by word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both
# to keep the notebook runnable on old and new NLTK versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Loading Dataset

In [3]:
# Load the whole corpus from disk as a single string
with open('/content/Assignmentdataset.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# Naive sentence segmentation: cut the text at every period
sentences = content.split('.')

# One row per sentence fragment, held in a single 'text' column
df = pd.DataFrame({'text': sentences})

# Trim surrounding whitespace left over from the split
df['text'] = df['text'].str.strip()

# Word Embedding

In [4]:
# Function to preprocess the text
def preprocess_text(text):
    """Normalize a raw sentence and split it into lowercase word tokens.

    Runs of punctuation, whitespace and underscores are collapsed into a
    single space, digit runs are deleted, and the lowercased result is passed
    to ``word_tokenize``.

    Args:
        text: The sentence to clean.

    Returns:
        Whatever ``word_tokenize`` returns for the cleaned, lowercased text.
    """
    # `\W` alone keeps underscores because `_` counts as a word character,
    # so it is listed explicitly to really drop every non-alphanumeric symbol.
    text = re.sub(r'[\W_]+', ' ', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return word_tokenize(text.lower())

# Predicate used to filter out text with numeric data
def remove_numeric(text):
    """Report whether ``text`` is free of digit characters.

    Despite its name, this function does not modify the text: it returns
    ``True`` when no character satisfies ``str.isdigit`` (the text may be
    kept) and ``False`` as soon as one digit is found.
    """
    for char in text:
        if char.isdigit():
            return False
    return True

# Blank out (rather than drop) every sentence that contains a digit
df['text'] = df['text'].apply(lambda x: x if remove_numeric(x) else '')

# Preprocess the entire dataset (blanked/empty strings are skipped)
processed_texts = [preprocess_text(text) for text in df['text'] if text]

# Train Word2Vec model
# A single worker thread plus an explicit seed keeps training repeatable
# between runs; several workers introduce thread-scheduling jitter in the
# learned vectors. NOTE(review): gensim also recommends fixing PYTHONHASHSEED
# for reproducibility across interpreter launches - confirm if that matters here.
model = Word2Vec(
    sentences=processed_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

# Prepare data for visualization
words = list(model.wv.index_to_key)
embeddings = np.array([model.wv[word] for word in words])

# Use t-SNE to reduce dimensions (3D)
# t-SNE needs perplexity < number of samples; the library default of 30 would
# fail on a small vocabulary, so cap it while leaving large vocabularies at 30.
if len(words) < 2:
    raise ValueError("t-SNE needs at least two vocabulary words to project")
tsne = TSNE(n_components=3, perplexity=min(30.0, len(words) - 1), random_state=0)
embeddings_3d = tsne.fit_transform(embeddings)

# Create a DataFrame for Plotly: one row per word with its 3D coordinates
embedding_df = pd.DataFrame(embeddings_3d, columns=['x', 'y', 'z'])
embedding_df['word'] = words

# Plotting the Graph

In [5]:
# Draw the projected embeddings as an interactive 3D Plotly scatter,
# labelling every point with the word it represents
fig = px.scatter_3d(
    data_frame=embedding_df,
    x='x',
    y='y',
    z='z',
    text='word',
    title='Word Embeddings',
)
fig.update_traces(textposition='top center')  # place labels above the markers
fig.update_layout(showlegend=False)
fig.show()