In [1]:
!pip3 install nltk
!pip install -U gensim




In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data used by the preprocessing step: tokenizer models,
# the English stopword list, and WordNet (for the lemmatizer).
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt'.
# nltk is installed unpinned above, so request both to keep word_tokenize working.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Matches every character that is neither an ASCII letter nor whitespace
# (digits, punctuation, symbols).
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the lemmatizer and stopword set are built once,
# not once per query.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one query into a space-separated string of lemmatized tokens.

    Steps: lowercase the text, replace every non-letter character with a space,
    tokenize with ``nltk.word_tokenize``, drop English stopwords, and lemmatize
    the remaining tokens with ``WordNetLemmatizer``.

    Args:
        text: Raw query string.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when no
        token survives.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace special characters and numbers with a space. Deleting them outright
    # would glue neighbouring words together ("pickup/drop" -> "pickupdrop") and turn
    # contractions such as "isn't" into "isnt", which the stopword filter would miss.
    text = _NON_LETTER_RE.sub(' ', text)

    # Tokenize text
    words = nltk.word_tokenize(text)

    # Remove stopwords and perform lemmatization
    lemmatizer, stop_words = _get_preprocess_resources()
    filtered_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(filtered_words)

# Read queries from a text file, one query per line.
# Blank lines are skipped so they never reach the embedding step as empty token lists.
# The encoding is explicit so the read does not depend on the platform default.
with open('HR_Transport_queries.txt', 'r', encoding='utf-8') as file:
    queries = [line for line in file if line.strip()]

# Normalize every query; the result stays index-aligned with `queries`.
preprocessed_queries = [preprocess_text(query) for query in queries]
print(preprocessed_queries)

['policy aim assist synechron india employee commuting', 'policy apply term employee individual', 'various mode transportation covered policy', 'could provide detail bus transport timing different shift', 'exception availability bus transport certain location', 'cab transport service work billable project', 'circumstance employee production support avail cab facility', 'considered odd shift employee avail cab service shift', 'employee request cab service based project requirement', 'elaborate process making special request cab service case medical emergency', 'situation employee use cab service official duty purpose', 'find detailed information bus route registration process', 'document provides comprehensive information cab operation find', 'policy ensure safety comfort employee commute', 'provision place ensure compliance defined policy regarding transport service', 'purpose synechron escort accompany female employee airport pickup drop', 'time escort required accompany female employ

In [5]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Create a TF-IDF vectorizer
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_queries)

import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

from sklearn.decomposition import PCA

# Train a Word2Vec model on the preprocessed queries.
# A fixed seed and a single worker thread make training repeatable within one
# interpreter session. Repeatability across sessions also needs PYTHONHASHSEED set.
sentences = [query.split() for query in preprocessed_queries]
word2vec_model = Word2Vec(
    sentences, vector_size=100, window=5, min_count=1, sg=0,  # You can adjust parameters
    seed=41, workers=1,
)

# Create embeddings for each query by averaging word embeddings.
# A query whose tokens were all removed during preprocessing has nothing to average,
# so it gets a zero vector instead of the NaN that np.mean([]) would return.
query_embeddings = np.array([
    np.mean([word2vec_model.wv[word] for word in sentence], axis=0)
    if sentence
    else np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
    for sentence in sentences
])
query_embeddings.shape


(376, 100)

In [None]:
# Perform K-Means clustering on the query embeddings
num_clusters = 2  # You can adjust this value
# n_init is pinned because its default differs between scikit-learn releases
# (10 historically, 'auto' in newer versions), which would change the clustering.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=41)
clusters = kmeans.fit_predict(query_embeddings)

# Print the cluster assignments
for query, cluster in zip(queries, clusters):
    print(f"Query: {query.strip()}\nCluster: {cluster}\n")

# from sklearn.cluster import KMeans

# # Number of clusters you want to create
# num_clusters = 2  # You can adjust this value

# # Perform K-Means clustering
# kmeans = KMeans(n_clusters=num_clusters, random_state=41)
# clusters = kmeans.fit_predict(tfidf_matrix)

# # Print the cluster assignments
# for query, cluster in zip(queries, clusters):
#     print(f"Query: {query.strip()}\nCluster: {cluster}\n")

In [7]:
import inspect

# Reduce the embeddings with PCA before t-SNE.
# n_components must not exceed min(number of queries, embedding size).
pca = PCA(n_components=5, random_state=41)
pca_result = pca.fit_transform(query_embeddings)

# Perform t-SNE dimensionality reduction on PCA results.
# scikit-learn renamed TSNE's `n_iter` argument to `max_iter`; use whichever name the
# installed version accepts so the cell runs on both older and newer releases.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
# random_state makes the 2-D layout repeatable between runs.
tsne = TSNE(n_components=2, perplexity=50, random_state=41, **{tsne_iter_kwarg: 500})
tsne_result = tsne.fit_transform(pca_result)



In [8]:
import plotly.express as px
import plotly.io as pio
import pandas as pd

# Map cluster ids to readable legend names.
# px.scatter's `labels` argument renames columns, not category values, so the
# renaming has to be applied to the data itself.
# NOTE(review): assumes cluster 1 is the in-topic group for this seed — confirm.
cluster_names = {"1": "In-Topic", "0": "Off-Topic"}
cluster_labels = [cluster_names.get(str(cluster), str(cluster)) for cluster in clusters]

# Create a DataFrame with the data for Plotly
data = {
    'x': tsne_result[:, 0].tolist(),
    'y': tsne_result[:, 1].tolist(),
    'cluster': cluster_labels,
    # Strip trailing newlines so the hover text shows the query cleanly.
    'query': [query.strip() for query in queries],
}
df = pd.DataFrame(data)

# Create the scatter plot using Plotly, coloured by cluster
fig = px.scatter(df, x='x', y='y', hover_name='query', color='cluster',
                 labels={'cluster': 'Cluster'})

# Show the plot
fig.update_layout(title='t-SNE Visualization of Clusters (with Word2Vec and PCA + t-SNE)',
                  xaxis_title='t-SNE_Dim1', yaxis_title='t-SNE_Dim2')
fig.show()

