# HR Ticket Topic Analysis using NLP and Clustering
This notebook analyzes the topics of HR tickets using NLP and clustering. It walks through text preprocessing (lowercasing, punctuation and stopword removal, lemmatization), TF-IDF vectorization, K-Means clustering, and LDA topic modeling.

In [None]:

# ==========================
# 1. Import Dependencies
# ==========================

import pandas as pd
import numpy as np

# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# For Vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# For Clustering
from sklearn.cluster import KMeans

# For Topic Modeling
from sklearn.decomposition import LatentDirichletAllocation

# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Download the NLTK data used by the preprocessing cell.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent NLTK
# releases (older releases load 'punkt'), so both are fetched to keep the
# notebook runnable on a fresh environment regardless of the installed version.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)


In [None]:

# ==========================
# 2. Load and Inspect Data
# ==========================

# Path to the ticket export; point this at your own CSV file.
DATA_PATH = 'hr_tickets_large.csv'
df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message: the preprocessing step reads this column.
if 'Ticket_Text' not in df.columns:
    raise KeyError(
        f"Expected a 'Ticket_Text' column in {DATA_PATH!r}; found {list(df.columns)}"
    )

# Peek at the data
print("Data Sample:")
df.head()


In [None]:

# ==========================
# 3. Preprocess the Text
# ==========================

# Normalise to lowercase strings. Missing tickets (NaN) become empty strings so
# the tokenizer below always receives text instead of a float, which would
# otherwise raise a TypeError.
df['text_clean'] = df['Ticket_Text'].fillna('').astype(str).str.lower()

# Strip punctuation: keep only runs of word characters, re-joined with spaces
tokenizer = RegexpTokenizer(r'\w+')
df['text_clean'] = df['text_clean'].apply(lambda x: ' '.join(tokenizer.tokenize(x)))

# Resources for stopword removal and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Drop stopwords from `text` and lemmatize what remains.

    The text is split with NLTK's `word_tokenize`; tokens that are not in the
    module-level `stop_words` set are passed through the module-level
    `lemmatizer` and re-joined with single spaces.
    """
    kept = (word for word in word_tokenize(text) if word not in stop_words)
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)

# Run stopword removal + lemmatization over every cleaned ticket
df['text_clean'] = df['text_clean'].map(preprocess_text)

print("Preprocessed Text Sample:")
df.loc[:, ['Ticket_Text', 'text_clean']].head()


In [None]:

# ==============================================
# 4. Vectorization using TF-IDF
# ==============================================

# Unigrams and bigrams, with document-frequency cut-offs at both ends
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
)
cleaned_corpus = df['text_clean']
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_corpus)

print("TF-IDF matrix shape:", tfidf_matrix.shape)


In [None]:

# ==============================================
# 5. Clustering with K-Means
# ==============================================

num_clusters = 5

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (a single run for k-means++ initialisation), so leaving it unset makes
# the resulting clusters depend on the installed library version.
kmeans_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# fit_predict fits the model and returns the per-row labels (same as .labels_)
df['cluster'] = kmeans_model.fit_predict(tfidf_matrix)

print("Cluster assignment counts:")
print(df['cluster'].value_counts())

df.head()


In [None]:

# ======================================================
# 6. Analyzing Top Terms per Cluster (Optional Insight)
# ======================================================

def get_top_keywords_for_cluster(tfidf_matrix, cluster_labels, vectorizer, top_n=10):
    """Rank the highest-weighted terms of each cluster by mean TF-IDF score.

    Parameters
    ----------
    tfidf_matrix : matrix with one row per document and one column per term
        (anything supporting integer-array row indexing and ``.mean(axis=0)``).
    cluster_labels : sequence of cluster ids, one per row of ``tfidf_matrix``.
    vectorizer : object whose ``get_feature_names_out()`` returns the term
        for each column.
    top_n : maximum number of terms per cluster; values <= 0 give empty lists.

    Returns
    -------
    dict mapping each cluster id (inserted in ascending order) to a list of
    ``(term, mean_score)`` tuples sorted from highest to lowest score.
    """
    # Look the vocabulary up once; it does not change between clusters.
    feature_names = vectorizer.get_feature_names_out()
    labels = np.asarray(cluster_labels)
    # A slice of [-0:] would return every term, so clamp and slice from the front.
    limit = max(int(top_n), 0)

    df_keywords = {}
    for cluster_num in sorted(set(labels.tolist())):
        row_indices = np.flatnonzero(labels == cluster_num)
        # The mean may come back as a 2-D np.matrix; flatten to a 1-D score vector.
        cluster_tfidf = np.asarray(tfidf_matrix[row_indices].mean(axis=0)).flatten()
        top_indices = cluster_tfidf.argsort()[::-1][:limit]
        df_keywords[cluster_num] = [
            (feature_names[i], cluster_tfidf[i]) for i in top_indices
        ]
    return df_keywords

top_keywords = get_top_keywords_for_cluster(
    tfidf_matrix,
    df['cluster'],
    tfidf_vectorizer,
    top_n=10,
)

# One report per cluster: a header line followed by "term: score" rows
for cluster_num in top_keywords:
    report_lines = [f"\nCluster {cluster_num} Top Keywords:"]
    report_lines.extend(
        f"{word}: {score:.4f}" for word, score in top_keywords[cluster_num]
    )
    print("\n".join(report_lines))


In [None]:

# ==============================================
# 7. Topic Modeling with LDA
# ==============================================

num_topics = 5

# LDA's generative model is defined over word counts, so it is fitted on raw
# term counts rather than TF-IDF weights. Reusing the TF-IDF vocabulary and
# n-gram range keeps the count columns aligned with tfidf_vectorizer's
# feature names, so the top-word lookup below stays valid.
count_vectorizer = CountVectorizer(
    vocabulary=tfidf_vectorizer.vocabulary_,
    ngram_range=tfidf_vectorizer.ngram_range,
)
count_matrix = count_vectorizer.fit_transform(df['text_clean'])

lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(count_matrix)

feature_names = tfidf_vectorizer.get_feature_names_out()
n_top_words = 10

for topic_idx, topic in enumerate(lda_model.components_):
    # Negative-step slice: indices of the n_top_words largest weights, descending
    top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
    top_features = [feature_names[i] for i in top_features_ind]
    print(f"\nTopic {topic_idx} top words:")
    print(", ".join(top_features))
