# Imports and Setup

In [14]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to source files are picked up without a
# kernel restart.
# NOTE(review): no project-local modules are imported in the cells visible
# here — confirm this is still needed.
%load_ext autoreload
%autoreload 2

In [15]:
import pandas as pd
import numpy as np

# Text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Optional: raise the column display width to 200 characters so long text
# cells are not truncated when DataFrames are previewed below.
pd.set_option("display.max_colwidth", 200)

# Load Preprocessed CSV

In [16]:
# Read the preprocessed tickets, using ticket_id as the index directly at load
# time instead of mutating the frame in place afterwards.
df = pd.read_csv("../data/tickets_preprocessed.csv", index_col="ticket_id")

# A ticket whose text was emptied by preprocessing is stored in the CSV as an
# empty field and comes back as NaN, which TfidfVectorizer refuses to
# vectorize. Restore such values to empty strings (no-op when none exist).
df["processed_text"] = df["processed_text"].fillna("")

# Confirm structure and content
print("Shape:", df.shape)
df[["department","processed_text"]].head(10)

Shape: (100, 6)


Unnamed: 0_level_0,department,processed_text
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,IT,vpn disconnect randomly video call start last system update
2,IT,laptop battery dy within minute even full charge
3,IT,outlook keep freeze try search email
4,IT,wifi drop every time join team meeting
5,IT,laptop connect company network try restart update driver
6,IT,team notification show desktop hour later
7,IT,unable install zoom due admin restriction
8,IT,external keyboard stop work plug dock station
9,IT,outlook calendar event miss sync mobile
10,IT,system extremely slow last antivirus update


# TF-IDF Vectorization

In [17]:
vectorizer = TfidfVectorizer(
    stop_words='english',  # built-in English list, as a safety net if any remain
    min_df=2,              # drop terms found in fewer than 2 tickets
    max_df=0.95,           # drop terms found in more than 95% of tickets
)

# Learn the vocabulary and IDF weights, then encode every ticket
tfidf_matrix = vectorizer.fit_transform(df["processed_text"])

# Row-wise L2 scaling for clustering. TfidfVectorizer's default norm is
# already "l2", so this step is expected to leave the values essentially
# unchanged; it is kept so downstream cells can rely on tfidf_matrix_norm.
tfidf_matrix_norm = normalize(tfidf_matrix)

# Report the matrix dimensions: (tickets, vocabulary terms)
print(f"TF-IDF matrix shape: {tfidf_matrix_norm.shape}")


TF-IDF matrix shape: (100, 115)


## Inspect top terms by average TF-IDF score

In [18]:
# Vocabulary terms, in the same column order as the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()

# Mean TF-IDF weight of each term across all tickets, flattened to 1D.
# np.asarray(...).ravel() gives the same result as .A1 on an np.matrix but
# also works when the sparse container returns a plain ndarray (which has
# no .A1 attribute).
avg_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# One row per term, highest average weight first
tfidf_df = pd.DataFrame({
    "term": feature_names,
    "avg_tfidf": avg_tfidf_scores
}).sort_values(by="avg_tfidf", ascending=False)

# Show top 10 terms
tfidf_df.head(10)


Unnamed: 0,term,avg_tfidf
90,server,0.038122
74,rack,0.036861
106,update,0.03616
0,access,0.035362
31,fail,0.032622
27,drop,0.030104
95,switch,0.0295
98,team,0.029493
112,work,0.028439
64,outlook,0.02789


# Dimensionality Reduction Using Truncated SVD

In [19]:
from sklearn.decomposition import TruncatedSVD

# Target dimensionality of the reduced space (tunable)
n_components = 50

# Fixed seed so repeated runs produce the same decomposition
svd = TruncatedSVD(random_state=42, n_components=n_components)

# Fit on the normalized TF-IDF matrix and project every ticket in one step
X_reduced = svd.fit_transform(tfidf_matrix_norm)

# Sanity check: expect (number of tickets, n_components)
print(f"Reduced shape: {X_reduced.shape}")

Reduced shape: (100, 50)


# Save Outputs for Clustering

In [20]:
import joblib

# Persist the reduced matrix as CSV, one row per ticket, indexed like df
reduced_df = pd.DataFrame(X_reduced, index=df.index)
reduced_df.to_csv("../data/tfidf_reduced_50d.csv")

# Persist the fitted vectorizer and SVD model alongside the reduced matrix
for _fitted, _target in (
    (vectorizer, "../data/tfidf_vectorizer.joblib"),
    (svd, "../data/truncated_svd.joblib"),
):
    joblib.dump(_fitted, _target)

print("Saved: tfidf_reduced_50d.csv, tfidf_vectorizer.joblib, truncated_svd.joblib")

Saved: tfidf_reduced_50d.csv, tfidf_vectorizer.joblib, truncated_svd.joblib
