# Imports and Setup

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to imported .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np

# Text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Display setting only: allow up to 200 characters per cell when DataFrames are
# rendered, so longer ticket text can be inspected by eye.
pd.options.display.max_colwidth = 200

# Load Preprocessed CSV

In [7]:
# Read the preprocessed tickets, using ticket_id as the index at load time
# (same result as a later set_index, without mutating the frame in place).
df = pd.read_csv("../data/tickets_preprocessed.csv", index_col="ticket_id")

# An empty text field is read back from CSV as NaN, which TfidfVectorizer
# rejects as an invalid document. Report such rows and treat them as empty
# strings so the vectorization step does not fail on them.
n_missing_text = int(df["processed_text"].isna().sum())
if n_missing_text:
    print(f"Warning: {n_missing_text} ticket(s) have no processed_text; treating as empty strings.")
    df["processed_text"] = df["processed_text"].fillna("")

# Confirm structure and content
print("Shape:", df.shape)
df[["department","processed_text"]].head(10)

Shape: (60, 6)


Unnamed: 0_level_0,department,processed_text
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,IT,unable connect vpn home
2,IT,outlook crash every time open email attachment
3,IT,laptop battery last minute full charge
4,IT,team pick microphone even though work apps
5,IT,wifi disconnect randomly throughout day restart
6,IT,blue screen appear presentation system reboot
7,IT,keyboard key stick occasionally register
8,IT,instal late window update mouse lag badly
9,IT,print laptop office printer
10,IT,system extremely slow run multiple browser tab


# TF-IDF Vectorization

In [8]:
# Vocabulary filters for the TF-IDF model, gathered in one place for tuning.
tfidf_options = {
    "max_df": 0.95,           # skip terms present in more than 95% of tickets
    "min_df": 2,              # skip terms present in fewer than 2 tickets
    "stop_words": "english",  # built-in English stop-word list, as a safety net
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the ticket-by-term matrix in a single pass.
tfidf_matrix = vectorizer.fit_transform(df["processed_text"])

# Scale every row to unit L2 length before clustering.
# NOTE(review): TfidfVectorizer's default is norm="l2", so this step is likely
# a no-op here — confirm before removing it.
tfidf_matrix_norm = normalize(tfidf_matrix, norm="l2")

# Report (n_tickets, n_terms) as a sanity check.
print(f"TF-IDF matrix shape: {tfidf_matrix_norm.shape}")


TF-IDF matrix shape: (60, 54)


## Inspect top terms by average TF-IDF score

In [10]:
# Rank vocabulary terms by their mean TF-IDF weight across all tickets.
feature_names = vectorizer.get_feature_names_out()

# mean(axis=0) gives one value per term. np.asarray(...).ravel() flattens it to
# 1-D whether the result is an np.matrix (sparse-matrix input) or already a
# plain ndarray, unlike the matrix-only `.A1` attribute.
avg_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# Create DataFrame of terms and scores, highest average weight first
tfidf_df = pd.DataFrame({
    "term": feature_names,
    "avg_tfidf": avg_tfidf_scores
}).sort_values(by="avg_tfidf", ascending=False)

# Show top 10 terms
tfidf_df.head(10)


Unnamed: 0,term,avg_tfidf
31,rack,0.052636
42,server,0.047532
45,team,0.044672
19,laptop,0.044476
0,access,0.04321
4,battery,0.038362
2,aisle,0.036703
15,fail,0.036218
49,unable,0.036172
30,power,0.03582


# Dimensionality Reduction Using Truncated SVD

In [11]:
from sklearn.decomposition import TruncatedSVD

# Target number of latent dimensions (tunable).
# NOTE(review): the saved run shows a 54-term vocabulary, so 50 components
# removes very little — consider a smaller value if real reduction is wanted.
n_components = 50

# Fixed random_state keeps the decomposition reproducible across runs.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit on the normalized TF-IDF matrix and project every ticket into the
# reduced space in one call.
X_reduced = svd.fit_transform(tfidf_matrix_norm)

# Sanity check: expect (n_tickets, n_components).
print(f"Reduced shape: {X_reduced.shape}")

Reduced shape: (60, 50)


# Save Outputs for Clustering

In [13]:
import joblib

# Persist the reduced vectors as CSV, keyed by ticket ID (the index of df).
# NOTE(review): the "50d" in the file name is hard-coded; it matches
# n_components only while that value stays at 50.
reduced_df = pd.DataFrame(data=X_reduced, index=df.index)
reduced_df.to_csv("../data/tfidf_reduced_50d.csv")

# Persist the fitted vectorizer and SVD model next to the reduced vectors.
for fitted_model, target_path in (
    (vectorizer, "../data/tfidf_vectorizer.joblib"),
    (svd, "../data/truncated_svd.joblib"),
):
    joblib.dump(fitted_model, target_path)

print("Saved: tfidf_reduced_50d.csv, tfidf_vectorizer.joblib, truncated_svd.joblib")

Saved: tfidf_reduced_50d.csv, tfidf_vectorizer.joblib, truncated_svd.joblib
