In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import nltk

# Fetch the NLTK resources this notebook uses:
#   - punkt:            tokenizer models for nltk.word_tokenize
#   - wordnet, omw-1.4: WordNet data for WordNetLemmatizer
#   - stopwords:        corpus read by stopwords.words("english") in a later cell
# The stopwords download was missing; on a machine without that corpus the
# stopwords.words(...) call raises LookupError.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Karan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Karan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Karan\AppData\Roaming\nltk_data...


In [2]:
# Read the GitHub issues CSV (looked up relative to the working directory)
# into the working DataFrame.
issues_csv_path = "github_issues.csv"
df = pd.read_csv(issues_csv_path)

In [3]:
# Keep a random subset of at most 10,000 rows.
# - The size is capped at len(df) so a CSV with fewer rows does not make
#   DataFrame.sample raise ValueError (sampling is without replacement).
# - random_state is fixed (same seed as the KMeans calls below) so a fresh
#   "Restart & Run All" draws the same rows and reproduces the same clusters.
sample_size = min(10000, len(df))
df = df.sample(n=sample_size, random_state=42)

In [4]:
# Shared helpers for clean_text(): a single lemmatizer instance, and the
# English stop-word list held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def clean_text(text):
    """Normalize one issue body into a space-separated string of lemmatized tokens.

    Processing steps:
      1. Replace newlines and carriage returns with spaces.
      2. Drop every character that is not an ASCII letter or whitespace
         (this removes digits and punctuation in one pass).
      3. Lowercase the text.
      4. Tokenize with ``nltk.word_tokenize``, discard tokens present in the
         module-level ``stop_words`` set, and lemmatize the remaining tokens
         with the module-level ``lemmatizer``.

    Args:
        text: Raw text of one document. Non-string values (for example the
            float NaN pandas produces for an empty CSV cell, or ``None``)
            are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when
        ``text`` is not a string.
    """
    # Missing bodies show up as NaN/None; without this guard the str methods
    # below raise AttributeError in the middle of Series.apply.
    if not isinstance(text, str):
        return ""

    text = text.replace("\n", " ").replace("\r", " ")
    # Keeping only letters and whitespace already strips digits, so no
    # separate digit-removal pass is needed.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()

    tokens = nltk.word_tokenize(text)
    kept_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]
    # Result is returned directly rather than bound to a local called
    # `clean_text`, which would shadow this function's own name.
    return " ".join(kept_tokens)


# Run every issue body through clean_text() and store the result in a new
# "clean_text" column next to the raw text.
df["clean_text"] = df["body"].map(clean_text)

In [5]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned documents and
# transform them into the feature matrix X in one step.
vectorizer = TfidfVectorizer()
cleaned_documents = df["clean_text"]
X = vectorizer.fit_transform(cleaned_documents)

In [6]:
# Pick the number of clusters by silhouette score: fit KMeans for each
# candidate k and keep the k whose clustering scores highest.
candidate_ks = range(2, 10)
scores = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_)
    scores.append(score)

# argmax is a position within `scores`; shift by the first candidate to turn
# it back into an actual k value.
optimal_k = np.argmax(scores) + candidate_ks.start
print("Optimal number of clusters:", optimal_k)

Optimal number of clusters: 2


In [7]:
# Fit the final K-Means model with the selected number of clusters.
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(X)

# Record each document's assigned cluster in the dataframe.
df["cluster_label"] = kmeans.labels_

In [8]:
import pickle

# Serialize the fitted KMeans model to kmeans_model.pkl in the working directory.
# NOTE(review): only the clusterer is written here; the fitted TfidfVectorizer
# that produced X is not saved anywhere in this notebook, so clustering new
# text from this file alone would presumably need it too - confirm intent.
with open("kmeans_model.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)