### Import Libraries

In [1]:
import pandas as pd
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

### Load Dataset

In [2]:
# 20 Low Quality commit messages (vague/short)
low_quality = [
    "fixed stuff", "update code", "bug fix", "misc changes",
    "minor update", "quick change", "changes done", "update readme",
    "fix issues", "edit files", "temporary fix", "tweak code",
    "small change", "refactor", "fix typo", "update docs",
    "minor fix", "patch code", "code cleanup", "adjust files"
]

# 20 High Quality commit messages (descriptive)
high_quality = [
    "add error handling in API", "optimize database queries",
    "improve login validation", "fix typo in README",
    "refactor user model", "update docs for installation",
    "improve UI responsiveness", "resolve crash when user logs out",
    "add unit tests for user service", "fix null pointer in checkout process",
    "implement JWT authentication for users", "update search functionality with caching",
    "add email verification on signup", "fix memory leak in data processing",
    "refactor payment module for efficiency", "update API response formatting",
    "improve password encryption", "fix session timeout issue",
    "add logging for error tracking", "enhance file upload validation"
]

# Combine into one DataFrame
data = {
    "commit_message": low_quality + high_quality,
    "label": [0]*20 + [1]*20  # 0 = Low Quality, 1 = High Quality
}

df = pd.DataFrame(data)
print("\nLabel distribution:\n", df["label"].value_counts())


Label distribution:
 label
0    20
1    20
Name: count, dtype: int64


### Preprocess Text

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return text

df["cleaned"] = df["commit_message"].apply(clean_text)

### Split Data

X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned"], df["label"], test_size=0.2, random_state=42
)

### Convert Text to Features (TF-IDF)

In [4]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words="english")
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

### Train Classifier

In [5]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

### Evaluate Model

In [6]:
y_pred = model.predict(X_test_vec)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[3 2]
 [1 2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.50      0.67      0.57         3

    accuracy                           0.62         8
   macro avg       0.62      0.63      0.62         8
weighted avg       0.66      0.62      0.63         8



### PKL file

In [7]:
joblib.dump(model, "commit_classifier.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']