In [None]:
import pandas as pd
import numpy as np
import joblib
import os

from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Read the resume dataset from the data directory next to this notebook's folder.
df = pd.read_csv("../data/Resume.csv")

# Show the first rows and the column index, then the total row count.
print(df.head(), df.columns, sep="\n")
print("Total samples:", len(df))


In [None]:
# Only the two columns read by the cells visible in this notebook have to be
# complete. A blanket dropna() would also throw away rows whose only gap is in
# a column that is never used, and inplace=True mutates the frame a previous
# cell displayed; rebinding keeps the step explicit.
df = df.dropna(subset=["Resume_str", "Category"])

# Shuffle all rows with a fixed seed and renumber them 0..n-1.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Plain Python lists of the resume text and its category label.
texts = df["Resume_str"].astype(str).tolist()
labels = df["Category"].astype(str).tolist()

print("Clean samples:", len(texts))


In [None]:
import re

def clean_text(text):
    """Normalise a resume string to lowercase ASCII letters and single spaces.

    The text is lowercased, then three substitutions run in order:
    anything starting with ``http`` up to the next whitespace becomes a space,
    every character that is not an ASCII letter or a space becomes a space,
    and each run of whitespace collapses to one space. Leading and trailing
    whitespace is stripped from the result.

    Args:
        text: The string to clean (must support ``.lower()``).

    Returns:
        The cleaned string; empty if nothing but non-letters was present.
    """
    # (pattern, replacement) pairs; order matters because URLs must be
    # removed before their punctuation is turned into spaces.
    substitutions = (
        (r"http\S+", " "),
        (r"[^a-zA-Z ]", " "),
        (r"\s+", " "),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run clean_text over every resume and write the result back to the same column.
cleaned_resumes = df["Resume_str"].apply(clean_text)
df["Resume_str"] = cleaned_resumes


In [None]:
# Number of resumes per category.
class_counts = df["Category"].value_counts()

# Categories with at least 20 resumes are kept; all other rows are dropped.
has_enough_samples = class_counts >= 20
valid_classes = class_counts[has_enough_samples].index
df = df.loc[df["Category"].isin(valid_classes)]

print("Remaining classes:", df["Category"].nunique())
print("Remaining samples:", len(df))


In [None]:
# Shuffle the rows with a fixed seed and renumber them from zero.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Pull the text and label columns out as parallel Python lists.
texts, labels = (df[column].tolist() for column in ("Resume_str", "Category"))


In [None]:
# Sentence-embedding model, loaded by name.
embedder = SentenceTransformer("all-mpnet-base-v2")

# X: one embedding per entry of `texts` (progress bar shown while encoding);
# y: the label list, used unchanged as the classification target.
X, y = embedder.encode(texts, show_progress_bar=True), labels


In [None]:
# Hold out 20% of the samples for testing, with a fixed seed.
# stratify=y keeps each category's share the same in both partitions; without
# it a random split of a multi-class dataset can leave some categories with
# very few (or no) test samples, which makes the per-class report unreliable.
# Stratification needs at least two samples in every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: a linear kernel at three values of C.
param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear"]
}

# 5-fold cross-validated search scored on accuracy.
# probability=True makes SVC calibrate class probabilities with an internal,
# randomised cross-validation, so the estimator is seeded (same seed as the
# rest of the notebook) to make the fitted model reproducible across runs.
grid = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid,
    cv=5,
    scoring="accuracy"
)

grid.fit(X_train, y_train)

# Estimator for the best-scoring parameter combination.
svm_model = grid.best_estimator_

print("Best parameters:", grid.best_params_)


In [None]:
# Predict categories for the held-out samples.
y_pred = svm_model.predict(X_test)

# Overall accuracy first, then the per-class precision/recall/F1 table.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", test_accuracy)

test_report = classification_report(y_true=y_test, y_pred=y_pred)
print(test_report)


In [None]:
# Create the output directory if it does not exist yet.
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

# Serialise the fitted classifier. Only the SVM is stored here; the embedding
# model is not part of this file.
joblib.dump(value=svm_model, filename="../models/svm_model.pkl")

print("Model saved successfully")
