In [1]:
# Step 1: Import Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load Dataset
df = pd.read_csv("/Users/macbook/Documents/NewMLTwo/Skill Gap/UpdatedResumeDataSet.csv")  # path to your dataset

# Step 3: Preprocess Resume Text
def clean_resume(text):
    text = re.sub('http[s]?://\S+', '', text)  # remove URLs
    text = re.sub('\s+', ' ', text)            # normalize spaces
    text = re.sub('[^A-Za-z0-9 ]+', '', text)  # remove special characters
    return text.lower().strip()

df['Cleaned_Resume'] = df['Resume'].apply(clean_resume)

# Step 4: Vectorize Text using TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['Cleaned_Resume'])

# Step 5: Prepare Target Labels
y = df['Category']

# Step 6: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 8: Evaluate Model
y_pred = model.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  text = re.sub('http[s]?://\S+', '', text)  # remove URLs
  text = re.sub('\s+', ' ', text)            # normalize spaces


Accuracy Score: 0.9948186528497409

Classification Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00        12
 

In [3]:
import pickle

# Save model
with open("models/resume_classifier_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Save TF-IDF vectorizer
with open("models/tfidf_vectorizer.pkl", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)
