In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(data.columns)


Index(['Group', 'Position', 'Row', 'ClassLabel', 'Resume'], dtype='object')


In [3]:
# Load the dataset
input_data_file = 'ResumeDataSet_TrainTest.xlsx'  # Replace with your file path
data = pd.read_excel(input_data_file, sheet_name="ResumeDataSet")

# Inspect the data
print(data.head())

# Check the unique values in the 'Category' column (target column)
print("Unique Class Labels in Data:")
print(data['Category'].unique())

         Group              Position  Row  ClassLabel  \
0  DeveloperIT  Automation Developer    1           0   
1  DeveloperIT  Automation Developer    2           0   
2  DeveloperIT  Automation Developer    3           0   
3  DeveloperIT  Automation Developer    4           0   
4  DeveloperIT  Automation Developer    5           0   

                                              Resume  
0  * Excellent grasping power in learning new con...  
1  SOCIAL SKILLS: Ã¢Â€Â¢ Ability to establish tru...  
2  TECHNICAL SKILLS Automation Testing Ã¢Â€Â¢ Sel...  
3  SKILLS Agile Methodology Scrum, Kanban, Extrem...  
4  Technical Skills Summary I have completed "COR...  
Unique Class Labels in Data:


KeyError: 'Category'

In [None]:
# Function to clean the text
def preprocess_text(text):
    text = text.lower()  # Lowercase the text
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

# Apply preprocessing to 'Resume' column (assuming it's the raw text column)
data['CleanDescription'] = data['Resume'].apply(preprocess_text)

# Verify the cleaned text
print(data[['Resume', 'CleanDescription']].head())

In [None]:
# Initialize the LabelEncoder
encoder = LabelEncoder()

# Encode the 'Category' column (target variable) into numerical labels
data['EncodedClassLabel'] = encoder.fit_transform(data['Category'])

# Verify the re-encoded labels
print("Re-encoded Class Labels:")
print(data[['Category', 'EncodedClassLabel']].head())

# Get the list of class names (to map predictions to original labels later)
class_names = encoder.classes_
print("Class Names (Mapped Labels):", class_names)


In [None]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(ngram_range=(2, 4))  # Adjust n-grams as necessary

# Fit the vectorizer on the cleaned text and transform the data
X = tfidf.fit_transform(data['CleanDescription'])
y = data['EncodedClassLabel']  # Using the encoded class labels for training

print(f"Shape of Feature Matrix (X): {X.shape}")

In [None]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Verify the shape of the train and test data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

In [None]:
# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the accuracy score
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2f}")

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

In [None]:
import pickle

# Save the trained model to a file
with open('resume_classifier_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the class names (reverse the encoding from integers to original categories)
with open('class_names.pkl', 'wb') as f:
    pickle.dump(class_names, f)

print("Model and class names saved!")

In [None]:
# Load the saved model and class names
with open('resume_classifier_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

with open('class_names.pkl', 'rb') as f:
    class_names = pickle.load(f)

# Make prediction on new resume
new_resume = "Example resume content"
new_resume_cleaned = preprocess_text(new_resume)  # Clean the new resume text
new_resume_vectorized = tfidf.transform([new_resume_cleaned])  # Vectorize the cleaned text

# Predict the class label
predicted_class = loaded_model.predict(new_resume_vectorized)
print(f"Predicted Class Label: {class_names[predicted_class[0]]}")
