In [None]:
!pip install nltk PyPDF2 pdfplumber imbalanced-learn spacy xgboost
!python -m spacy download en_core_web_sm

import os
import PyPDF2
import pdfplumber
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
import re
import nltk
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# Fetch the NLTK stopword corpus; stopwords.words() below reads from it,
# so this call has to run first.
nltk.download('stopwords')

# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any function visible in this
# part of the file - confirm it is used elsewhere before keeping the load.
nlp = spacy.load("en_core_web_sm")

# English stopwords held in a set; clean_text() tests each token for
# membership in it.
STOPWORDS = set(stopwords.words('english'))

# Step 1: Extract text from PDF resumes
def _join_page_text(pages):
    """Concatenate the text of *pages*, skipping pages that yield no text."""
    page_texts = (page.extract_text() for page in pages)
    return "".join(page_text for page_text in page_texts if page_text)


def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    PyPDF2 is tried first. If it raises any ordinary exception, the file is
    re-read with pdfplumber instead.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The page texts joined without a separator; pages for which the
        parser returns no text are skipped, so the result may be "".

    Raises:
        Whatever pdfplumber raises when the fallback parser also fails.
    """
    try:
        # Pages are consumed while the file handle is still open.
        with open(file_path, 'rb') as file:
            return _join_page_text(PyPDF2.PdfReader(file).pages)
    except Exception:
        # Only ordinary errors trigger the fallback. A bare ``except:`` would
        # also trap KeyboardInterrupt/SystemExit and quietly keep going.
        with pdfplumber.open(file_path) as pdf:
            return _join_page_text(pdf.pages)

# Step 2: Preprocess text (cleaning and tokenization)
def clean_text(text):
    """Normalise raw resume text into a space-separated string of tokens.

    The text is lowercased, every run of non-word characters becomes a single
    space, digit runs are deleted, and whitespace-separated tokens found in
    STOPWORDS are dropped.
    """
    lowered = text.lower()
    without_symbols = re.sub(r'\W+', ' ', lowered)
    without_digits = re.sub(r'\d+', '', without_symbols)
    kept_tokens = (token for token in without_digits.split() if token not in STOPWORDS)
    return ' '.join(kept_tokens)

# Step 4: Load and preprocess dataset of resumes from multiple folders
def load_resumes_from_folders(base_folder):
    """Collect cleaned resume texts and their labels from a folder tree.

    Every PDF found under *base_folder* (searched recursively) is converted
    to text with extract_text_from_pdf() and normalised with clean_text().
    The name of the directory that directly contains the file is used as its
    label.

    Args:
        base_folder: Root directory of the dataset.

    Returns:
        A ``(resumes, labels)`` tuple of two parallel lists: the cleaned
        texts and the corresponding folder names. PDFs that cannot be read
        or that contain no text are left out.
    """
    resumes = []
    labels = []

    # Walk through all folders and files
    for root, dirs, files in os.walk(base_folder):
        # Sorting in place makes os.walk descend in a stable order, so the
        # returned lists (and the seeded train/test split built from them)
        # do not depend on filesystem enumeration order.
        dirs.sort()
        job_title = os.path.basename(root)  # Folder name is the job title
        print(f"Processing folder: {root}")  # Check folder path
        for file_name in sorted(files):
            # Case-insensitive match so "CV.PDF" is not silently ignored.
            if not file_name.lower().endswith(".pdf"):
                continue

            file_path = os.path.join(root, file_name)
            print(f"Processing resume: {file_path}")  # Print file being processed
            try:
                resume_text = extract_text_from_pdf(file_path)
            except Exception as exc:
                # One corrupt file should not abort loading the whole
                # dataset; report it visibly and move on.
                print(f"Skipping unreadable resume {file_path}: {exc!r}")
                continue

            if resume_text.strip():  # Ensure resume is not empty
                resumes.append(clean_text(resume_text))
                labels.append(job_title)  # Label based on folder name

    print(f"\nTotal Resumes Processed: {len(resumes)}")
    return resumes, labels

# Step 5: Train and save the model using XGBoost
def train_and_save_model(base_folder, model_path, vectorizer_path,
                         label_encoder_path="label_encoder.pkl"):
    """Train an XGBoost resume classifier and persist it with its preprocessors.

    Pipeline: load resumes -> 80/20 train/test split -> TF-IDF (fit on the
    training texts only) -> integer-encode the labels -> SMOTE oversampling
    of the training set -> 5-fold grid search over XGBoost hyper-parameters
    -> evaluation on the held-out test set -> dump the artifacts with joblib.

    Args:
        base_folder: Dataset root passed to load_resumes_from_folders().
        model_path: Output file for the best XGBClassifier.
        vectorizer_path: Output file for the fitted TfidfVectorizer.
        label_encoder_path: Output file for the fitted LabelEncoder. The
            saved model predicts integer class ids; this encoder's
            ``inverse_transform`` maps them back to job titles.

    Raises:
        ValueError: If no resumes could be loaded from *base_folder*.
    """
    # Load resumes and their corresponding job titles
    resumes, labels = load_resumes_from_folders(base_folder)
    if not resumes:
        raise ValueError(f"No readable PDF resumes found under {base_folder!r}")

    # Split the data into training and test sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(resumes, labels, test_size=0.2, random_state=42)

    # Convert resume text to TF-IDF features
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  # Increased features, bi-grams
    X_train_tfidf = vectorizer.fit_transform(X_train)

    # XGBClassifier expects class labels encoded as integers 0..n_classes-1
    # (recent releases reject string labels). The encoder is fitted on the
    # training labels only, so the ids are contiguous for the classes the
    # model actually sees.
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # Use SMOTE to oversample the minority classes
    # NOTE(review): resampling happens before cross-validation, so synthetic
    # samples can land in validation folds and the CV score printed below is
    # likely optimistic; the held-out test score is the one to trust.
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train_encoded)

    # Define the parameter grid for tuning
    param_grid = {
        'n_estimators': [200, 300, 400],
        'max_depth': [10, 15, 20],
        'learning_rate': [0.1, 0.05, 0.01],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'subsample': [0.7, 0.8, 0.9]
    }

    # Initialize the XGBoost classifier
    xgb = XGBClassifier(objective='multi:softmax', random_state=42)

    # Perform Grid Search
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)
    grid_search.fit(X_train_smote, y_train_smote)

    # Use the best model from the grid search
    best_xgb = grid_search.best_estimator_

    # Print the best parameters and accuracy
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Accuracy from CV: {grid_search.best_score_:.2f}")

    # Evaluate the best model on the test set. Predictions are decoded back
    # to job titles so they can be compared with the string test labels,
    # including titles that never appeared in the training split.
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred_encoded = best_xgb.predict(X_test_tfidf).astype(int)
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    # Display metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nFinal Test Accuracy: {accuracy * 100:.2f}%")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    # The same explicit, sorted label list drives both the matrix rows and
    # columns and the tick labels; an unordered set(y_test) would not be
    # guaranteed to line up with the matrix.
    print("\nConfusion Matrix:")
    class_names = sorted(set(y_test) | set(y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=class_names)
    plt.figure(figsize=(12, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

    # Save the best trained model, the vectorizer and the label encoder
    joblib.dump(best_xgb, model_path)
    joblib.dump(vectorizer, vectorizer_path)
    joblib.dump(label_encoder, label_encoder_path)

    print(f"Model saved to {model_path}")
    print(f"Vectorizer saved to {vectorizer_path}")
    print(f"Label encoder saved to {label_encoder_path}")
# Script entry point: train the classifier and write its artifacts to disk
if __name__ == "__main__":
    # Dataset root (replace with the path to your own dataset)
    base_folder = "/kaggle/input/resume-dataset/data/data"
    # Output files for the fitted model and the TF-IDF vectorizer
    model_path = "xgb_resume_classifier_model.pkl"
    vectorizer_path = "tfidf_vectorizer.pkl"

    train_and_save_model(
        base_folder=base_folder,
        model_path=model_path,
        vectorizer_path=vectorizer_path,
    )

    print("\nModel training and saving completed.")

