In [1]:
import fitz
import joblib
import random
from sklearn.utils import shuffle 
import os
import PyPDF2
from sklearn.ensemble import RandomForestClassifier
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Define a function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc[page_num]
        text += page.get_text()
    doc.close()
    return text

# Define a function to extract skills from text
def extract_skills(text, skills_list):
    extracted_skills = []
    for skill in skills_list:
        if skill.lower() in text.lower():
            extracted_skills.append(skill)
    return extracted_skills

# Create a list of job requirements or skills
job_requirements = ["python", "machine learning", "data analysis", "communication"]

# Create a list of resumes and extract text
resume_folder =  r'C:\Users\laptop zone\Downloads\trainResumes-20230819T080132Z-001\trainResumes'
resumes = []
for filename in os.listdir(resume_folder):
    if filename.endswith('.pdf'):
        resume_text = extract_text_from_pdf(os.path.join(resume_folder, filename))
        extracted_skills = extract_skills(resume_text, job_requirements)
        resumes.append({'filename': filename, 'text': resume_text, 'skills': extracted_skills})

# Create a DataFrame from the extracted data
resume_df = pd.DataFrame(resumes)



In [3]:
# Create a list of labels: 1 for the first 45 rows, 0 for the next 45 rows
labels = [1] * 45 + [0] * 45

# Assign the labels to the 'fit_for_job' column
resume_df['fit_for_job'] = labels


# Shuffle the DataFrame to randomize data
resume_df = shuffle(resume_df, random_state=42)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Fit and transform the resume text to TF-IDF vectors
X = tfidf_vectorizer.fit_transform(resume_df['text'])

# Define the target variable
y = resume_df['fit_for_job']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Ensure that your X_train and y_train are in the correct format (dense arrays)
X_train_dense = X_train.toarray() if hasattr(X_train, 'toarray') else X_train
y_train_dense = y_train.toarray().ravel() if hasattr(y_train, 'toarray') else y_train.ravel()

# Define a wider range of hyperparameters for Random Forest
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Use StratifiedKFold for cross-validation to handle class imbalances
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Perform Randomized Search Cross-Validation
random_search = RandomizedSearchCV(
    rf_classifier, 
    param_distributions=param_dist, 
    n_iter=100,  # Increase the number of iterations
    cv=cv,       # Use StratifiedKFold for cross-validation
    scoring='accuracy', 
    n_jobs=-1
)
random_search.fit(X_train_dense, y_train_dense)

# Print the best parameters and corresponding accuracy
print("Best Parameters: ", random_search.best_params_)
print("Best Accuracy: {:.2f}%".format(random_search.best_score_ * 100))



Best Parameters:  {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
Best Accuracy: 72.48%


In [6]:
# Save the best trained model to a file
best_rf_model = random_search.best_estimator_
joblib.dump(best_rf_model, 'rf_model.pkl')



['rf_model.pkl']