In [5]:
pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.




In [13]:
import pandas as pd
import json
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import WordNetLemmatizer
import joblib
from fuzzywuzzy import fuzz
from collections import defaultdict

# Load dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

# Best Random Forest model
best_rf_classifier = grid_search_rf.best_estimator_

# Train other classifiers without hyperparameter tuning
gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

# predictions
rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
# predictions (continued)
svm_predictions = svm_classifier.predict(X_test)

# model performance
rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')

# Predict domains for the original dataset
data["predicted_domain"] = best_rf_classifier.predict(vectorizer.transform(data["cleaned_text"]))

# Save the dataset with predictions to a new CSV file
data.to_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv", index=False)


import json
import pandas as pd
import os
import glob
from fuzzywuzzy import fuzz
from collections import defaultdict

# Function to process each resume JSON and flatten the data
# Function to process each resume JSON and flatten the data
def process_resume(resume_data):
    flattened_data = {
        "Name": resume_data.get("Name", ""),
        "Email": resume_data.get("Contact", {}).get("Email", "").lower(),
        "Phone": resume_data.get("Contact", {}).get("Phone", ""),
        "LinkedIn": resume_data.get("Contact", {}).get("LinkedIn", ""),
        "Github": resume_data.get("Contact", {}).get("GitHub", ""),
        "Degree": resume_data["Education"].get("Degree", ""),
        "Major": resume_data["Education"].get("Major", ""),
        "Year": resume_data["Education"].get("Graduation Year", ""),
        "CGPA": resume_data["Education"].get("CGPA", ""),
    }

    # Combine all experiences into one cell
    experiences = [{"role": exp.get("Role", ""), "company": exp.get("Company", ""), "duration": exp.get("Duration", ""), "description": exp.get("Description", "")} for exp in resume_data.get("UserExperience", [])]
    flattened_data["Experiences"] = json.dumps(experiences)

    # Combine all projects into one cell
    projects = [{"title": project.get("Title", ""), "duration": project.get("Duration", ""), "description": project.get("Description", "")} for project in resume_data.get("Projects", [])]
    flattened_data["Projects"] = json.dumps(projects)

    # Combine all achievements into one cell
    achievements = [{"achievement": achievement} for achievement in resume_data.get("Achievements", [])]
    flattened_data["Achievements"] = json.dumps(achievements)

    # Combine all certifications into one cell if it is a list
    if isinstance(resume_data.get("Certifications"), list):
        certifications = [{"certification": cert} for cert in resume_data.get("Certifications", [])]
    else:
        certifications = []
    flattened_data["Certifications"] = json.dumps(certifications)

    # Combine all hard skills into one cell
    hard_skills = [{"skill": skill, "percentage": percentage} for skill, percentage in resume_data.get("Skills", {}).get("Hard Skills", {}).items()]
    flattened_data["HardSkills"] = json.dumps(hard_skills)

    # Combine all soft skills into one cell
    soft_skills = [{"skill": skill, "percentage": percentage} for skill, percentage in resume_data.get("Skills", {}).get("Soft Skills", {}).items()]
    flattened_data["SoftSkills"] = json.dumps(soft_skills)

    # Combine all recommended job domains into one cell
    recommended_job_domains = [{"job_domain": job_domain} for job_domain in resume_data.get("Recommended_Job_Domains", [])]
    flattened_data["RecommendedJobDomains"] = json.dumps(recommended_job_domains)

    return flattened_data


# Function to read multiple resume JSON files and convert them to a consolidated CSV
def convert_resumes_to_csv(directory_path, output_csv):
    all_resumes = []
    txt_files = glob.glob(os.path.join(directory_path, "*.txt"))

    for file_path in txt_files:
        with open(file_path, 'r') as f:
            try:
                resume_data = json.load(f)
                flattened_data = process_resume(resume_data)
                all_resumes.append(flattened_data)
            except json.JSONDecodeError:
                print(f"Error decoding JSON from file: {file_path}")

    if all_resumes:
        df = pd.DataFrame(all_resumes)
        df.to_csv(output_csv, index=False)
        print(f"Data has been converted to CSV format and saved as '{output_csv}'")
    else:
        print("No valid resume data found.")

# Directory containing the text files with resume JSON data
# Update the path below to point to your specific folder in the Downloads directory
directory_path = "C:/Users/hsahn/OneDrive/Desktop/resume data (json)"  # For Windows
# directory_path = "/Users/YourUsername/Downloads/YourFolderName"  # For macOS
# directory_path = "/home/YourUsername/Downloads/YourFolderName"  # For Linux

# Output CSV file
output_csv = "C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv"

# Convert all resumes to CSV
convert_resumes_to_csv(directory_path, output_csv)


# Load the pre-trained models and vectorizer
best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
gbm_classifier = joblib.load('gbm_classifier.joblib')
svm_classifier = joblib.load('svm_classifier.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')

# Load the dataset with predictions
jobs_df = pd.read_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv")

candidates_df = pd.read_csv("C:/Users/hsahn/OneDrive/Desktop/all_resumes_data.csv")

def parse_job_domains(json_str):
    try:
        json_data = json.loads(json_str)
        if isinstance(json_data, list):
            domains = [item.get('job_domain', 'Other') for item in json_data]
            return domains
    except (json.JSONDecodeError, TypeError):
        return ['Other']

candidates_df['parsed_domains'] = candidates_df['RecommendedJobDomains'].apply(parse_job_domains)

candidates_exploded_df = candidates_df.explode('parsed_domains')

jobs_df['predicted_domain'] = jobs_df['predicted_domain'].str.strip().str.lower()
candidates_exploded_df['parsed_domains'] = candidates_exploded_df['parsed_domains'].str.strip().str.lower()

print("Unique predicted domains in jobs_df:", jobs_df['predicted_domain'].unique())
print("Unique parsed domains in candidates_exploded_df:", candidates_exploded_df['parsed_domains'].unique())

# Matching candidates to jobs based on fuzzy matching of domains
# Matching candidates to jobs based on fuzzy matching of domains
matched_candidates = defaultdict(list)

for job_domain in jobs_df['predicted_domain'].unique():
    for candidate_domain in candidates_exploded_df['parsed_domains'].unique():
        # Convert the domains to strings before calculating the similarity ratio
        job_domain_str = str(job_domain)
        candidate_domain_str = str(candidate_domain)
        
        if fuzz.ratio(job_domain_str, candidate_domain_str) >= 80:
            matched_candidates[job_domain].append(candidate_domain)

# Creating a DataFrame to store matched jobs and candidates
matched_data = []

for job_domain, candidate_domains in matched_candidates.items():
    for candidate_domain in candidate_domains:
        job_matches = jobs_df[jobs_df['predicted_domain'] == job_domain]
        candidate_matches = candidates_exploded_df[candidates_exploded_df['parsed_domains'] == candidate_domain]

        for _, job_row in job_matches.iterrows():
            for _, candidate_row in candidate_matches.iterrows():
                matched_data.append({
                    'company_name': job_row['company_name'],
                    'role_title': job_row['role_title'],
                    'role_description': job_row['role_description'],
                    'predicted_domain': job_row['predicted_domain'],
                    'Name': candidate_row['Name'],
                    'Email': candidate_row['Email'],
                    'Experiences': candidate_row['Experiences'],
                    'parsed_domains': candidate_row['parsed_domains']
                })

# Creating DataFrame from matched data
matched_df = pd.DataFrame(matched_data)

# Saving the matched jobs and candidates to a CSV file
matched_df.to_csv("C:/Users/hsahn/OneDrive/Desktop/matched_jobs_candidates.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest Accuracy: 0.9945054945054945
Gradient Boosting Machine Accuracy: 0.978021978021978
Support Vector Machine Accuracy: 0.9945054945054945
Confusion Matrix for Random Forest:
[[20  0  0  0  0  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0 20]]
Confusion Matrix for Gradient Boosting Machine:
[[17  0  0  1  2  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0 20]]
Confusion Matrix for Support Vector Machine:
[[20  0  0  0  0  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0