# Resume Parsing

In [1]:
import fitz  # PyMuPDF
import spacy
from spacy.matcher import PhraseMatcher

1. Using PhraseMatcher

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Parameters:
    pdf_path (str): Path of the PDF file to read.

    Returns:
    str or None: Text of all pages in page order, or None when the file
    cannot be opened or read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even when extracting a
        # page fails, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately best-effort: callers test the result for None.
        print(f"Error opening or reading PDF: {e}")
        return None

def extract_skills_and_points(text):
    """Find known skill phrases and root verbs in resume text using spaCy.

    Parameters:
    text (str): Raw resume text.

    Returns:
    tuple: (skills, key_points) where
        skills (list of str): Every occurrence of a known skill phrase, in
            the casing used in the text (duplicates are kept).
        key_points (list of str): Text of each token tagged as a VERB whose
            dependency label is ROOT.
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    # Define some common skills to search for
    skill_phrases = [
        "machine learning", "deep learning", "data analysis", "python", "java",
        "project management", "communication", "teamwork", "sql", "excel"
    ]

    # Match on the lowercase form of each token: the default matcher compares
    # exact text, so "Python" or "SQL" would never match the lowercase patterns.
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    # make_doc only tokenizes, which is all a phrase pattern needs; running
    # the full pipeline on every pattern is wasted work.
    patterns = [nlp.make_doc(skill) for skill in skill_phrases]
    matcher.add("SKILLS", patterns)

    # Use matcher to find skills
    skills = [doc[start:end].text for _, start, end in matcher(doc)]

    # Extract key points
    key_points = [
        token.text
        for token in doc
        if token.pos_ == 'VERB' and token.dep_ == 'ROOT'
    ]

    return skills, key_points

if __name__ == "__main__":
    # Point this at the resume PDF you want to parse
    pdf_path = '/content/resume.pdf'

    resume_text = extract_text_from_pdf(pdf_path)

    # Nothing to report when the PDF could not be read or had no text
    if resume_text:
        skills, key_points = extract_skills_and_points(resume_text)

        # Each section is a heading followed by one entry per line
        for heading, entries in (("Skills:", skills), ("\nKey Points:", key_points)):
            print(heading)
            for entry in entries:
                print(entry)

Error opening or reading PDF: no such file: '/content/resume.pdf'


2. Using a Trie (prefix tree)

In [3]:
class TrieNode:
    """One node of the trie: a single character position within stored phrases."""

    def __init__(self):
        # Maps the next character to the child node reached by it
        self.children = {}
        # True when the path from the root to this node spells a stored phrase
        self.is_end_of_phrase = False

class Trie:
    """Character-level prefix tree used to look up known phrases inside text."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, phrase):
        """Store a phrase in the trie.

        Parameters:
        phrase (str): Phrase to add; it is stored exactly as given.
        """
        node = self.root
        for char in phrase:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_phrase = True

    def search(self, text):
        """Return every stored phrase that occurs in text.

        Matching is character-based and case-sensitive, with no word-boundary
        check, so a phrase is also reported when it appears inside a longer
        word. One entry is produced per occurrence, ordered by start position
        and then by phrase length.

        Parameters:
        text (str): Text to scan.

        Returns:
        list of str: The matched phrases (possibly with repeats).
        """
        matched_phrases = []
        text_length = len(text)
        # Try every start position. The previous index arithmetic advanced by
        # two after a partial match, silently skipping the next start position
        # (e.g. "ppython" never reported "python").
        for start in range(text_length):
            node = self.root
            for end in range(start, text_length):
                node = node.children.get(text[end])
                if node is None:
                    break
                if node.is_end_of_phrase:
                    matched_phrases.append(text[start:end + 1])
        return matched_phrases

In [4]:

# Usage of Trie in the main script
def extract_skills_and_points(text):
    """Find known skill phrases with a Trie and root verbs with spaCy.

    Parameters:
    text (str): Raw resume text.

    Returns:
    tuple: (skills, key_points) where
        skills (list of str): Lowercase skill phrases found by scanning the
            lowercased text with the trie, one entry per occurrence.
        key_points (list of str): Text of each token tagged as a VERB whose
            dependency label is ROOT.
    """
    pipeline = spacy.load('en_core_web_sm')
    parsed = pipeline(text)

    # Vocabulary of skills to look for
    skill_phrases = [
        "machine learning", "deep learning", "data analysis", "python", "java",
        "project management", "communication", "teamwork", "sql", "excel"
    ]

    # Index the vocabulary in a trie, lowercased to match the lowercased text
    trie = Trie()
    for entry in skill_phrases:
        trie.insert(entry.lower())

    # Scan a lowercased copy of the text so casing in the resume is irrelevant
    skills = trie.search(text.lower())

    # Root verbs of the parsed sentences serve as the key points
    key_points = [
        tok.text
        for tok in parsed
        if tok.pos_ == 'VERB' and tok.dep_ == 'ROOT'
    ]

    return skills, key_points

if __name__ == "__main__":
    # Point this at the resume PDF you want to parse
    pdf_path = '/content/resume.pdf'

    resume_text = extract_text_from_pdf(pdf_path)

    # Nothing to report when the PDF could not be read or had no text
    if resume_text:
        skills, key_points = extract_skills_and_points(resume_text)

        # Each section is a heading followed by one entry per line
        for heading, entries in (("Skills:", skills), ("\nKey Points:", key_points)):
            print(heading)
            for entry in entries:
                print(entry)


Error opening or reading PDF: no such file: '/content/resume.pdf'


#### Job Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense




In [31]:
# Read the cleaned dataset from disk
path = "Data/cleaned_df.csv"
dataset = pd.read_csv(path)

# Feature groups for the recommendation system: categorical and numerical
# column names (both are left empty here)
cat_cols, num_cols = [], []

# Column(s) holding the label to predict
target = ["Job Title"]

In [None]:
class JobRecommendationSystem:
    """Job-title classifier trained on categorical, numerical and skill features.

    Intended call order: preprocess_data() -> build_model() -> train_model()
    -> evaluate_model() / make_predictions(). Later steps read attributes
    (self.X_train, self.model, ...) that the earlier steps create.

    Parameters:
    data (pandas.DataFrame): Source table containing every referenced column.
    cat_features (list): Names of categorical columns to one-hot encode.
    num_features (list): Names of numerical columns to standardize.
    target_var (list or str): Column(s) holding the job title label.
    skills_col (str): Name of the column holding each row's skills.
        Each cell must be an iterable of hashable skills (e.g. a list of
        strings). NOTE(review): lists written to CSV usually come back as
        plain strings - confirm the column is parsed before use.
    """

    def __init__(self, data, cat_features, num_features, target_var, skills_col = "skills"):
        self.data = data
        self.cat_features = cat_features
        self.num_features = num_features
        self.target_var = target_var
        self.skills_col = skills_col
        self.encoder = self._make_dense_encoder()
        self.scaler = StandardScaler()
        self.job_title_encoder = self._make_dense_encoder()

    @staticmethod
    def _make_dense_encoder():
        """Build a OneHotEncoder that returns dense arrays on old and new scikit-learn."""
        try:
            # `sparse` was renamed to `sparse_output` and later removed
            return OneHotEncoder(sparse_output = False)
        except TypeError:
            # Older releases only know the original keyword
            return OneHotEncoder(sparse = False)

    def preprocess_data(self):
        """Build the feature matrix and one-hot labels, then split train/test.

        Sets self.X, self.y, self.X_train, self.X_test, self.y_train and
        self.y_test. The split is 80/20 with random_state 42.

        NOTE(review): the encoders and scaler are fitted on the full dataset
        before the split, so test rows influence the fitted statistics.
        """
        feature_frames = []

        # Encode categorical features (skipped when none are configured,
        # because the encoder rejects a zero-column input)
        if len(self.cat_features) > 0:
            encoded_features = self.encoder.fit_transform(self.data[self.cat_features])
            feature_frames.append(
                pd.DataFrame(encoded_features, columns = self.encoder.get_feature_names_out())
            )

        # Normalize numerical features (same zero-column guard)
        if len(self.num_features) > 0:
            numerical_features = self.scaler.fit_transform(self.data[self.num_features])
            feature_frames.append(pd.DataFrame(numerical_features, columns = self.num_features))

        # Multi-hot encode the skills: one column per distinct skill.
        # Sets give constant-time membership checks; sorting makes the column
        # order reproducible across runs (plain set order is not).
        skill_sets = [set(row_skills) for row_skills in self.data[self.skills_col]]
        all_skills = sorted(set().union(*skill_sets), key = str)
        skill_matrix = np.array(
            [[skill in owned for skill in all_skills] for owned in skill_sets],
            dtype = "float32",
        ).reshape(len(skill_sets), len(all_skills))
        feature_frames.append(pd.DataFrame(skill_matrix, columns = all_skills))

        # Combine all features. A single float dtype is required: mixing bool
        # and float columns yields an object array that Keras cannot convert.
        self.X = pd.concat(feature_frames, axis = 1).astype("float32")

        # Encode the Job Title (a bare column name is wrapped so the encoder
        # always receives a 2-D selection)
        target_cols = [self.target_var] if isinstance(self.target_var, str) else self.target_var
        self.y = self.job_title_encoder.fit_transform(self.data[target_cols])

        # Split the data into training and test sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size = 0.2, random_state = 42
        )

    def build_model(self):
        """Create and compile the classifier; requires preprocess_data() first.

        Architecture: input -> Dense(64, relu) -> Dense(32, relu) ->
        Dense(number of one-hot label columns, softmax).
        """
        # Define the neural network model using the Functional API
        input_layer = Input(shape = (self.X_train.shape[1],))
        dense_layer_1 = Dense(64, activation = "relu")(input_layer)
        dense_layer_2 = Dense(32, activation = "relu")(dense_layer_1)
        output_layer = Dense(self.y.shape[1], activation = "softmax")(dense_layer_2)

        self.model = Model(inputs = input_layer, outputs = output_layer)
        self.model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])

    def train_model(self, epochs = 10, batch_size = 10):
        """Fit the model on the training split, holding out 20% for validation.

        Parameters:
        epochs (int): Number of passes over the training data.
        batch_size (int): Samples per gradient update.

        Returns:
        The object returned by model.fit (Keras training history).
        """
        return self.model.fit(
            self.X_train, self.y_train,
            epochs = epochs, batch_size = batch_size, validation_split = 0.2
        )

    def evaluate_model(self):
        """Evaluate on the test split, print the accuracy and return the metrics.

        Returns:
        tuple: (loss, accuracy) as reported by model.evaluate.
        """
        loss, accuracy = self.model.evaluate(self.X_test, self.y_test)
        print(f"Test Accuracy: {accuracy}")
        return loss, accuracy

    def make_predictions(self):
        """Predict job titles for the test split.

        Returns:
        The label encoder's inverse transform of the model's predicted
        probabilities for self.X_test.
        """
        predictions = self.model.predict(self.X_test)
        predicted_classes = self.job_title_encoder.inverse_transform(predictions)
        return predicted_classes

#### Salary Prediction

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import keras

In [None]:
class Linear_Model:
    """Fully connected Keras regression model with configurable hidden layers."""

    def __init__(self, input_dim, hidden_layers, output_dim):
        """
        Parameters:
        input_dim (int): Number of the input features
        hidden_layers (list): Number of units in each hidden layer, in order.
            May be empty (or None), which yields a purely linear model.
        output_dim (int): Number of the output units
        """
        self.model = Sequential()

        # Explicit input layer: declaring the shape here (instead of passing
        # `input_dim` to the first Dense layer) also lets the model be built
        # when there are no hidden layers at all.
        self.model.add(tf.keras.Input(shape = (input_dim,)))

        # Hidden layers
        for units in (hidden_layers or []):
            self.model.add(Dense(units, activation = "relu"))

        # Output layer
        self.model.add(Dense(output_dim, activation = "linear"))

        # Compile the model
        self.model.compile(optimizer = "adam", loss = "mse", metrics = ["mse"])

    def train(self, X_train, y_train, epochs = 10, batch_size = 32):
        """Fit the model and return the object produced by model.fit.

        Parameters:
        X_train: Training features.
        y_train: Training targets.
        epochs (int): Number of passes over the training data.
        batch_size (int): Samples per gradient update.
        """
        return self.model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size)

    def evaluate(self, X_test, y_test):
        """Return the result of model.evaluate on the given data (loss and the mse metric)."""
        return self.model.evaluate(X_test, y_test)

    def predict(self, X):
        """Return the model's predictions for X."""
        return self.model.predict(X)