# Resume Parsing

In [1]:
import fitz  # PyMuPDF
import spacy
from spacy.matcher import PhraseMatcher

1. Using PhraseMatcher

In [2]:
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error opening or reading PDF: {e}")
        return None

def extract_skills_and_points(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    skills = []
    key_points = []

    # Define some common skills to search for
    skill_phrases = [
        "machine learning", "deep learning", "data analysis", "python", "java",
        "project management", "communication", "teamwork", "sql", "excel"
    ]
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp(skill) for skill in skill_phrases]
    matcher.add("SKILLS", patterns)

    # Use matcher to find skills
    matches = matcher(doc)
    for match_id, start, end in matches:
        skills.append(doc[start:end].text)

    # Extract key points
    for token in doc:
        if token.pos_ == 'VERB' and token.dep_ == 'ROOT':
            key_points.append(token.text)

    return skills, key_points

if __name__ == "__main__":
    # Replace with your actual path to the resume PDF file
    pdf_path = '/content/resume.pdf'

    resume_text = extract_text_from_pdf(pdf_path)

    if resume_text:
        skills, key_points = extract_skills_and_points(resume_text)

        print("Skills:")
        for skill in skills:
            print(skill)

        print("\nKey Points:")
        for point in key_points:
            print(point)

Error opening or reading PDF: no such file: '/content/resume.pdf'


2. Using BST

In [3]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_phrase = False

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, phrase):
        node = self.root
        for char in phrase:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_phrase = True

    def search(self, text):
        node = self.root
        matched_phrases = []
        i = 0
        while i < len(text):
            char = text[i]
            if char in node.children:
                temp_node = node
                temp_phrase = ''
                while i < len(text) and char in temp_node.children:
                    temp_phrase += char
                    temp_node = temp_node.children[char]
                    if temp_node.is_end_of_phrase:
                        matched_phrases.append(temp_phrase)
                    i += 1
                    if i < len(text):
                        char = text[i]
                i -= len(temp_phrase) - 1
            i += 1
        return matched_phrases

In [4]:

# Usage of Trie in the main script
def extract_skills_and_points(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    skills = []
    key_points = []

    # Define some common skills to search for
    skill_phrases = [
        "machine learning", "deep learning", "data analysis", "python", "java",
        "project management", "communication", "teamwork", "sql", "excel"
    ]

    # Build the trie
    trie = Trie()
    for phrase in skill_phrases:
        trie.insert(phrase.lower())

    # Use trie to find skills
    lower_text = text.lower()
    skills = trie.search(lower_text)

    # Extract key points
    for token in doc:
        if token.pos_ == 'VERB' and token.dep_ == 'ROOT':
            key_points.append(token.text)

    return skills, key_points

if __name__ == "__main__":
    # Replace with your actual path to the resume PDF file
    pdf_path = '/content/resume.pdf'

    resume_text = extract_text_from_pdf(pdf_path)

    if resume_text:
        skills, key_points = extract_skills_and_points(resume_text)

        print("Skills:")
        for skill in skills:
            print(skill)

        print("\nKey Points:")
        for point in key_points:
            print(point)


Error opening or reading PDF: no such file: '/content/resume.pdf'


# Re-Sampling

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense




In [2]:
dataset = pd.read_csv("Data/data_after_feature_eng.csv")

In [3]:
dataset["Job Title"].unique().shape[0]

147

In [4]:
X = dataset.drop(columns=["Job Title"])
y = dataset["Job Title"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [8]:
from collections import Counter

In [9]:
print(f"Original dataset shape: {Counter(y_train)}")
print(f"Resampled dataset shape: {Counter(y_train_resampled)}")

Original dataset shape: Counter({'UX/UI Designer': 14642, 'Digital Marketing Specialist': 8310, 'Software Engineer': 8283, 'Network Engineer': 7413, 'Software Tester': 6373, 'Procurement Manager': 6311, 'Executive Assistant': 6291, 'Financial Advisor': 6257, 'Social Media Manager': 5384, 'Sales Representative': 5341, 'Network Administrator': 5322, 'Purchasing Agent': 5309, 'Administrative Assistant': 5254, 'Data Analyst': 5242, 'Procurement Specialist': 5235, 'Systems Administrator': 5230, 'Event Planner': 5203, 'HR Coordinator': 5197, 'Customer Support Specialist': 5157, 'Customer Success Manager': 4267, 'UI Developer': 4256, 'Legal Assistant': 4251, 'Account Manager': 4232, 'Operations Manager': 4228, 'Marketing Manager': 4224, 'Graphic Designer': 4215, 'Content Writer': 4213, 'Landscape Architect': 4206, 'Litigation Attorney': 4206, 'Project Manager': 4185, 'Project Coordinator': 4180, 'Investment Banker': 4167, 'Marketing Analyst': 4159, 'Mechanical Engineer': 4151, 'Architect': 41

In [10]:
X = X_train_resampled.copy() 
y = y_train_resampled.copy()

In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [12]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1937136, 67), (215238, 67), (1937136, 147), (215238, 147))

In [15]:
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [17]:
y_train = np.array(y_train)
y_test = np.array(y_test)
num_classes = len(np.unique(y_train))
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

#### Job Recommendation system

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense




In [31]:
# Load your dataset
path = "Data/cleaned_df.csv"
dataset = pd.read_csv(path)

# Define features and target for recommendation system
cat_cols = []
num_cols = []

target = ["Job Title"]

In [None]:
class JobRecommendationSystem:
    def __init__(self, data, cat_features, num_features, target_var):
        self.data = data
        self.cat_features = cat_features
        self.num_features = num_features
        self.target_var = target_var
        self.encoder = OneHotEncoder(sparse = False)
        self.scaler = StandardScaler()
        self.job_title_encoder = OneHotEncoder(sparse = False)
        
    def preprocess_data(self):
        # Encode categorical features
        encoded_features = self.encoder.fit_transform(self.data[self.cat_features])
        encoded_df = pd.DataFrame(encoded_features, columns = self.encoder.get_feature_names_out())
        
        # Normalize numerical features
        numerical_features = self.scaler.fit_transform(self.data[self.num_features])
        numerical_df = pd.DataFrame(numerical_features, columns = self.num_features)
        
        # Flatten the skills list and encode
        # The skills column must be list of list
        all_skills = list(set(skill for sublist in self.data["skills"] for skill in sublist))
        skills_df = pd.DataFrame([[skill in sublist for skill in all_skills] for sublist in self.data["skills"]], columns = all_skills)
        
        # Combine all features
        self.X = pd.concat([encoded_df, numerical_df, skills_df], axis = 1)
        
        # Encode the Job Title
        self.y = self.job_title_encoder.fit_transform(self.data[self.target_var])
        
        # Split the data into training and test sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = 0.2, random_state = 42)

    def build_model(self):
        # Define the neural network model using the Functional API
        input_layer = Input(shape = (self.X_train.shape[1],))
        dense_layer_1 = Dense(64, activation = "relu")(input_layer)
        dense_layer_2 = Dense(32, activation = "relu")(dense_layer_1)
        output_layer = Dense(self.y.shape[1], activation = "softmax")(dense_layer_2)
        
        self.model = Model(inputs = input_layer, outputs = output_layer)
        self.model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])
    
    def train_model(self, epochs = 10, batch_size = 10):
        self.model.fit(self.X_train, self.y_train, epochs = epochs, batch_size = batch_size, validation_split = 0.2)
    
    def evaluate_model(self):
        loss, accuracy = self.model.evaluate(self.X_test, self.y_test)
        print(f"Test Accuracy: {accuracy}")
    
    def make_predictions(self):
        predictions = self.model.predict(self.X_test)
        predicted_classes = self.job_title_encoder.inverse_transform(predictions)
        return predicted_classes

In [2]:
dataset = pd.read_csv("Data/data_after_feature_eng.csv")

In [3]:
dataset["Job Title"].unique().shape[0]

147

In [4]:
dataset.shape

(696012, 68)

In [5]:
X = dataset.drop(columns=["Job Title"])
y = dataset["Job Title"]

In [6]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [7]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [7]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [10]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((626410, 67), (69602, 67), (626410,), (69602,))

In [11]:
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [12]:
y_train = np.array(y_train)
y_test = np.array(y_test)
num_classes = len(np.unique(y_train))
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

ValueError: invalid literal for int() with base 10: 'Network Engineer'

In [19]:
from imblearn.over_sampling import SMOTE

In [18]:
pd.DataFrame(dataset["Job Title"].value_counts())

Unnamed: 0_level_0,count
Job Title,Unnamed: 1_level_1
UX/UI Designer,20907
Digital Marketing Specialist,11970
Software Engineer,11912
Network Engineer,10500
Software Tester,9087
...,...
Procurement Coordinator,1468
Personal Assistant,1461
Financial Planner,1455
Supply Chain Analyst,1439


In [14]:
from tensorflow.keras.layers import Layer

In [15]:
class QuadDense(Layer):

  def __init__(self, units = 32, activation = None):
    super(QuadDense, self).__init__()
    self.units = units
    self.activation = tf.keras.activations.get(activation)

  def build(self, input_shape):
    a_init = tf.random_normal_initializer()
    b_init = tf.random_normal_initializer()
    c_init = tf.zeros_initializer()

    self.a = tf.Variable(initial_value = a_init(shape = (input_shape[-1], self.units),
                                                dtype = "float32"),
                         trainable = True,
                         name = "kernel")

    self.b = tf.Variable(initial_value = b_init(shape = (input_shape[-1], self.units),
                                                dtype = "float32"),
                         trainable = True,
                         name = "kernel")

    self.c = tf.Variable(initial_value = c_init(shape = (self.units, ),
                                                dtype = "float32"),
                         trainable = True,
                         name = "bias")

  def call(self, inputs):
    result = tf.matmul(tf.math.square(inputs), self.a) + tf.matmul(inputs, self.b) + self.c

    return self.activation(result)

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input

class MyModel(Model):
    def __init__(self, input_dim, num_classes):
        super(MyModel, self).__init__()
        self.dense1 = Dense(128, activation='relu')
        self.dense2 = Dense(1024, activation='relu')
        self.dense3 = Dense(1024, activation='relu')
        self.dense4 = Dense(1024, activation='relu')
        self.dense5 = Dense(2056, activation='relu')
        self.dense6 = Dense(2056, activation='relu')
        self.output_layer = Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dense4(x)
        x = self.dense5(x)
        x = self.dense6(x)
        return self.output_layer(x)

# Define the input dimension and number of classes
input_dim = 67  # Example input dimension

# Create an instance of the model
model = MyModel(input_dim = input_dim, num_classes = num_classes)

# Build the model by providing an input shape
model.build((None, input_dim))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()





Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  8704      
                                                                 
 dense_1 (Dense)             multiple                  132096    
                                                                 
 dense_2 (Dense)             multiple                  1049600   
                                                                 
 dense_3 (Dense)             multiple                  1049600   
                                                                 
 dense_4 (Dense)             multiple                  2107400   
                                                                 
 dense_5 (Dense)             multiple                  4229192   
                                                                 
 dense_6 (Dense)             multiple                  

In [20]:
model.fit(X_train, y_train, epochs = 10, validation_split=0.2, batch_size = 2048)

Epoch 1/10


KeyboardInterrupt: 

In [23]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Conv1D, Flatten, BatchNormalization, Dropout, Attention, Add
from tensorflow.keras.models import Model

# Define a custom layer QuadDense
class QuadDense(Layer):
    def __init__(self, units=32, activation=None):
        super(QuadDense, self).__init__()
        self.units = units
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        a_init = tf.random_normal_initializer()
        b_init = tf.random_normal_initializer()
        c_init = tf.zeros_initializer()

        self.a = self.add_weight(shape=(input_shape[-1], self.units),
                                 initializer=a_init,
                                 trainable=True,
                                 name="a")

        self.b = self.add_weight(shape=(input_shape[-1], self.units),
                                 initializer=b_init,
                                 trainable=True,
                                 name="b")

        self.c = self.add_weight(shape=(self.units,),
                                 initializer=c_init,
                                 trainable=True,
                                 name="c")

    def call(self, inputs):
        result = tf.matmul(tf.math.square(inputs), self.a) + tf.matmul(inputs, self.b) + self.c
        return self.activation(result)

# Define a model that uses various layers for better performance
class MyModel(Model):
    def __init__(self, input_dim, num_classes):
        super(MyModel, self).__init__()
        self.conv1 = Conv1D(64, kernel_size=3, activation='relu', padding='same')
        self.conv2 = Conv1D(128, kernel_size=3, activation='relu', padding='same')
        self.flatten = Flatten()
        self.dense1 = Dense(128, activation='relu')
        self.batch_norm1 = BatchNormalization()
        self.dropout1 = Dropout(0.5)
        self.dense2 = Dense(1024, activation='relu')
        self.batch_norm2 = BatchNormalization()
        self.dropout2 = Dropout(0.5)
        
        # Adding attention layers
        self.attention1 = Attention()
        self.attention2 = Attention()
        
        self.dense3 = QuadDense(1024, activation='relu')
        self.dense4 = QuadDense(1024, activation='relu')
        self.dense5 = Dense(2056, activation='relu')
        self.batch_norm3 = BatchNormalization()
        self.dropout3 = Dropout(0.5)
        self.dense6 = Dense(2056, activation='relu')
        self.output_layer = Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.conv1(tf.expand_dims(inputs, axis=-1))  # Expanding dims to use Conv1D
        x = self.conv2(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.dense2(x)
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        
        # Applying attention
        query = tf.expand_dims(x, axis=1)  # Adding sequence dimension for Attention layer
        attention_output1 = self.attention1([query, query])
        x = Add()([x, tf.squeeze(attention_output1, axis=1)])
        
        attention_output2 = self.attention2([query, query])
        x = Add()([x, tf.squeeze(attention_output2, axis=1)])
        
        x = self.dense3(x)
        x = self.dense4(x)
        x = self.dense5(x)
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense6(x)
        return self.output_layer(x)

# Define the input dimension and number of classes
input_dim = 67  # Example input dimension

# Create an instance of the model
model = MyModel(input_dim=input_dim, num_classes=num_classes)

# Build the model by providing an input shape
model.build((None, input_dim))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

Model: "my_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           multiple                  256       
                                                                 
 conv1d_3 (Conv1D)           multiple                  24704     
                                                                 
 flatten_1 (Flatten)         multiple                  0         
                                                                 
 dense_12 (Dense)            multiple                  1097856   
                                                                 
 batch_normalization_3 (Bat  multiple                  512       
 chNormalization)                                                
                                                                 
 dropout_3 (Dropout)         multiple                  0         
                                                        

In [24]:
model.fit(X_train, y_train, epochs = 10, validation_split=0.2, batch_size = 1024)

Epoch 1/10
Epoch 2/10
 166/1514 [==>...........................] - ETA: 4:07 - loss: 4.9908 - accuracy: 0.0069

KeyboardInterrupt: 

#### Salary Prediction

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import keras

In [None]:
class Linear_Model:
    def __init__(self, input_dim, hidden_layers, output_dim):
        """
        Parameters:
        input_dim (int): Number of the input features
        hidden_layers (list): list of the number of units in the hidden layer
        output_dim: nNumber of the output units 
        """
        self.model = Sequential()
        
        # Input layer
        self.model.add(Dense(hidden_layers[0], input_dim = input_dim, activation = "relu"))
        
        # Hidden layers
        for units in hidden_layers[1:]:
            self.model.add(Dense(units, activation = "relu"))
        
        # Output layer
        self.model.add(Dense(output_dim, activation = "linear"))
        
        # Compile the model
        self.model.compile(optimizer = "adam", loss = "mse", metrics = ["mse"])
    
    def train(self, X_train, y_train, epochs = 10, batch_size = 32):
        self.model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size)
    
    def evaluate(self, X_test, y_test):
        return self.model.evaluate(X_test, y_test)
    
    def predict(self, X):
        return self.model.predict(X)