<a href="https://colab.research.google.com/github/VemPrava/FMML_Projects_and_Labs/blob/main/Pravallika_Module_8_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from os import stat
import typing

import numpy as np

import typing


class BagOfWords:
    """
    Bag-of-words encoder over a fixed vocabulary.

    The vocabulary is the set of unique words seen at construction time,
    stored in the order returned by ``np.unique``. Words can be mapped to
    integer indices (``encode_data``), mapped back (``decode_data``), or
    turned into per-word count vectors (``get_counts`` / calling the instance).
    """

    def __init__(self, data: typing.Iterable) -> None:
        """
        Generate the bag of words
        :param data: an array of words, or an iterable containing arrays of words
        """
        data = np.array(self.__linearize_array(data))
        # index -> word lookup table and its inverse.
        self.index_to_words = np.unique(data)
        self.words_to_index = {w: i for i, w in enumerate(self.index_to_words)}

    @classmethod
    def __linearize_array(cls, text):
        """
        Flatten an arbitrarily nested iterable of strings into one flat list.
        :param text: a string iterable, possibly nested
        :return: list of all strings in iteration order
        """
        x = []
        for item in text:
            if isinstance(item, str):
                x.append(item)
            else:
                x.extend(cls.__linearize_array(item))
        return x

    def __call__(self, text: typing.Iterable[str]) -> np.ndarray:
        """Shorthand for ``get_counts(text)``."""
        return self.get_counts(text)

    def __len__(self) -> int:
        """Return the vocabulary size."""
        return len(self.index_to_words)

    def encode_data(
        self: "BagOfWords",
        text: typing.Union[typing.Iterable[str], typing.Iterable[typing.Iterable[str]]],
    ) -> list:
        """
        Compute the encodings of words in a new input tokenized string
        :param text: tokens, or a nested iterable of tokens
        :return: list of vocabulary indices mirroring the nesting of ``text``;
            words that are not in the vocabulary are dropped
        """
        x = []
        for item in text:
            if isinstance(item, str):
                if item in self.words_to_index:
                    x.append(self.words_to_index[item])
            else:
                x.append(self.encode_data(item))
        return x

    def decode_data(self: "BagOfWords", encoded_text: typing.Iterable[int]):
        """
        Map vocabulary indices back to words.
        :param encoded_text: a single index, or a (possibly nested) iterable of indices
        :return: the word for a single index, otherwise a list mirroring the input nesting
        """
        # np.integer covers every NumPy integer width, not only int64, so an
        # int32 index is looked up instead of being iterated over.
        if isinstance(encoded_text, (int, np.integer)):
            return self.index_to_words[encoded_text]
        return [self.decode_data(item) for item in encoded_text]

    def get_counts(
        self: "BagOfWords",
        text: typing.Union[typing.Iterable[str], typing.Iterable[typing.Iterable[str]]],
    ):
        """
        Computes the counts of words in a new input tokenized string
        :param text: a sequence of tokens, or a sequence of such sequences;
            it must support ``len()`` and indexing
        :return: 1-D array of length ``len(self)`` for a flat token sequence,
            or the per-sequence arrays stacked along axis 0 for nested input;
            out-of-vocabulary words are ignored
        """
        # len() rather than truthiness: ``text`` may be a NumPy array, whose
        # truth value is ambiguous.
        if len(text) == 0 or isinstance(text[0], str):
            x = np.zeros(shape=len(self))
            for word in text:
                if word in self.words_to_index:
                    x[self.words_to_index[word]] += 1
            return x
        return np.stack([self.get_counts(item) for item in text], axis=0)

In [2]:
import numpy as np


class LabelEncoder:
    """
    Label encode a series of labels

    Each distinct label is assigned an integer index in order of first
    appearance in the training data.
    """

    def __init__(self, data) -> None:
        """
        :param data: iterable of hashable labels used to build the mapping
        """
        self.__training_data = data
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # label -> index mapping is the same on every run (set iteration order
        # is not guaranteed to be).
        self.index_to_token = list(dict.fromkeys(data))
        self.token_to_index = {
            token: index for index, token in enumerate(self.index_to_token)
        }

    def __len__(self):
        """Return the number of distinct labels."""
        return len(self.token_to_index)

    @property
    def encoded_data(self):
        """Integer-encoded copy of the labels passed to the constructor."""
        return np.array([self.token_to_index[token] for token in self.__training_data])

    def encode(self, data):
        """
        Encode labels as integer indices.
        :param data: iterable of labels
        :return: array of indices
        :raises KeyError: if a label was not present in the training data
        """
        return np.array([self.token_to_index[token] for token in data])

    def decode(self, data):
        """
        Map indices back to labels.
        :param data: a single integer index, or an iterable of indices
        :return: the label for a single index, otherwise an array of labels
        """
        # np.integer covers every NumPy integer width, not only int64.
        if isinstance(data, (int, np.integer)):
            return self.index_to_token[data]
        return np.array([self.index_to_token[index] for index in data])

In [3]:
# Minimum token length: clean_text keeps only tokens with len(token) >= this value.
WORD_LENGTH_THRESHOLD = 2
# NOTE(review): not referenced by any code visible in this notebook —
# presumably a minimum word-occurrence cutoff; confirm against its consumer.
WORD_COUNT_THRESHOLD = 1

In [4]:
import typing
import numpy as np


class BayesianMulticlassModel:
    """
    A multi-class bayesian classifier from encoded text tokens

    ``counts[c, t]`` accumulates the count vectors of every training sample
    labelled ``c``, one column per token.
    """

    def __init__(self, num_classes, num_tokens) -> None:
        """
        :param num_classes: number of distinct labels (rows of ``counts``)
        :param num_tokens: vocabulary size (columns of ``counts``)
        """
        self.counts = np.zeros(shape=(num_classes, num_tokens))

    def fit(self, x_train: typing.Iterable[np.ndarray], y_train: typing.Iterable[int]):
        """
        Accumulate per-class token counts.
        :param x_train: one count vector of length ``num_tokens`` per sample
        :param y_train: integer class index for each sample in ``x_train``
        """
        for x, y in zip(x_train, y_train):
            self.counts[y] += x

    @staticmethod
    def _safe_divide(numerator, denominator):
        """
        Element-wise ``numerator / denominator`` that yields 0 wherever the
        denominator is 0, instead of NaN/inf.
        """
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.zeros(np.broadcast(numerator, denominator).shape)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    def predict(self, counts_vector):
        """
        Rank the classes for one encoded document.
        :param counts_vector: token counts of the document, length ``num_tokens``
        :return: array of class indices ordered from highest to lowest score
        """
        class_frequencies = np.sum(self.counts, axis=1)
        word_frequencies = np.sum(self.counts, axis=0)

        # Tokens or classes that never occurred in training have zero
        # frequency. A plain division would give 0/0 = NaN there, and a single
        # NaN makes every class score NaN. Masked division maps those entries
        # to 0 so they only receive the smoothing constant below.
        prior = self._safe_divide(
            class_frequencies, np.sum(class_frequencies)
        )  # p(label)
        likelihood = self._safe_divide(
            self.counts, np.expand_dims(class_frequencies, axis=1)
        )  # p(word|label)
        evidence = self._safe_divide(
            word_frequencies, np.sum(word_frequencies)
        )  # p(word)

        # Weight each token's likelihood by how often it occurs in the document.
        likelihood = np.multiply(likelihood, counts_vector)
        prior = np.expand_dims(prior, axis=1)

        # The additive constant keeps log() finite for zero-probability entries.
        posterior_marginal = self._safe_divide(prior * likelihood, evidence) + 0.00001
        posterior_joint = np.sum(np.log(posterior_marginal), axis=1)
        # argsort is ascending; flip so the best-scoring class comes first.
        return np.flip(np.argsort(posterior_joint))

In [5]:
!pip install pdfreader


Collecting pdfreader
  Downloading pdfreader-0.1.15-py3-none-any.whl.metadata (4.3 kB)
Collecting bitarray>=1.1.0 (from pdfreader)
  Downloading bitarray-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Collecting pycryptodome>=3.9.9 (from pdfreader)
  Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pdfreader-0.1.15-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitarray-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (303 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.1/303.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m44.5 MB/s[0m eta [36m0

In [6]:
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from pdfreader import PDFDocument, SimplePDFViewer, document

# Define hyperparameters directly instead of importing
# Minimum token length: clean_text keeps only tokens with len(token) >= this value.
WORD_LENGTH_THRESHOLD = 2
# NOTE(review): not referenced by any code visible in this notebook —
# presumably a minimum word-occurrence cutoff; confirm against its consumer.
WORD_COUNT_THRESHOLD = 1

def clean_text(text: str):
    """
    Normalize raw text into a list of lowercase alphabetic tokens.

    The text is lowercased, every character outside a-z is replaced by a
    space, the result is split on whitespace, and tokens shorter than
    WORD_LENGTH_THRESHOLD are discarded.
    :param text: The text to be cleaned
    :return: list of surviving tokens, in their original order
    """
    lowered = text.lower()
    letters_only = re.sub("[^a-z]", " ", lowered)
    return [
        token
        for token in letters_only.split()
        if not len(token) < WORD_LENGTH_THRESHOLD
    ]

def parse_pdf(filename: str):
    """
    Read text from a PDF file.
    Clean the text, tokenize it, and return as a list of tokens.
    :param filename: The PDF file path
    :return: the tokens produced by ``clean_text`` from the text of all pages
    """
    # The context manager closes the file even if parsing raises.
    with open(filename, "rb") as fd:
        # Named pdf_document so the imported ``document`` module is not shadowed.
        pdf_document = PDFDocument(fd)
        viewer = SimplePDFViewer(fd)
        page_count = sum(1 for _ in pdf_document.pages())
        output_strings = []
        # Viewer page numbers start at 1. Visit each page in turn; navigating
        # to page 1 on every pass would only ever collect the first page.
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            output_strings.extend(viewer.canvas.strings)
    file_contents = " ".join(output_strings)
    return clean_text(file_contents)

def parse_resume_df(csv_path: str = "data/resume-dataset.csv"):
    """
    Load the resume dataset and tokenize every resume.
    :param csv_path: CSV file containing "Resume" and "Category" columns;
        defaults to the path that was previously hard-coded
    :return: tuple ``(keywords, categories)`` — an array holding the
        ``clean_text`` token list of each resume, and the array of the
        corresponding "Category" values
    """
    resume_df = pd.read_csv(csv_path)
    keywords = resume_df["Resume"].apply(clean_text)
    return keywords.values, resume_df["Category"].values


In [7]:
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from pdfreader import PDFDocument, SimplePDFViewer, document




def clean_text(text: str):
    """
    Normalize raw text into a list of lowercase alphabetic tokens.

    The text is lowercased, every character outside a-z is replaced by a
    space, the result is split on whitespace, and tokens shorter than
    WORD_LENGTH_THRESHOLD are discarded.
    :param text: The text to be cleaned
    :return: list of surviving tokens, in their original order
    """
    lowered = text.lower()
    letters_only = re.sub("[^a-z]", " ", lowered)
    return [
        token
        for token in letters_only.split()
        if not len(token) < WORD_LENGTH_THRESHOLD
    ]


def parse_pdf(filename: str):
    """
    Read text from a PDF file.
    Clean the text, tokenize it, and return as a list of tokens.
    :param filename: The PDF file path
    :return: the tokens produced by ``clean_text`` from the text of all pages
    """
    # The context manager closes the file even if parsing raises.
    with open(filename, "rb") as fd:
        # Named pdf_document so the imported ``document`` module is not shadowed.
        pdf_document = PDFDocument(fd)
        viewer = SimplePDFViewer(fd)
        page_count = sum(1 for _ in pdf_document.pages())
        output_strings = []
        # Viewer page numbers start at 1. Visit each page in turn; navigating
        # to page 1 on every pass would only ever collect the first page.
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            output_strings.extend(viewer.canvas.strings)
    file_contents = " ".join(output_strings)
    return clean_text(file_contents)


def parse_resume_df(csv_path: str = "data/resume-dataset.csv"):
    """
    Load the resume dataset and tokenize every resume.
    :param csv_path: CSV file containing "Resume" and "Category" columns;
        defaults to the path that was previously hard-coded
    :return: tuple ``(keywords, categories)`` — an array holding the
        ``clean_text`` token list of each resume, and the array of the
        corresponding "Category" values
    """
    resume_df = pd.read_csv(csv_path)
    keywords = resume_df["Resume"].apply(clean_text)
    return keywords.values, resume_df["Category"].values

In [8]:
# Upload local files into the Colab session; the call's return value is kept
# in `uploaded`. (The recorded run below saved model.py, encoder.py, bow.py and
# hyperparams.py.)
from google.colab import files
uploaded = files.upload()


Saving model.py to model.py
Saving encoder.py to encoder.py
Saving bow.py to bow.py
Saving hyperparams.py to hyperparams.py


In [9]:
# Load model.py into `lines` (kept for the following cells).
with open("model.py", "r") as f:
    lines = f.readlines()

# Echo the file with 1-based line numbers to identify issues; each entry keeps
# its own trailing newline, so nothing extra is appended.
print("".join(f"{i+1}: {line}" for i, line in enumerate(lines)), end="")


1: import typing
2: import numpy as np
3: 
4: 
5: class BayesianMulticlassModel:
6:     """
7:     A multi-class bayesian classfier from encoded text tokens
8:     """
9: 
10:     def __init__(self, num_classes, num_tokens) -> None:
11:         self.counts = np.zeros(shape=(num_classes, num_tokens))
12: 
13:     def fit(self, x_train: typing.Iterable[np.ndarray], y_train: typing.Iterable[int]):
14:         for x, y in zip(x_train, y_train):
15:             self.counts[y] += x
16: 
17:     def predict(self, counts_vector):
18:         class_frequencies = np.sum(self.counts, axis=1)
19:         word_frequencies = np.sum(self.counts, axis=0)
20: 
21:         prior = class_frequencies / np.sum(class_frequencies)  # p(label)
22:         likelihood = self.counts / np.expand_dims(
23:             class_frequencies, axis=1
24:         )  # p(word|label)
25:         evidence = word_frequencies / np.sum(word_frequencies)  # p(word)
26: 
27:         likelihood = np.multiply(likelihood, counts_vec

In [10]:
# Remove only the leading whitespace that is common to every line.
# Stripping each line individually would flatten the block structure Python
# relies on (and delete blank lines), leaving model.py unparseable.
import ast
import textwrap

fixed_source = textwrap.dedent("".join(lines))

# Refuse to overwrite the file with something that is not valid Python; a
# SyntaxError here points at the line that actually needs fixing.
ast.parse(fixed_source, filename="model.py")

fixed_lines = fixed_source.splitlines(keepends=True)

# Overwrite the file with the corrected version
with open("model.py", "w") as f:
    f.writelines(fixed_lines)

print("Indentation fixed!")


Indentation fixed!


In [11]:
with open("model.py", "r") as f:
    lines = f.readlines()

# Give a class a placeholder body only when it is missing one, i.e. when the
# docstring line that follows the `class` line is not indented deeper than the
# class itself. A class whose body is already indented is left untouched, so
# its docstring stays the first statement and re-running this cell does not
# pile up extra `pass` lines.
fixed_lines = []
for i, line in enumerate(lines):
    fixed_lines.append(line)
    if not line.strip().startswith("class "):
        continue
    if i + 1 >= len(lines) or not lines[i + 1].strip().startswith('"""'):
        continue
    class_indent = len(line) - len(line.lstrip())
    body_indent = len(lines[i + 1]) - len(lines[i + 1].lstrip())
    if body_indent > class_indent:
        continue
    # Indent the placeholder one level deeper than the class statement.
    fixed_lines.append(" " * (class_indent + 4) + "pass\n")

with open("model.py", "w") as f:
    f.writelines(fixed_lines)

print("Fixed and saved model.py!")


Fixed and saved model.py!


In [13]:
class BayesianMulticlassModel:
    """
    A multi-class bayesian classifier from encoded text tokens

    ``counts[c, t]`` accumulates the count vectors of every training sample
    labelled ``c``, one column per token.
    """

    def __init__(self, num_classes, num_tokens) -> None:
        """
        :param num_classes: number of distinct labels (rows of ``counts``)
        :param num_tokens: vocabulary size (columns of ``counts``)
        """
        self.counts = np.zeros(shape=(num_classes, num_tokens))

    def fit(self, x_train: typing.Iterable[np.ndarray], y_train: typing.Iterable[int]):
        """
        Accumulate per-class token counts.
        :param x_train: one count vector of length ``num_tokens`` per sample
        :param y_train: integer class index for each sample in ``x_train``
        """
        for x, y in zip(x_train, y_train):
            self.counts[y] += x

    @staticmethod
    def _safe_divide(numerator, denominator):
        """
        Element-wise ``numerator / denominator`` that yields 0 wherever the
        denominator is 0, instead of NaN/inf.
        """
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.zeros(np.broadcast(numerator, denominator).shape)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    def predict(self, counts_vector):
        """
        Rank the classes for one encoded document.
        :param counts_vector: token counts of the document, length ``num_tokens``
        :return: array of class indices ordered from highest to lowest score
        """
        class_frequencies = np.sum(self.counts, axis=1)
        word_frequencies = np.sum(self.counts, axis=0)

        # Tokens or classes that never occurred in training have zero
        # frequency. A plain division would give 0/0 = NaN there, and a single
        # NaN makes every class score NaN. Masked division maps those entries
        # to 0 so they only receive the smoothing constant below.
        prior = self._safe_divide(
            class_frequencies, np.sum(class_frequencies)
        )  # p(label)
        likelihood = self._safe_divide(
            self.counts, np.expand_dims(class_frequencies, axis=1)
        )  # p(word|label)
        evidence = self._safe_divide(
            word_frequencies, np.sum(word_frequencies)
        )  # p(word)

        # Weight each token's likelihood by how often it occurs in the document.
        likelihood = np.multiply(likelihood, counts_vector)
        prior = np.expand_dims(prior, axis=1)

        # The additive constant keeps log() finite for zero-probability entries.
        posterior_marginal = self._safe_divide(prior * likelihood, evidence) + 0.00001
        posterior_joint = np.sum(np.log(posterior_marginal), axis=1)
        # argsort is ascending; flip so the best-scoring class comes first.
        return np.flip(np.argsort(posterior_joint))

In [15]:
class BayesianMulticlassModel:
    """
    A multi-class bayesian classifier from encoded text tokens

    ``counts[c, t]`` accumulates the count vectors of every training sample
    labelled ``c``, one column per token.
    """

    def __init__(self, num_classes, num_tokens) -> None:
        """
        :param num_classes: number of distinct labels (rows of ``counts``)
        :param num_tokens: vocabulary size (columns of ``counts``)
        """
        self.counts = np.zeros(shape=(num_classes, num_tokens))

    def fit(self, x_train: typing.Iterable[np.ndarray], y_train: typing.Iterable[int]):
        """
        Accumulate per-class token counts.
        :param x_train: one count vector of length ``num_tokens`` per sample
        :param y_train: integer class index for each sample in ``x_train``
        """
        for x, y in zip(x_train, y_train):
            self.counts[y] += x

    @staticmethod
    def _safe_divide(numerator, denominator):
        """
        Element-wise ``numerator / denominator`` that yields 0 wherever the
        denominator is 0, instead of NaN/inf.
        """
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.zeros(np.broadcast(numerator, denominator).shape)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    def predict(self, counts_vector):
        """
        Rank the classes for one encoded document.
        :param counts_vector: token counts of the document, length ``num_tokens``
        :return: array of class indices ordered from highest to lowest score
        """
        class_frequencies = np.sum(self.counts, axis=1)
        word_frequencies = np.sum(self.counts, axis=0)

        # Tokens or classes that never occurred in training have zero
        # frequency. A plain division would give 0/0 = NaN there, and a single
        # NaN makes every class score NaN. Masked division maps those entries
        # to 0 so they only receive the smoothing constant below.
        prior = self._safe_divide(
            class_frequencies, np.sum(class_frequencies)
        )  # p(label)
        likelihood = self._safe_divide(
            self.counts, np.expand_dims(class_frequencies, axis=1)
        )  # p(word|label)
        evidence = self._safe_divide(
            word_frequencies, np.sum(word_frequencies)
        )  # p(word)

        # Weight each token's likelihood by how often it occurs in the document.
        likelihood = np.multiply(likelihood, counts_vector)
        prior = np.expand_dims(prior, axis=1)

        # The additive constant keeps log() finite for zero-probability entries.
        posterior_marginal = self._safe_divide(prior * likelihood, evidence) + 0.00001
        posterior_joint = np.sum(np.log(posterior_marginal), axis=1)
        # argsort is ascending; flip so the best-scoring class comes first.
        return np.flip(np.argsort(posterior_joint))

This project focuses on using Bayesian classification to analyze resumes and predict suitable job categories based on their content. By leveraging natural language processing (NLP) techniques, it extracts key features from resumes using a Bag-of-Words (BoW) model and trains a Naïve Bayes classifier to learn the probabilities of different words appearing in specific job categories. When a new resume is processed, the trained model predicts the most relevant job category based on word frequencies and statistical probabilities. Additionally, an explainer module provides insights into the classification by highlighting the words that influenced the decision. Through this project, we gain a deeper understanding of how Bayesian models can be used for text analysis, particularly in automating resume screening and job categorization, making the hiring process more efficient and data-driven.