In [58]:
from google.colab import drive

# Mounting Google Drive at '/content/drive' so that the dataset, the resume and the
# saved model can be read from and written to paths under it
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [59]:
# Installing the third-party packages imported by the cells below
# NOTE(review): 'transformers' and 'torch' are imported later but are not installed here;
# presumably they are preinstalled in the runtime - confirm
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases
!pip install pandas scikit-learn scikit-multilearn
!pip install joblib
!pip install pyspellchecker
!pip install pdf2docx
!pip install docx2txt



In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from skmultilearn.problem_transform import BinaryRelevance

# Converts the dataset into a dataframe which represents each category by a separate column
def convert_dataset_structure(file_path):
    """Convert the raw dataset into a CSV with one 0/1 column per category.

    The CSV at ``file_path`` must contain a 'Categories' column holding
    comma-separated category names. Each name is trimmed, stripped of double
    quotes and lowercased. Missing cells and empty names (for example the one
    produced by a trailing comma) contribute no category.

    The converted rows are shuffled with a fixed seed, written next to the
    original file with ".csv" replaced by "New.csv", and printed.

    Args:
        file_path: Path to the original CSV file.

    Returns:
        The path of the newly written CSV file.
    """
    # Loading the dataset from Google Drive
    df = pd.read_csv(file_path)

    def split_categories(raw_value):
        # A missing cell is read by pandas as NaN, which has no .split()
        if pd.isna(raw_value):
            return []
        cleaned = (category.strip().strip('"').lower() for category in str(raw_value).split(","))
        # Dropping empty names so that no column with an empty header is created
        return [category for category in cleaned if category]

    # Splitting the "Categories" column and converting each category to lowercase
    df['categories'] = df['Categories'].apply(split_categories)

    # Removing duplicate categories and sorting the unique categories alphabetically
    all_categories = sorted({category for categories_list in df['categories'] for category in categories_list})

    # Creating a new dataframe which represents each category by a separate column
    new_columns = {
        category: df['categories'].apply(lambda labels: 1 if category in labels else 0)
        for category in all_categories
    }
    new_df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)

    # Removing the 'Categories' column and the 'categories' column
    new_df = new_df.drop(columns=['Categories', 'categories'])

    # Shuffling the rows. "frac=1" means the entire dataframe will be shuffled
    shuffled_df = new_df.sample(frac=1, random_state=42)

    # Replacing the last occurrence of ".csv" in the file path with "New.csv"
    new_file_path = file_path.rsplit('.csv', 1)[0] + "New.csv"

    # Storing the converted dataframe in Google Drive using a new file path
    shuffled_df.to_csv(new_file_path, index=False, encoding='utf-8')

    print(shuffled_df)

    return new_file_path

# Entering the file path to the original dataset
# The call writes the converted CSV and returns the path of that new file
converted_dataset_file_path = convert_dataset_structure('/content/drive/MyDrive/ColabFiles/CategoryDataset1.csv')

                                              Sentence  .net  address  \
229  "IT consultant providing strategic advice and ...     0        0   
73                             "susan.white@email.com"     0        0   
521                                     "Apache Kafka"     0        0   
86                                    "(555) 234-5678"     0        0   
469                                        "Bootstrap"     0        0   
..                                                 ...   ...      ...   
71                                      "Jacob Miller"     0        0   
106                                  "(94) 71-8765432"     0        0   
270  "C# software engineer specializing in developi...     1        0   
435  "Attended a course on Docker and Kubernetes fo...     0        0   
102                                  "(94) 76-9876543"     0        0   

     agile methodologies  ai  algorithms  android  angular  api  apple  ...  \
229                    0   0           0    

In [61]:
# Loading the converted dataset from Google Drive
df = pd.read_csv(converted_dataset_file_path)

# Displaying the number of rows and columns in the converted dataset
print(df.shape)

# Displaying the first rows of the converted dataset
df.head()

(540, 151)


Unnamed: 0,Sentence,.net,address,agile methodologies,ai,algorithms,android,angular,api,apple,...,user interface,vector graphics editor,version control,virtual reality,volunteer work,vue,web development,web server,work experience,workshops
0,"""IT consultant providing strategic advice and ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,"""susan.white@email.com""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"""Apache Kafka""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"""(555) 234-5678""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"""Bootstrap""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Obtaining the 'Sentence' column in the dataset
X = df['Sentence']

# Obtaining the category columns in the dataset
# (every column after the first; assumes 'Sentence' is the first column - confirm)
y = df.iloc[:, 1:]

# Splitting the dataset into training and testing sets
# (20% of the rows are held out for testing; the seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using TF-IDF to convert text data into numerical vectors
# The vectorizer is fitted on the training sentences only and then reused, unchanged,
# to transform the testing sentences
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_X_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_X_test = tfidf_vectorizer.transform(X_test)

# Trains and evaluates the multi-label text classification model
def train_and_evaluate_model():
    """Train the multi-label classifier, print its test-set report and return it.

    Reads the module-level ``tfidf_X_train``, ``y_train``, ``tfidf_X_test`` and
    ``y_test`` produced by the preceding train/test split and TF-IDF steps.

    Returns:
        The fitted BinaryRelevance classifier.
    """
    # numpy is imported locally because no earlier cell of the notebook imports it
    import numpy as np

    # Calculating class weights to handle the imbalance in the dataset.
    # The weights are computed once over all label columns flattened together, so every
    # per-label classifier receives the same pair of weights.
    # 'classes' is passed as an ndarray, which is the type scikit-learn documents for it
    class_weights = class_weight.compute_class_weight(
        'balanced', classes=np.array([0, 1]), y=y_train.values.flatten()
    )

    # Building a multi-label text classification model using BinaryRelevance with SVM
    classifier = BinaryRelevance(
        classifier=OneVsRestClassifier(SVC(kernel='linear', class_weight={0: class_weights[0], 1: class_weights[1]})),
        require_dense=[False, True]
    )

    # Training the model
    classifier.fit(tfidf_X_train, y_train)

    # Obtaining predictions of the trained model for the testing set
    y_prediction = classifier.predict(tfidf_X_test)

    # Displaying the evaluation results of the trained model
    # NOTE(review): zero_division=1 reports 1.00 wherever a metric is undefined (for example
    # labels with no test samples), which can make the report look better than it is
    print(classification_report(y_test, y_prediction, zero_division=1))

    return classifier

# Training the model; the returned classifier is used by the prediction and saving cells
classifier = train_and_evaluate_model()



              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7
           2       1.00      1.00      1.00         0
           3       0.00      1.00      0.00         0
           4       1.00      1.00      1.00         0
           5       1.00      1.00      1.00         1
           6       1.00      0.00      0.00         2
           7       1.00      1.00      1.00         1
           8       1.00      0.00      0.00         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         3
          11       1.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         0
          14       1.00      0.00      0.00         2
          15       1.00      0.50      0.67         2
          16       1.00      1.00      1.00         0
          17       1.00    

In [63]:
# Returns the prediction of the multi-label text classification model for a given sentence
def predict_categories(sentence, threshold):
    """Score one sentence against every label and keep the labels above a threshold.

    Uses the module-level ``tfidf_vectorizer``, ``classifier`` and ``y``.

    Args:
        sentence: The text to classify.
        threshold: Labels are kept only when their scaled score is greater than this.

    Returns:
        A dataframe with the columns 'Label', 'Prediction Score' and 'Prediction',
        holding one row per label whose scaled score exceeds ``threshold``.
    """
    # Converting the sentence to TF-IDF representation
    tfidf_sentence = tfidf_vectorizer.transform([sentence])

    # Collecting the decision-function output of each per-label classifier, in label order
    raw_scores_by_label = {
        y.columns[position]: classifier.classifiers_[position].decision_function(tfidf_sentence)
        for position in range(y.shape[1])
    }

    # One-row dataframe with one column per label
    raw_score_frame = pd.DataFrame(raw_scores_by_label)

    # Squashing the raw scores into the range between 0 and 1 (base-10 logistic curve)
    scaled_score_frame = 1 / (1 + 10**(-raw_score_frame))

    # Binary decision per label
    binary_frame = (scaled_score_frame > threshold).astype(int)

    # Reshaping to one row per label for better readability
    result_table = scaled_score_frame.melt(var_name='Label', value_name='Prediction Score')
    result_table['Prediction'] = binary_frame.values.flatten()

    # Keeping only the labels whose score is higher than the specified threshold
    above_threshold = result_table['Prediction Score'] > threshold
    return result_table[above_threshold]

# Getting predictions for a sample sentence
# (0.5 is the score threshold: only labels scoring above it are returned)
sample_sentence = "Mobile application developer who is holding a degree in Mobile Application Development with a proficiency in Kotlin and Swift."
sample_prediction = predict_categories(sample_sentence, 0.5)

print("Predictions for the sentence:")
print(sample_prediction)

Predictions for the sentence:
                    Label  Prediction Score  Prediction
43                 degree          0.760221           1
51              education          0.884699           1
75                 kotlin          0.671074           1
87                 mobile          0.934070           1
127  software development          0.968874           1
129                 swift          0.505115           1
148       work experience          0.900891           1


In [64]:
import joblib

# Saves the trained model
def save_model(classifier, model_filename='/content/drive/MyDrive/ColabFiles/multi_label_classifier_model.joblib'):
    """Persist the trained classifier with joblib.

    Args:
        classifier: The trained model object to store.
        model_filename: Destination file path. Defaults to the notebook's
            Google Drive location.

    Returns:
        The path the model was written to.
    """
    # NOTE(review): only the classifier is stored; predict_categories also relies on the
    # fitted tfidf_vectorizer and the label columns of y, which are not saved here
    joblib.dump(classifier, model_filename)

    return model_filename

# Saving the classifier trained above
save_model(classifier)

In [65]:
from spellchecker import SpellChecker

# Corrects the spellings of text inputs
def spell_correction(text_input, spell=None):
    """Return ``text_input`` with every word replaced by its most likely spelling.

    Args:
        text_input: Text made of whitespace-separated words.
        spell: Optional object exposing ``correction(word)``. When omitted a new
            ``SpellChecker`` is created; passing one in lets callers reuse a
            single instance across many calls.

    Returns:
        The corrected words joined by single spaces, or the string
        "cannot recognize word" when the checker has no correction for a word
        other than "none"/"non".
    """
    if spell is None:
        spell = SpellChecker()

    corrected_words = []

    # Splitting the text input into words (this also drops surrounding whitespace)
    for word in text_input.split():
        # Getting the most likely word
        corrected_word = spell.correction(word)

        if corrected_word is None:
            if word.lower() not in ("none", "non"):
                return "cannot recognize word"

            # Keeping the word as typed instead of formatting None into the text "None"
            corrected_word = word

        corrected_words.append(corrected_word)

    return " ".join(corrected_words)

In [66]:
# Areas of interest entered by the user
# (this sample contains misspelled entries and one string that is not a word)
areas_of_interest = ["work exprience", "degree", "edcation", "vbvjhsfbfhjsbv"]

# Corrects the spellings of the areas of interest entered by the user
def areas_of_interest_correct_spellings(areas_of_interests):
    """Spell-correct each area of interest and drop the unrecognisable ones.

    Every corrected term, including the "cannot recognize word" marker, is printed.

    Args:
        areas_of_interests: Iterable of area-of-interest strings.

    Returns:
        A list of the corrected terms, excluding entries that could not be recognised.
    """
    unrecognised_marker = "cannot recognize word"
    accepted_terms = []

    for raw_area in areas_of_interests:
        fixed_area = spell_correction(raw_area)
        print(fixed_area)

        # Entries the spell checker could not resolve are reported but not kept
        if fixed_area == unrecognised_marker:
            continue

        accepted_terms.append(fixed_area)

    return accepted_terms

# Correcting the sample areas of interest; unrecognised entries are left out of the result
corrected_areas_of_interest = areas_of_interest_correct_spellings(areas_of_interest)

print(corrected_areas_of_interest)

work experience
degree
education
cannot recognize word
['work experience', 'degree', 'education']


In [67]:
from pdf2docx import Converter

# Resume uploaded by the user (path to a '.pdf' file in Google Drive)
uploaded_resume = '/content/drive/MyDrive/ColabFiles/10089434.pdf'

# Converts a '.pdf' file to a '.docx' file
def convert_pdf_to_docx(pdf_path):
    """Convert a PDF file to a '.docx' file stored next to it.

    Args:
        pdf_path: Path to the PDF file. The extension check ignores letter case.

    Returns:
        The path of the created '.docx' file, or None (after printing a message)
        when ``pdf_path`` does not end with '.pdf'.
    """
    if pdf_path.lower().endswith('.pdf'):
        # Replacing the '.pdf' extension in the pdf path string with '.docx'
        docx_path = pdf_path[:-4] + '.docx'

        # Converting the '.pdf' file to a '.docx' file
        cv = Converter(pdf_path)

        try:
            # Storing the converted '.docx' file in the docx path
            cv.convert(docx_path)
        finally:
            # Closing the converter even when the conversion fails
            cv.close()

        return docx_path

    print("The file does not contain a '.pdf' extension.")

    return None

# Converting the uploaded resume; the result is None when the path is not a '.pdf' file
uploaded_resume_to_docx = convert_pdf_to_docx(uploaded_resume)



In [68]:
import docx2txt

# Converts a '.docx' file to plain text
def extract_text_from_docx(docx_path):
    """Extract the text of a '.docx' file and normalise its whitespace.

    Tabs are replaced by single spaces and every run of consecutive line
    breaks is collapsed into one line break.

    Args:
        docx_path: Path to the '.docx' file. May be None, for example when an
            earlier conversion step produced no file.

    Returns:
        The cleaned text, or None (after printing a message) when the path is
        not a '.docx' path or when no text could be extracted.
    """
    if not isinstance(docx_path, str) or not docx_path.lower().endswith('.docx'):
        print("The file does not contain a '.docx' extension.")

        return None

    # Converting the '.docx' file to plain text
    txt = docx2txt.process(docx_path)

    if not txt:
        # The extension was fine, so the problem is reported as missing text
        print("No text could be extracted from the '.docx' file.")

        return None

    # Replacing the tabs in the text with a space
    tabs_replaced = txt.replace('\t', ' ')

    # Replacing the additional line breaks in the text with a single line break.
    # Repeating the replacement is needed because one pass halves a run of line breaks
    # instead of collapsing it completely
    line_breaks_replaced = tabs_replaced
    while '\n\n' in line_breaks_replaced:
        line_breaks_replaced = line_breaks_replaced.replace('\n\n', '\n')

    return line_breaks_replaced

# Extracting the plain text of the converted resume (None when no text could be produced)
resume_text = extract_text_from_docx(uploaded_resume_to_docx)

print(resume_text)

INFORMATION TECHNOLOGY TECHNICIAN I 
Summary
Versatile Systems Administrator possessing superior troubleshooting skills for networking issues, end user problems, and network security. Experienced in server management, systems analysis, and offering in-depth understanding of IT infrastructure areas. Detail-oriented, independent, and focused on taking a systematic approach to solving complex problems. Demonstrated exceptional technical knowledge and skills while working with various teams to achieve shared goals and objectives.
Highlights
Active Directory 
Group Policy Objects 
PowerShell and VBScript Microsoft Exchange 
VMWare experience
New technology and product research Office 365 and Azure 
Storage management 
Enterprise backup management 
Disaster recovery
Experience 
Information Technology Technician I Aug 2007 to Current 
Company Name ï¼​ City , State
Migrating and managing user accounts in Microsoft Office 365 and Exchange Online.
Creating and managing virtual machines for syste

In [69]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Loading the BERT model and the BERT tokenizer
# Both are loaded from the same pretrained checkpoint name ('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Checks the similarity of the category and the area of interest
def check_similarity(category, area_of_interest):
    """Return the cosine similarity between the BERT embeddings of two texts.

    Each text is tokenized with the module-level ``bert_tokenizer``, passed
    through ``bert_model``, and represented by the mean of its last hidden
    states over the token dimension.

    Args:
        category: Text of the category.
        area_of_interest: Text of the area of interest.

    Returns:
        The similarity as a single scalar value.
    """
    def mean_pooled_embedding(text):
        # Tokenizing the text and averaging the last hidden states over its tokens
        encoded = bert_tokenizer(text, return_tensors='pt')
        return bert_model(**encoded).last_hidden_state.mean(dim=1).numpy()

    # Gradient computation is disabled because the model is only used for inference
    with torch.no_grad():
        category_vector = mean_pooled_embedding(category)
        interest_vector = mean_pooled_embedding(area_of_interest)

    # Cosine similarity: dot product divided by the product of the two vector norms
    dot_product = np.dot(category_vector, interest_vector.T)
    norm_product = np.linalg.norm(category_vector) * np.linalg.norm(interest_vector)
    similarity_matrix = dot_product / norm_product

    # The matrix holds exactly one value because each input is a single text
    return similarity_matrix[0][0]

In [70]:
# Minimum prediction score for a category to be assigned to a sentence
prediction_threshold = 0.8
# Minimum similarity for a category to be treated as relevant to an area of interest
similarity_threshold = 0.9
# Module-level result lists which the functions below append to
sentence_category_mapping = [] # Sentences in the resume with their predicted categories
identified_categories = [] # Categories found in the resume
category_relevance = [] # Similarity between each category found and each area of interest
relevant_categories = [] # Categories which match with at least one area of interest
relevant_sentences = [] # Sentences which belong to categories which match with at least one area of interest

# Predicts the categories of each sentence in the resume
def predict_resume_sentence_categories(sentences):
    """Record the predicted categories of every sentence.

    For each sentence, every label scoring above ``prediction_threshold`` is
    appended to ``sentence_category_mapping`` as ``[sentence, label, score]``,
    and each label is added once to ``identified_categories``.

    Args:
        sentences: Iterable of sentence strings.
    """
    for sentence in sentences:
        predictions = predict_categories(sentence, prediction_threshold)
        labelled_scores = zip(predictions['Label'], predictions['Prediction Score'])

        for label, prediction_score in labelled_scores:
            # Skipping anything that does not clear the threshold
            if not prediction_score > prediction_threshold:
                continue

            sentence_category_mapping.append([sentence, label, prediction_score])

            # Each category is recorded only the first time it is seen
            if label in identified_categories:
                continue

            identified_categories.append(label)

# Checks the similarity between the identified categories and the areas of interest
def check_relevance_of_categories(areas_of_interest):
    """Find the identified categories that match at least one area of interest.

    Every (category, area of interest) pair whose similarity is above
    ``similarity_threshold`` is appended to ``category_relevance`` as
    ``[category, area_of_interest, similarity]``, and the category is added
    once to ``relevant_categories``.

    Args:
        areas_of_interest: The areas of interest to compare each category with.
    """
    for category in identified_categories:
        for area_of_interest in areas_of_interest:
            similarity = check_similarity(category, area_of_interest)

            # Pairs which are not similar enough are ignored
            if not similarity > similarity_threshold:
                continue

            category_relevance.append([category, area_of_interest, similarity])

            # A category is listed once even if it matches several areas of interest
            if category in relevant_categories:
                continue

            relevant_categories.append(category)

# Joins the relevant sentences into a paragraph to form the summary
def join_relevant_sentences():
    """Concatenate the entries of ``relevant_sentences`` into one paragraph.

    Every sentence is followed by a space; a full stop is inserted first when
    the sentence, ignoring surrounding whitespace, does not already end with one.

    Returns:
        The joined summary text (an empty string when there are no sentences).
    """
    pieces = [
        sentence + (" " if sentence.strip().endswith('.') else ". ")
        for sentence in relevant_sentences
    ]

    return "".join(pieces)

# Summarizes the uploaded resume based on the entered areas of interest
def summarize_resume(resume_text, areas_of_interest):
    """Build a summary from the resume lines that match the areas of interest.

    The module-level result lists are emptied at the start of every call so
    that repeated calls do not mix results from earlier resumes.

    Args:
        resume_text: Plain text of the resume, one sentence per line. May be
            None or empty.
        areas_of_interest: The areas of interest to match categories against.

    Returns:
        The summary paragraph, or "Relevant data could not be found in the resume."
        when there is no text or no relevant sentence.
    """
    # Emptying the shared lists in place, because the helper functions append to
    # these same list objects
    for shared_results in (sentence_category_mapping, identified_categories, category_relevance,
                           relevant_categories, relevant_sentences):
        shared_results.clear()

    # Text extraction may have produced nothing at all
    if not resume_text:
        return "Relevant data could not be found in the resume."

    # Splitting the resume text at each line break to form an array of sentences
    sentences = [sentence.strip() for sentence in resume_text.split('\n') if sentence.strip()]

    predict_resume_sentence_categories(sentences)

    check_relevance_of_categories(areas_of_interest)

    # Obtaining the sentences which belong to categories which are relevant to the areas of interest
    for mapping in sentence_category_mapping:
        if mapping[0] not in relevant_sentences and mapping[1] in relevant_categories:
            relevant_sentences.append(mapping[0])

    if not relevant_sentences:
        return "Relevant data could not be found in the resume."

    summary = join_relevant_sentences()

    return summary

# Summarizing the extracted resume text using the spell-corrected areas of interest
result = summarize_resume(resume_text, corrected_areas_of_interest)

print(result)

INFORMATION TECHNOLOGY TECHNICIAN I. Versatile Systems Administrator possessing superior troubleshooting skills for networking issues, end user problems, and network security. Experienced in server management, systems analysis, and offering in-depth understanding of IT infrastructure areas. Detail-oriented, independent, and focused on taking a systematic approach to solving complex problems. Demonstrated exceptional technical knowledge and skills while working with various teams to achieve shared goals and objectives. VMWare experience. New technology and product research Office 365 and Azure. Experience. Information Technology Technician I Aug 2007 to Current. Developing detailed specifications for the acquisition of an Enterprise backup system including systems design, business-case documentation, cost benefit analysis, technical diagrams, and work flow documentation. Debugging and logging of errors in Hyperion and MiamiBiz using Team Foundation Server (TFS). Communicating and defini