# 1. Installing the required libraries

In [None]:
# Installing the libraries for project
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install spacy-transformers
!python -m spacy download en_core_web_trf
!!pip install tika


# 2. Loading the important libraries and connecting to Google Drive

In [None]:
# Connect this runtime to Google Drive through google-drive-ocamlfuse
# Approach adapted from https://stackoverflow.com/questions/71695387/connecting-to-a-different-google-drive-than-the-one-logged-into-google-colab/71696254#71696254
# Add the PPA that provides google-drive-ocamlfuse, then install the package
!sudo add-apt-repository ppa:alessandro-strada/ppa
!sudo apt update && sudo apt install google-drive-ocamlfuse
# First invocation, without a mount point (presumably starts the authorisation flow - TODO confirm)
!google-drive-ocamlfuse

!sudo apt-get install w3m # to act as web browser 
!xdg-settings set default-web-browser w3m.desktop # to set default browser


# Create the folder /content/gdrive/My Drive and hand it to
# google-drive-ocamlfuse as the mount point
%cd /content
!mkdir gdrive
%cd gdrive
!mkdir "My Drive"
!google-drive-ocamlfuse "/content/gdrive/My Drive"

In [None]:
import os 
import json
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import spacy 
from spacy.tokens import DocBin
from tqdm import tqdm
import re
from tika import parser
import warnings
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training.example import Example

In [None]:
# Make the project folder on the mounted Drive the working directory, so that
# the relative folder names used by the functions below resolve against it
os.chdir('/content/gdrive/My Drive/appliedai/assignment/case_study_2')

# 3. Main Functions

In [None]:
def load_resumes_files_folder():
  '''Return the resumes folder name together with the files it contains.

  The folder name is relative to the current working directory.

  Returns:
    A tuple ``(sub_directory_for_resume, files_list)`` where ``files_list`` is
    the alphabetically sorted list of entry names found in that folder.
  '''
  sub_directory_for_resume = 'naukri_resumes'
  # os.listdir returns entries in arbitrary order; sorting keeps a given
  # file number pointing at the same resume on every run.
  files_list = sorted(os.listdir(sub_directory_for_resume))
  return sub_directory_for_resume,files_list


In [None]:
def pdf_to_text(filepath):
  '''Parse the file at `filepath` with tika and return its 'content' entry.'''
  return parser.from_file(filepath)['content']


In [None]:
def preprocess(text):
  '''Drop empty lines from the extracted text.

  A line is dropped only when nothing is left after removing its carriage
  return / line feed characters, so lines that hold spaces or tabs are kept.

  Args:
    text: Raw text, or None when no text could be extracted.

  Returns:
    The text without empty lines; an empty string for None or empty input.
  '''
  # A PDF without extractable text can reach this point as None; treat it as
  # empty text instead of failing on None.splitlines.
  if not text:
    return ""
  kept_lines = [line for line in text.splitlines(True) if line.strip("\r\n")]
  return "".join(kept_lines)

In [None]:
def load_model():
  '''Load the spaCy pipeline stored under 'generated_model/model-best' and return it.'''
  return spacy.load('generated_model/model-best')

In [None]:
def function_1(file_number):
  '''Print the entities the model finds in one resume.

  `file_number` is the index of the resume inside the resumes folder listing.
  Every entity is printed as a ``(text, label)`` tuple, with each run of
  non-alphanumeric characters in the text replaced by a single space.

  Note: the call to warnings.filterwarnings("ignore") installs a
  process-wide filter, so warnings stay silenced after this function returns.
  '''
  warnings.filterwarnings("ignore")
  # Locate the resume chosen by the user
  resume_dir, resume_files = load_resumes_files_folder()
  resume_path = os.path.join(resume_dir, resume_files[file_number])
  # PDF -> raw text -> text without empty lines
  resume_text = preprocess(pdf_to_text(resume_path))
  # Load the model and run it on the single resume text
  nlp = load_model()
  non_alphanumeric = re.compile('[^A-Za-z0-9]+')
  parsed_docs = nlp.pipe([resume_text], disable=["tagger", "parser"])
  for parsed in parsed_docs:
    for entity in parsed.ents:
      cleaned = non_alphanumeric.sub(' ', entity.text).strip()
      print((cleaned, entity.label_))




In [None]:
# Print the entities found in the resume at index 50 of the resumes folder listing
function_1(50)

('Gangadhar Kavisetty', 'EDUC')
('1 3', 'DEGREE')
('Glosoft Technologies', 'ORG')
('Bengaluru Bangalore', 'EDUC')
('Secunderabad Bengaluru Bangalore Chennai', 'EDUC')
('B Tech B E Electronics Telecommunication', 'DEGREE')
('SOC Analyst', 'SKILL')
('System Security Engineer', 'SKILL')
('IT Security', 'SKILL')
('Information Security', 'SKILL')
('Cyber Security', 'SKILL')
('7 Mar 22Last Modified 4 Mar 22 Jump to Section', 'EXPERIENCE')
('Gangadhar Kavisetty', 'ORG')
('2 3', 'DEGREE')
('Cyber Security Engineer', 'SKILL')
('3 years of experience in SIEM Vulnerability', 'EXPERIENCE')
('direct and remote analysis', 'SKILL')
('Glosoft Technologies Pvt Ltd as', 'ORG')
('SOC Analyst Nov 2018 to Till Date Monitoring Splunk by fetching the Application System and Security related logs from Endpoint devices like Firewall Switch Router Workstations Managed and Monitored for network security capabilities and solutions', 'EXPERIENCE')
('firewalls', 'TOOL')
('B Tech B E Electronics Telecommunication', '

In [None]:
def load_json_file(file_number):
  '''Load the annotations of one tagged training file.

  Args:
    file_number: Index into the alphabetically sorted listing of the
      'spider_software_tagged_data' folder (the original note asks for a
      number between 0 and 50).

  Returns:
    The value stored under the 'annotations' key of the selected JSON file.
  '''
  spider_tagged_data = 'spider_software_tagged_data'
  # os.listdir returns entries in arbitrary order; sorting keeps a given
  # file number pointing at the same file on every run.
  list_tagged_files = sorted(os.listdir(spider_tagged_data))
  file_path = os.path.join(spider_tagged_data,list_tagged_files[file_number])
  # Read as UTF-8 explicitly instead of relying on the platform default encoding
  with open(file_path,'r',encoding='utf-8') as f:
    data = json.load(f)
  return data['annotations']




In [None]:
# Taken from https://stackoverflow.com/questions/68213223/how-to-evaluate-trained-spacy-version-3-model#:~:text=nlp%20%3D%20spacy.load(path_to_model)%0Aexamples%20%3D%20%5B%5D%0Ascorer%20%3D%20Scorer()%0Afor%20text%2C%20annotations%20in%20TEST_REVISION_DATA%3A%0A%20%20%20%20doc%20%3D%20nlp.make_doc(text)%0A%20%20%20%20example%20%3D%20Example.from_dict(doc%2C%20annotations)%0A%20%20%20%20example.predicted%20%3D%20nlp(str(example.predicted))%0A%20%20%20%20examples.append(example)%0Ascorer.score(examples)
def score_metrics(nlp,data):
  '''Score the pipeline `nlp` on annotated data and return the per-type entity scores.

  `data` is iterated as ``(text, annotations)`` pairs. For each pair an Example
  is built from the annotations, its prediction is replaced by the output of
  `nlp` on the same text, and all examples are then scored together.

  Returns the 'ents_per_type' entry of the scorer result.
  '''
  entity_scorer = Scorer()
  scored_examples = []
  for raw_text, gold_annotations in data:
    example = Example.from_dict(nlp.make_doc(raw_text), gold_annotations)
    # Swap the untouched doc for the pipeline's actual prediction on the same text
    example.predicted = nlp(str(example.predicted))
    scored_examples.append(example)
  return entity_scorer.score(scored_examples)['ents_per_type']

In [None]:
def function_2(file_number):
  '''Show one annotated file and the model's per-entity-type scores on it.

  `file_number` selects the annotated file. Its annotations are printed one
  per line, then the model is loaded and each key of the score result is
  printed next to its value.
  '''
  annotations = load_json_file(file_number)
  print('The data looks like:')
  for annotation in annotations:
    print(annotation)
  model = load_model()
  scores_by_type = score_metrics(model, annotations)
  print('='*100)
  print('The result metrics are down below')
  print('-'*100)
  for entity_type in scores_by_type:
    print(entity_type, ' ', scores_by_type[entity_type])





In [None]:
# Choose any annotated file number from 0 to 50 (both inclusive)
function_2(43)

The data looks like:
['Program Manager Resume', {'entities': [[0, 15, 'JOB_TITLE']]}]
['Job Level Management Manager Director ', {'entities': [[21, 37, 'JOB_TITLE']]}]
['Highest Degree Attained Bachelors', {'entities': [[24, 33, 'DEGREE']]}]
['Objective Strategic and results driven professional offering more than 12 years of practical experience in Software Industry focused on technical program management and product quality Accredited as a contributor to the development of all Norton major product releases Resourceful leader skilled in streamlining operation and maintaining schedule to ensure maximum results in business revenue Ability to manage multiple concurrent high value projects while maintaining technical savvy strategic thinking and tactical execution Practical understanding of managing full life cycle large scale modern production software programs High proficiency in planning collaboration communication with ability to drive business operations and processes efficiently and 

Note: the precision, recall and F1 values are high because very little training data was used. Annotated data is not directly available online, so I annotated the data manually and used only 50 files for annotation.