# Resume Classification

In [1]:
# Installing tika - a server which is written in JAVA for text extracting ( JAVA must be installed in the device)
#!pip install tika
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Importing all required packages
!pip install tika
!pip install --upgrade scikit-learn
from  sklearn.model_selection  import train_test_split
# Import DecisionTreeClassifier from the correct module
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from tika import parser
import os
import re
import string
import tree

import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')



## Extracting Text

In [3]:
# Function to parse the data from the file using tika package

def extract_text(file_path):
    parsed = parser.from_file(file_path)
    return parsed["content"]

In [4]:
# listing the subfolders available inside the main folder
dataset_path = '/content/drive/MyDrive/resume./'
folder_list = os.listdir(dataset_path)
folder_list

['Peoplesoft Resume', 'React Developer', 'SQL Developer', 'workday']

In [5]:
# To save the extracted data and role in a list
raw_text = []

# Reading through the dataset and adding to the list
for role in folder_list:
    for eachFile in os.listdir(dataset_path+role):
        text = re.sub(r'^\n+', '', extract_text(dataset_path+role+"/"+eachFile), count=1) # Extracting the text for the particular file
         # Creating a Dictionary containing 2 key&value pairs for dataframe creation
        new_row = {'job_role': role, 'text': text}
        raw_text.append(new_row)

In [6]:
# Creating the DataFrame
resume_dataset = pd.DataFrame(data= raw_text, columns=['job_role', 'text'])
resume_dataset

Unnamed: 0,job_role,text
0,Peoplesoft Resume,Rajab B ...
1,Peoplesoft Resume,Rahul Ahuja\n---------------------------------...
2,Peoplesoft Resume,Anubhav Kumar Singh\t\t\nCore Competencies:\...
3,Peoplesoft Resume,Arun Venu\n EXPERIENCE SUMMARY\n\n· Experienc...
4,Peoplesoft Resume,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...
...,...,...
74,workday,\n ...
75,workday,Microsoft Word - ajay del\n\n\n\nJYOTI VERMA\t...
76,workday,Workday HCM Techno f...
77,workday,Chinna Subbarayudu M\nDOB: 06th March 1994\nNa...


In [7]:
#Checking the raw data
print(resume_dataset['text'][4])

 PeopleSoft Admin
VARKALA VIKAS

Career Objective:

I have total 4.2 Years’ Experience in PeopleSoft Admin and PeopleSoft DBA. I hope to enhance my skill set while adding value to the business, to enable implementation of solutions, which aid the company’s objectives, understanding and anticipating the needs, interests and motivations of the clients and to deliver on time, budget and to quality, delivering value through improving agility, quality and reliability

Professional Summary:

· Having 4.2 years of experience in PeopleSoft implementation, Support, People Tools Upgrades, configuration, migrations, maintenance and administration of Application Server Domains, Process Scheduler Servers, Web Server Domains, PUM and Elastic search.
· Involved in various Tools and Application Upgrades.
· Experience in driving Infrastructure Hardware Upgrades, Disaster Recovery Activities.
· Configured https and secure web server (SSL) administration.
· Monitor system by developing and maintaining mo

In [8]:
# Replacing the value Peoplesoft Resume with Peoplesoft
resume_dataset['job_role'] = resume_dataset['job_role'].replace('Peoplesoft Resume', 'Peoplesoft', regex=True)
resume_dataset['job_role'].value_counts()

Unnamed: 0_level_0,count
job_role,Unnamed: 1_level_1
React Developer,24
workday,21
Peoplesoft,20
SQL Developer,14


In [9]:
# Create a job_role count plot
fig = px.histogram(resume_dataset, x="job_role", color=resume_dataset['job_role'])

# Update the layout with labels and title
fig.update_layout(
    xaxis_title="Job Role",
    yaxis_title="No of Resumes",
    title="No of resumes according to Job Role"
)

fig.show()

## EDA

In [10]:
# Function to remove Emojies
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [11]:
# Function to clean the text of special and unwanted characters

def clean_text(text):
    cleaned_text = str(text)
    # Remove email-like addresses
    cleaned_text = re.sub(r'\S+@\S+', '', cleaned_text)
    # Remove links
    cleaned_text = re.sub(r'\S+\.com\S*', '', cleaned_text)
    # Remove URLS
    cleaned_text = re.sub(r'http\S+', '', cleaned_text)
    # Remove Emojis
    cleaned_text = deEmojify(cleaned_text)
    # Remove images
    cleaned_text = re.sub(r'\b\w+\.(png|jpg|jpeg)\b', '', text)
    # Removing the escape characters
    cleaned_text = re.sub(r'\\.', '', cleaned_text)
    # Removing bullets
    cleaned_text = re.sub(r' · ', '', cleaned_text)
    # Remove all the non-alpha symbols
    cleaned_text = re.sub(r'[^a-zA-Z]',' ',cleaned_text)
    # Replace multiple spaces with a single space
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Removing 2 character word
    cleaned_text = re.sub(r'\b([a-zA-Z])\1\b', "", cleaned_text)
    # Removing single character word
    cleaned_text = re.sub(r'\b[a-zA-Z]\b', "", cleaned_text)
    # Converting to the lowercase
    cleaned_text = cleaned_text.lower()

    return cleaned_text.strip()

In [12]:
# Checking if the clean_text() function performs accurately
test_cleaning = clean_text(resume_dataset['text'][4])
print(test_cleaning)

peoplesoft admin varkala vikas career objective  have total years experience in peoplesoft admin and peoplesoft dba  hope to enhance my skill set while adding value to the business to enable implementation of solutions which aid the company  objectives understanding and anticipating the needs interests and motivations of the clients and to deliver on time budget and to quality delivering value through improving agility quality and reliability professional summary having years of experience in peoplesoft implementation support people tools upgrades configuration migrations maintenance and administration of application server domains process scheduler servers web server domains pum and elastic search involved in various tools and application upgrades experience in driving infrastructure hardware upgrades disaster recovery activities configured https and secure web server ssl administration monitor system by developing and maintaining monitoring shell scripts experience in oracle database

In [13]:
# Applying clean_text() function to the Raw text and saving it to the new column
resume_dataset['Cleaned_data'] = resume_dataset['text'].apply(clean_text)
resume_dataset

Unnamed: 0,job_role,text,Cleaned_data
0,Peoplesoft,Rajab B ...,rajab peoplesoft technical consultant rajab g...
1,Peoplesoft,Rahul Ahuja\n---------------------------------...,rahul ahuja summary years of experience in peo...
2,Peoplesoft,Anubhav Kumar Singh\t\t\nCore Competencies:\...,anubhav kumar singh core competencies scriptin...
3,Peoplesoft,Arun Venu\n EXPERIENCE SUMMARY\n\n· Experienc...,arun venu experience summary experience of yea...
4,Peoplesoft,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,peoplesoft admin varkala vikas career objectiv...
...,...,...,...
74,workday,\n ...,himaja career objective to work towards achiev...
75,workday,Microsoft Word - ajay del\n\n\n\nJYOTI VERMA\t...,microsoft word ajay del jyoti verma profession...
76,workday,Workday HCM Techno f...,workday hcm techno functional consultant rahul...
77,workday,Chinna Subbarayudu M\nDOB: 06th March 1994\nNa...,chinna subbarayudu dob th march nationality i...


In [14]:
# Count the number of words in tthe text

def count_words_basic(text):
    # Split the text into words using whitespace as delimiter
    words = text.split()
    return len(words)

In [15]:
# Apply the count_words_basic function to the DataFrame
resume_dataset['No_of_words'] = resume_dataset['Cleaned_data'].apply(count_words_basic)
resume_dataset

Unnamed: 0,job_role,text,Cleaned_data,No_of_words
0,Peoplesoft,Rajab B ...,rajab peoplesoft technical consultant rajab g...,340
1,Peoplesoft,Rahul Ahuja\n---------------------------------...,rahul ahuja summary years of experience in peo...,1298
2,Peoplesoft,Anubhav Kumar Singh\t\t\nCore Competencies:\...,anubhav kumar singh core competencies scriptin...,938
3,Peoplesoft,Arun Venu\n EXPERIENCE SUMMARY\n\n· Experienc...,arun venu experience summary experience of yea...,981
4,Peoplesoft,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,peoplesoft admin varkala vikas career objectiv...,981
...,...,...,...,...
74,workday,\n ...,himaja career objective to work towards achiev...,662
75,workday,Microsoft Word - ajay del\n\n\n\nJYOTI VERMA\t...,microsoft word ajay del jyoti verma profession...,561
76,workday,Workday HCM Techno f...,workday hcm techno functional consultant rahul...,1284
77,workday,Chinna Subbarayudu M\nDOB: 06th March 1994\nNa...,chinna subbarayudu dob th march nationality i...,853


In [16]:
# Load the SpaCy language model
nlp = spacy.load("en_core_web_sm")

# Function to do text normalization
def lemmatization(text):
    # Process the text
    doc = nlp(text)
    # Extract and lemmatize tokens
    lemmatized_tokens = [token.lemma_.strip() for token in doc]
    return ' '.join(lemmatized_tokens)

In [17]:
# Apply the lemmatization function to the DataFrame
resume_dataset['Cleaned_data'] = resume_dataset['Cleaned_data'].apply(lemmatization)

# Apply the count_words_basic function to the DataFrame
resume_dataset['No_of_words'] = resume_dataset['Cleaned_data'].apply(count_words_basic)
resume_dataset

Unnamed: 0,job_role,text,Cleaned_data,No_of_words
0,Peoplesoft,Rajab B ...,rajab peoplesoft technical consultant rajab g...,340
1,Peoplesoft,Rahul Ahuja\n---------------------------------...,rahul ahuja summary year of experience in peop...,1298
2,Peoplesoft,Anubhav Kumar Singh\t\t\nCore Competencies:\...,anubhav kumar singh core competency script she...,939
3,Peoplesoft,Arun Venu\n EXPERIENCE SUMMARY\n\n· Experienc...,arun venu experience summary experience of yea...,981
4,Peoplesoft,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,peoplesoft admin varkala vikas career objectiv...,981
...,...,...,...,...
74,workday,\n ...,himaja career objective to work towards achiev...,662
75,workday,Microsoft Word - ajay del\n\n\n\nJYOTI VERMA\t...,microsoft word ajay del jyoti verma profession...,561
76,workday,Workday HCM Techno f...,workday hcm techno functional consultant rahul...,1284
77,workday,Chinna Subbarayudu M\nDOB: 06th March 1994\nNa...,chinna subbarayudu dob th march nationality i...,853


In [18]:
# download additional data resources for nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
# Function removes stopwords from the text
def remove_stopwords(text):
    # Define stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) # Tokenize the text
    filtered_text = [word.strip() for word in word_tokens if word.lower() not in stop_words] # Filter out stopwords

    return ' '.join(filtered_text)

In [20]:
# Apply the remove_stopwords function to the DataFrame
resume_dataset['Cleaned_data'] = resume_dataset['Cleaned_data'].apply(remove_stopwords)
resume_dataset

# Apply the count_words_basic function to the DataFrame
resume_dataset['No_of_words'] = resume_dataset['Cleaned_data'].apply(count_words_basic)
resume_dataset

Unnamed: 0,job_role,text,Cleaned_data,No_of_words
0,Peoplesoft,Rajab B ...,rajab peoplesoft technical consultant rajab gm...,262
1,Peoplesoft,Rahul Ahuja\n---------------------------------...,rahul ahuja summary year experience peoplesoft...,994
2,Peoplesoft,Anubhav Kumar Singh\t\t\nCore Competencies:\...,anubhav kumar singh core competency script she...,747
3,Peoplesoft,Arun Venu\n EXPERIENCE SUMMARY\n\n· Experienc...,arun venu experience summary experience year m...,788
4,Peoplesoft,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,peoplesoft admin varkala vikas career objectiv...,754
...,...,...,...,...
74,workday,\n ...,himaja career objective work towards achieve g...,496
75,workday,Microsoft Word - ajay del\n\n\n\nJYOTI VERMA\t...,microsoft word ajay del jyoti verma profession...,446
76,workday,Workday HCM Techno f...,workday hcm techno functional consultant rahul...,935
77,workday,Chinna Subbarayudu M\nDOB: 06th March 1994\nNa...,chinna subbarayudu dob th march nationality in...,632


In [21]:
# Removing the follwing NER from the dataset

ner_categories =  ['PERSON','GPE','LOC','NORP', 'FAC','PRODUCT','EVENT','WORK_OF_ART','DATE','TIME','LANGUAGE','MONEY']   # https://dataknowsall.com/blog/ner.html
def remove_NER_categories(text):
    doc = nlp(text)
    # Identify named entities and remove names
    cleaned_text = " ".join([token.text for token in doc if not token.ent_type_ in ner_categories])
    return cleaned_text

In [22]:
# Apply the remove_names function to the DataFrame
resume_dataset['Cleaned_data'] = resume_dataset['Cleaned_data'].apply(remove_NER_categories)
resume_dataset

# Apply the count_words_basic function to the DataFrame
resume_dataset['No_of_words'] = resume_dataset['Cleaned_data'].apply(count_words_basic)
resume_dataset

Unnamed: 0,job_role,text,Cleaned_data,No_of_words
0,Peoplesoft,Rajab B ...,rajab peoplesoft technical consultant rajab gm...,259
1,Peoplesoft,Rahul Ahuja\n---------------------------------...,rahul ahuja summary experience peoplesoft tech...,969
2,Peoplesoft,Anubhav Kumar Singh\t\t\nCore Competencies:\...,core competency script shell scripting applica...,734
3,Peoplesoft,Arun Venu\n EXPERIENCE SUMMARY\n\n· Experienc...,arun venu experience summary experience multip...,762
4,Peoplesoft,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,peoplesoft admin varkala vikas career objectiv...,728
...,...,...,...,...
74,workday,\n ...,himaja career objective work towards achieve g...,481
75,workday,Microsoft Word - ajay del\n\n\n\nJYOTI VERMA\t...,microsoft word ajay professional summary exper...,422
76,workday,Workday HCM Techno f...,workday hcm techno functional consultant rahul...,907
77,workday,Chinna Subbarayudu M\nDOB: 06th March 1994\nNa...,chinna subbarayudu dob th nationality profile ...,598


In [23]:
# function to Use regular expressions to find and remove words with less than 3 characters

def remove_short_words(text):

    cleaned_text = re.sub(r'\b\w{1,2}\b', '', text)

    # Remove any extra spaces created by the removal
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text


In [24]:
# Apply the remove_short_words function to the DataFrame
resume_dataset['Cleaned_data'] = resume_dataset['Cleaned_data'].apply(remove_short_words)
resume_dataset

# Apply the count_words_basic function to the DataFrame
resume_dataset['No_of_words'] = resume_dataset['Cleaned_data'].apply(count_words_basic)
resume_dataset

Unnamed: 0,job_role,text,Cleaned_data,No_of_words
0,Peoplesoft,Rajab B ...,rajab peoplesoft technical consultant rajab gm...,252
1,Peoplesoft,Rahul Ahuja\n---------------------------------...,rahul ahuja summary experience peoplesoft tech...,953
2,Peoplesoft,Anubhav Kumar Singh\t\t\nCore Competencies:\...,core competency script shell scripting applica...,726
3,Peoplesoft,Arun Venu\n EXPERIENCE SUMMARY\n\n· Experienc...,arun venu experience summary experience multip...,729
4,Peoplesoft,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,peoplesoft admin varkala vikas career objectiv...,719
...,...,...,...,...
74,workday,\n ...,himaja career objective work towards achieve g...,475
75,workday,Microsoft Word - ajay del\n\n\n\nJYOTI VERMA\t...,microsoft word ajay professional summary exper...,415
76,workday,Workday HCM Techno f...,workday hcm techno functional consultant rahul...,883
77,workday,Chinna Subbarayudu M\nDOB: 06th March 1994\nNa...,chinna subbarayudu dob nationality profile sum...,589


In [25]:
# Combine all words into a single list
complete_text = " ".join(text for text in resume_dataset.Cleaned_data)
all_words = complete_text.split(" ")

# Count Word Frequencies
word_freq = Counter(all_words)

# Convert to DataFrame for plotting
word_counts_df = pd.DataFrame(word_freq.items(), columns=['Word', 'Frequency']).sort_values(by='Frequency', ascending=False)


# Step 5: Create the Plot
fig = px.bar(word_counts_df[0:20], x='Word', y='Frequency', title='Word Frequency Count on entire dataset',
             labels={'Word': 'Word', 'Frequency': 'Frequency'},
             color='Frequency', color_continuous_scale='Viridis')

# Show the plot
fig.show()


In [26]:
# Plotting the word frequency according to the jon role

for each_role in resume_dataset['job_role'].unique().tolist():
    complete_text = " ".join(text for text in resume_dataset.Cleaned_data[resume_dataset['job_role'] == each_role])
    all_words = complete_text.split(" ")
    word_freq = Counter(all_words)
    word_counts_df = pd.DataFrame(word_freq.items(), columns=['Word', 'Frequency']).sort_values(by='Frequency', ascending=False)
    fig = px.bar(word_counts_df[0:20], x='Word', y='Frequency', title='Word Frequency Count on ' + each_role,
                labels={'Word': 'Word', 'Frequency': 'Frequency'},
                color='Frequency', color_continuous_scale='Viridis')
    fig.show()

In [27]:
# Combine all words into a single list
complete_text = " ".join(text for text in resume_dataset.Cleaned_data)

# Generate the word cloud
wordcloud = WordCloud(width=1200, height=600, background_color='white').generate(complete_text)

# Convert the word cloud to a NumPy array
wordcloud_image = wordcloud.to_array()

# Display the word cloud using Plotly
fig = px.imshow(wordcloud_image)
fig.update_layout(title="Word Cloud",
                  xaxis=dict(showgrid=False, showticklabels=False),
                  yaxis=dict(showgrid=False, showticklabels=False))

# Increase the size of the
fig.update_layout(width=1200, height=600)

# Show the plot
fig.show()


In [28]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(resume_dataset['Cleaned_data'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df['job_role'] = resume_dataset['job_role']
# Display the TF-IDF features
tfidf_df

Unnamed: 0,abdul,ability,able,abreast,abs,absence,absent,abstract,abstraction,academic,...,yoga,yogi,yonder,young,yrs,zenefit,zero,zerozilla,zone,job_role
0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,Peoplesoft
1,0.0,0.020903,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,Peoplesoft
2,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,Peoplesoft
3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,Peoplesoft
4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,Peoplesoft
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,0.0,0.022644,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,workday
75,0.0,0.022883,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,workday
76,0.0,0.013785,0.0,0.0,0.0,0.125045,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,workday
77,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.049372,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,workday


In [29]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Perform label encoding on the 'Fruit' column
tfidf_df['job_role'] = label_encoder.fit_transform(tfidf_df['job_role'] )

tfidf_df


Unnamed: 0,abdul,ability,able,abreast,abs,absence,absent,abstract,abstraction,academic,...,yoga,yogi,yonder,young,yrs,zenefit,zero,zerozilla,zone,job_role
0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
1,0.0,0.020903,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
2,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,0.0,0.022644,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,3
75,0.0,0.022883,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,3
76,0.0,0.013785,0.0,0.0,0.0,0.125045,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,3
77,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.049372,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,3


In [30]:
# Display the mapping of labels
label_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}
print("Label Mapping:", label_mapping)

Label Mapping: {0: 'Peoplesoft', 1: 'React Developer', 2: 'SQL Developer', 3: 'workday'}


In [31]:
print(tfidf_df.columns.tolist())

['abdul', 'ability', 'able', 'abreast', 'abs', 'absence', 'absent', 'abstract', 'abstraction', 'academic', 'accenture', 'accept', 'acceptable', 'acceptance', 'access', 'accessibility', 'accessible', 'accessory', 'acclimatize', 'accommodate', 'accomplish', 'accomplished', 'accord', 'account', 'accountant', 'accounting', 'accuracy', 'achieve', 'achievement', 'acknowledge', 'acknowledgement', 'acquaint', 'acquire', 'acquisition', 'acs', 'act', 'action', 'actionable', 'activate', 'active', 'actively', 'activity', 'adapt', 'adaptability', 'adaptive', 'add', 'additional', 'additionally', 'additive', 'addm', 'address', 'adept', 'adhere', 'adhoc', 'aditya', 'admin', 'adminission', 'administer', 'administrate', 'administration', 'administrative', 'administrator', 'admission', 'ado', 'adobe', 'adopt', 'adp', 'adult', 'advance', 'advanced', 'advancement', 'advantage', 'advice', 'aeroplane', 'aetna', 'affect', 'affigent', 'affiliate', 'affirm', 'aforementione', 'age', 'agent', 'aggregate', 'aggreg

#applying model

In [32]:
!pip install scikit-learn --upgrade
from sklearn.tree import DecisionTreeClassifier, export_text



In [33]:
(X_train, X_test, y_train, y_test) = train_test_split(tfidf_df.drop('job_role', axis=1), tfidf_df['job_role'], test_size=0.2, random_state=42)
DT_model = DecisionTreeClassifier(random_state=42,max_depth=4)
DT_model.fit(X_train, y_train)

In [34]:
DT_model_pred=DT_model.predict(X_test)

In [35]:
#checking accuracy on train & test data
print("the Decision tree model accuracy on traindata",DT_model.score(X_train,y_train))
print("the Decision tree model accuracy on testdata",DT_model.score(X_test,y_test))

the Decision tree model accuracy on traindata 1.0
the Decision tree model accuracy on testdata 0.9375


In [36]:
confusion_matrix(y_test,DT_model_pred)

array([[5, 0, 0, 0],
       [0, 6, 0, 0],
       [0, 1, 1, 0],
       [0, 0, 0, 3]])

In [37]:
print(classification_report(y_test,DT_model_pred , target_names=['peoplesoft','React Developer','Sql Developer','Workday']))

                 precision    recall  f1-score   support

     peoplesoft       1.00      1.00      1.00         5
React Developer       0.86      1.00      0.92         6
  Sql Developer       1.00      0.50      0.67         2
        Workday       1.00      1.00      1.00         3

       accuracy                           0.94        16
      macro avg       0.96      0.88      0.90        16
   weighted avg       0.95      0.94      0.93        16

