In [1]:
import os
import re
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import Matcher
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report



In [38]:
# Load Spacy model for English language
nlp = spacy.load('en_core_web_sm')

# Define a function to extract skills from resume text


def extract_skills(resume_text):
    skills = []
    doc = nlp(resume_text.lower())
    matcher = Matcher(nlp.vocab)
    pattern = [{'POS': 'NOUN', 'OP': '+'}, {'POS': 'VERB', 'OP': '?'}, {'POS': 'ADJ', 'OP': '*'}]
    matcher.add('Skills', [pattern])
    matches = matcher(doc)
    for match_id, start, end in matches:
        skill = doc[start:end].text
        if len(skill) > 1 and not skill.isnumeric():
            skills.append(skill)
    return list(set(skills))

In [30]:
# Define a function to extract experience from resume text
def extract_experience(resume_text):
    doc = nlp(resume_text.lower())
    experience = ''
    for sent in doc.sents:
        if re.search(r"\b(?:experience|work|position|employment)\b", sent.text):
            experience += ' ' + sent.text
    return experience.strip()

In [31]:
# Define a function to preprocess resume text
def preprocess_resume(resume_text):
    doc = nlp(resume_text.lower())
    preprocessed_text = ''
    for token in doc:
        if token.pos_ == 'NOUN' and token.dep_ == 'compound':
            preprocessed_text += token.text + ' '
        elif token.pos_ == 'NOUN' and token.dep_ == 'amod':
            preprocessed_text += token.text + ' '
        elif token.pos_ == 'NOUN' and token.dep_ == 'nsubj':
            preprocessed_text += token.text + ' '
        elif token.pos_ == 'ADJ' and token.dep_ == 'amod':
            preprocessed_text += token.text + ' '
        elif token.pos_ == 'VERB' and token.dep_ == 'advmod':
            preprocessed_text += token.text + ' '
    return preprocessed_text.strip()

In [32]:
# Load resume dataset
dataset_path = '/content/UpdatedResumeDataSet.csv'
if not os.path.exists(dataset_path):
    !wget https://github.com/amanneox/csv-files/raw/master/resume_dataset.csv
df = pd.read_csv(dataset_path)
df = df.dropna(subset=['Category'])
df = df[['Resume', 'Category']]
df = df.rename(columns={'Resume': 'text', 'Category': 'domain'})

# Preprocess resume text
df['text'] = df['text'].apply(preprocess_resume)

In [40]:


# Extract skills and experience from resume text

df['skills'] = df['text'].apply(extract_skills)
df['experience'] = df['text'].apply(extract_experience)

# Split dataset into training and testing sets
X = df[['text', 'skills', 'experience']]
y = df['domain']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
text_clf = Pipeline([
('vect', CountVectorizer(ngram_range=(1, 2))),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [42]:
#Train the text classification model
text_clf.fit(X_train['text'], y_train)

#Predict the domain of resume text
y_pred = text_clf.predict(X_test['text'])
print(classification_report(y_test, y_pred))

                           precision    recall  f1-score   support

                 Advocate       0.00      0.00      0.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00        12
                   Hadoop       1.00      1.00      1.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
#Extract features from skills and experience columns
cv_skills = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, lowercase=False)
cv_exp = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, lowercase=False)
X_train_skills = cv_skills.fit_transform(X_train['skills'])
X_test_skills = cv_skills.transform(X_test['skills'])
X_train_exp = cv_exp.fit_transform(X_train['experience'])
X_test_exp = cv_exp.transform(X_test['experience'])





In [45]:
#Combine all features
X_train_combined = np.hstack((X_train_skills.toarray(), X_train_exp.toarray()))
X_test_combined = np.hstack((X_test_skills.toarray(), X_test_exp.toarray()))

In [46]:
#Define pipeline for skills and experience classification
skill_exp_clf = Pipeline([
('clf', MultinomialNB()),
])

#Train the skills and experience classification model
skill_exp_clf.fit(X_train_combined, y_train)

#Predict the domain of skills and experience
y_pred_skills_exp = skill_exp_clf.predict(X_test_combined)
print(classification_report(y_test, y_pred_skills_exp))

                           precision    recall  f1-score   support

                 Advocate       1.00      0.33      0.50         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      0.75      0.86         4
           Civil Engineer       1.00      0.22      0.36         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      0.62      0.77         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      0.60      0.75         5
            ETL Developer       1.00      0.71      0.83         7
   Electrical Engineering       1.00      0.67      0.80         6
                       HR       1.00      1.00      1.00        12
                   Hadoop       1.00      1.00      1.00     

In [49]:
#Predict the domain of a new resume
new_resume_text = 'Experienced software engineer with a passion for developing innovative software solutions. Skilled in Java, Python, and C++.'
new_resume_skills = extract_skills(new_resume_text)
new_resume_exp = extract_experience(new_resume_text)
new_resume_skills_vec = cv_skills.transform([new_resume_skills])
new_resume_exp_vec = cv_exp.transform([new_resume_exp])
new_resume_combined_vec = np.hstack((new_resume_skills_vec.toarray(), new_resume_exp_vec.toarray()))
new_resume_domain = skill_exp_clf.predict(new_resume_combined_vec)
print('Predicted domain:', new_resume_domain[0])

Predicted domain: Testing
