In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the resume dataset from the CSV file next to the notebook and preview the first rows.
df=pd.read_csv('UpdatedResumeDataSet.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of resumes per category.
df['Category'].value_counts()

In [None]:
# Share of each category as a percentage of all resumes, rounded to two decimals.
df['Category'].value_counts(normalize=True).mul(100).round(2)

# Visual Representation

In [None]:
# Bar chart of resumes per category; labels are rotated so the long names stay legible.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=df, x='Category', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# value_counts() orders categories by frequency, whereas unique() returns them in
# order of first appearance. Taking both the wedge sizes and the labels from the
# same Series guarantees every slice carries its own category name.
category_counts = df['Category'].value_counts()

plt.figure(figsize=[17,12])
plt.pie(category_counts, labels=category_counts.index, autopct='%1.2f%%')
plt.title('Pie Chart for Category')
plt.show()

# Data preprocessing

In [None]:
import re

In [None]:
# Inspect the resume stored under index label 0 before any cleaning is applied.
df['Resume'][0]

Clean the raw resume text: strip URLs, mentions, hashtags, punctuation and non-ASCII characters so that only clear, meaningful text remains.

In [None]:
def resumeclean(x):
    """Return resume text with URLs, mentions, hashtags and punctuation removed.

    The steps run in a fixed order, each on the output of the previous one:
    URLs, @-mentions, #-hashtags, the literal tokens "RT"/"cc", ASCII
    punctuation, non-ASCII characters, and finally runs of whitespace.

    Args:
        x: Raw resume text (str).

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space but not stripped.
    """
    # All patterns are raw strings so that the regex escapes are not treated as
    # (invalid) Python string escapes.
    # A URL or hashtag is only removed when followed by whitespace, so one at
    # the very end of the text survives this step.
    CleanResume = re.sub(r'http\S+\s', '', x)
    CleanResume = re.sub(r'@\S+', '', CleanResume)
    CleanResume = re.sub(r'#\S+\s', '', CleanResume)
    # Not word-bounded: "cc" inside a word is replaced too ("account" -> "a ount").
    CleanResume = re.sub(r'RT|cc', ' ', CleanResume)
    # Every ASCII punctuation character becomes a space.
    CleanResume = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', CleanResume)
    # Anything outside the 7-bit ASCII range becomes a space.
    CleanResume = re.sub(r'[^\x00-\x7f]', ' ', CleanResume)
    CleanResume = re.sub(r'\s+', ' ', CleanResume)

    return CleanResume

In [None]:
# Replace every raw resume with its cleaned version (the column is overwritten in place).
df['Resume'] = df['Resume'].apply(resumeclean)

In [None]:
# Same resume as inspected earlier, now after resumeclean has been applied.
df['Resume'][0]

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the category names as integer labels, overwriting the original column.
# NOTE(review): because the column is overwritten, re-running this cell would fit the
# encoder on the integer codes instead of the names — reload df before re-running.
le=LabelEncoder()
df['Category']=le.fit_transform(df['Category'])

In [None]:
# Resumes per category, now keyed by the encoded integer label.
df['Category'].value_counts()

In [None]:
# Distinct encoded labels present in the column.
df['Category'].unique()

# Vectorisation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
tfidf=TfidfVectorizer(stop_words='english')

# Learn the vocabulary from every cleaned resume and turn each one into a TF-IDF vector.
# NOTE(review): this fit happens before the train/test split below, so the vocabulary
# and IDF weights are learned from all rows, including those later held out for testing.
vectered_text = tfidf.fit_transform(df['Resume'])

In [None]:
# First row of the TF-IDF output, i.e. the vector built from the first resume.
vectered_text[0]

# Training

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the cleaned text first and only then fit TF-IDF on the training portion, so
# the vocabulary and IDF weights are never learned from the held-out resumes.
# With the same random_state the rows assigned to train/test are unchanged.
text_train, text_test, ytrain, ytest = train_test_split(
    df['Resume'], df['Category'], test_size=0.3, random_state=42
)

# `tfidf` is refitted here on purpose: the fitted vectorizer is what later cells
# persist and reuse, so it has to match the features the models are trained on.
xtrain = tfidf.fit_transform(text_train)
xtest = tfidf.transform(text_test)

In [None]:
# (samples, features) of the training matrix.
xtrain.shape

In [None]:
# (samples, features) of the test matrix.
xtest.shape

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest-neighbours with default settings. fit() hands back the fitted
# estimator, so construction and training are chained.
KNN = KNeighborsClassifier().fit(xtrain, ytrain)

# Encoded category predicted for every test resume.
ypred = KNN.predict(xtest)

In [None]:
# Encoded categories the KNN baseline predicted for the test split.
print(ypred)

In [None]:
# Fraction of test resumes the KNN baseline classified correctly.
print(accuracy_score(ytest,ypred))

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
import seaborn as sns

# Apply one plotting style for the rest of the notebook.
plt.style.use("fivethirtyeight")

# libraries for data preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# libraries for model fitting
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# libraries for metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report


# libraries for optimization and interpretation
import shap

# Silence library warnings for the remaining cells; this hides deprecation notices
# as well, so disable it when debugging.
import warnings
warnings.filterwarnings("ignore")
print("All libraries are imported")

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by display name.
clfs={'logreg':LogisticRegression(),
    'KNN':KNeighborsClassifier(),
    'Decision Tree':DecisionTreeClassifier(),
    'Random Forest':RandomForestClassifier(),
    'Adaboost':AdaBoostClassifier(),
    'Gradient Boosting':GradientBoostingClassifier(),
    'XGBoost':XGBClassifier(),
    'OneVsRest':OneVsRestClassifier(KNeighborsClassifier()),
    'SVM':SVC()}

# Collect one record per model and build the frame once at the end:
# DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
report_rows = []

for clf_name, clf in clfs.items():
    # Announce the model before training so progress is visible during slow fits.
    print('fitting classifier....',clf_name)
    clf.fit(xtrain,ytrain)
    y_pred=clf.predict(xtest)
    report_rows.append({'Model_name':clf_name,
                        'Accuracy':accuracy_score(ytest,y_pred)})

# Best-performing model first.
models_report=pd.DataFrame(report_rows, columns=['Model_name','Accuracy'])
models_report=models_report.sort_values(by='Accuracy',ascending=False)

In [None]:
# Accuracy of every candidate model, highest first.
models_report

In [None]:
%%time
# Train the model that is persisted and used for prediction further down.
# Despite the name `rf`, this is an XGBoost classifier; the name is what the later
# cells and the pickle file refer to.
rf = XGBClassifier()
rf.fit(xtrain,ytrain)
y_pred = rf.predict(xtest)

In [None]:
# Test accuracy for the predictions currently held in y_pred.
print(accuracy_score(ytest,y_pred))

# Prediction of my resume

In [None]:
import pickle

# Persist the fitted vectorizer and classifier for reuse outside the notebook.
# Context managers make sure each file is flushed and closed once written.
with open('tfidf', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

with open('rf', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
!pip install PyPDF2

In [None]:
import PyPDF2
from tkinter import Tk
from tkinter.filedialog import askopenfilename

def read_pdf(file_path):
    """Return the text of every page of the PDF at `file_path` as one string.

    Pages are read in document order and their extracted text is concatenated
    without any separator.
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extraction has to happen while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_texts)

# Create a prompt to select a PDF file. The root window is only needed to host the
# dialog, so it is hidden and destroyed afterwards instead of lingering per run.
dialog_root = Tk()
dialog_root.withdraw()  # Hide the main window
try:
    file_path = askopenfilename(title="Select a PDF file", filetypes=[("PDF files", "*.pdf")])
finally:
    dialog_root.destroy()

# Read and decode the PDF content
if file_path:
    pdf_content = read_pdf(file_path)
    print("PDF content as a single string:")
    print(pdf_content)
else:
    # Cancelling the dialog yields an empty path; pdf_content is left untouched.
    print("No file selected.")


In [None]:
# Text of the selected PDF. pdf_content is only assigned when a file was chosen in the
# previous cell, so on a fresh kernel this raises NameError if the dialog was cancelled.
resume= pdf_content

In [None]:
# Load the trained model back from disk. The pickle file is the one written by this
# notebook; never unpickle files from an untrusted source.
with open('rf', 'rb') as model_file:
    rf = pickle.load(model_file)

# Apply the same cleaning and vectorization that were used for training.
cleanedresume= resumeclean(resume)

input_features = tfidf.transform([cleanedresume])

# One-element array holding the encoded category predicted for the resume.
prediction_id = rf.predict(input_features)

In [None]:
# Raw output of rf.predict for the uploaded resume.
prediction_id

In [None]:

# Lookup table from the encoded category ID back to its human-readable name,
# listed in ascending ID order.
# NOTE(review): presumably mirrors the codes assigned by the LabelEncoder fitted
# earlier — verify against le.classes_ if the dataset changes.
category_mapping = {
    0: "Advocate",
    1: "Arts",
    2: "Automation Testing",
    3: "Blockchain",
    4: "Business Analyst",
    5: "Civil Engineer",
    6: "Data Science",
    7: "Database",
    8: "DevOps Engineer",
    9: "DotNet Developer",
    10: "ETL Developer",
    11: "Electrical Engineering",
    12: "HR",
    13: "Hadoop",
    14: "Health and fitness",
    15: "Java Developer",
    16: "Mechanical Engineer",
    17: "Network Security Engineer",
    18: "Operations Manager",
    19: "PMO",
    20: "Python Developer",
    21: "SAP Developer",
    22: "Sales",
    23: "Testing",
    24: "Web Designing",
}

In [None]:
# The model returns an array; its first entry is the ID for our single resume.
predicted_id = prediction_id[0]
category_name = category_mapping.get(predicted_id, "Unknown")

print("Predicted Category:", category_name)

# Creating the app in Streamlit


In [None]:
%%writefile resume_app.py
import streamlit as st
import pickle 
import re
import nltk
import PyPDF2
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# libraries for model fitting
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# librarie for metrics 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report


# Fetch NLTK resources at start-up.
# NOTE(review): nothing else in this script appears to use them — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')


def _unpickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load models: if either artifact cannot be read, report it in the UI and halt the script.
try:
    rf = _unpickle('rf')
except Exception as e:
    st.error(f"Error loading 'rf' model: {e}")
    st.stop()

try:
    tfidfd = _unpickle('tfidf')
except Exception as e:
    st.error(f"Error loading 'tfidf' model: {e}")
    st.stop()
    
def resumeclean(x):
    """Return resume text with URLs, mentions, hashtags and punctuation removed.

    The steps run in a fixed order, each on the output of the previous one:
    URLs, @-mentions, #-hashtags, the literal tokens "RT"/"cc", ASCII
    punctuation, non-ASCII characters, and finally runs of whitespace.

    Args:
        x: Raw resume text (str).

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space but not stripped.
    """
    # All patterns are raw strings so that the regex escapes are not treated as
    # (invalid) Python string escapes.
    # A URL or hashtag is only removed when followed by whitespace, so one at
    # the very end of the text survives this step.
    CleanResume = re.sub(r'http\S+\s', '', x)
    CleanResume = re.sub(r'@\S+', '', CleanResume)
    CleanResume = re.sub(r'#\S+\s', '', CleanResume)
    # Not word-bounded: "cc" inside a word is replaced too ("account" -> "a ount").
    CleanResume = re.sub(r'RT|cc', ' ', CleanResume)
    # Every ASCII punctuation character becomes a space.
    CleanResume = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', CleanResume)
    # Anything outside the 7-bit ASCII range becomes a space.
    CleanResume = re.sub(r'[^\x00-\x7f]', ' ', CleanResume)
    CleanResume = re.sub(r'\s+', ' ', CleanResume)

    return CleanResume

def _read_pdf_text(uploaded_file):
    """Concatenate the extracted text of every page of an uploaded PDF."""
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # Guard against pages for which no text comes back (e.g. image-only pages).
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def _read_docx_text(raw_bytes):
    """Extract paragraph text from the bytes of a .docx file using only the stdlib."""
    import io
    import zipfile
    import xml.etree.ElementTree as ET

    # A .docx is a zip archive; the body lives in word/document.xml, where
    # paragraphs are <w:p> elements and their text sits in <w:t> elements.
    word_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    with zipfile.ZipFile(io.BytesIO(raw_bytes)) as archive:
        document = ET.fromstring(archive.read("word/document.xml"))
    paragraphs = (
        "".join(node.text or "" for node in paragraph.iter(word_ns + "t"))
        for paragraph in document.iter(word_ns + "p")
    )
    return "\n".join(paragraphs)


def _extract_resume_text(uploaded_file):
    """Return the plain text of an uploaded resume, dispatching on its file type."""
    file_name = (uploaded_file.name or "").lower()
    if uploaded_file.type == "application/pdf" or file_name.endswith(".pdf"):
        return _read_pdf_text(uploaded_file)
    if file_name.endswith(".docx"):
        # .docx is binary; decoding it as UTF-8 text would fail.
        return _read_docx_text(uploaded_file.getvalue())
    # Plain text: replace undecodable bytes rather than crashing on other encodings.
    return uploaded_file.getvalue().decode("utf-8", errors="replace")


def main():
    """Render the app: accept a resume upload and show its predicted job category.

    Uses the module-level `rf` model, `tfidfd` vectorizer and `resumeclean`
    helper. Nothing is returned; all output goes to the Streamlit page.
    """
    st.title('Resume Screening App')
    st.markdown(
        "Welcome to the Resume Screening App! Upload your resume to predict the job category it best matches."
    )
    st.image("https://www.recruiterslineup.com/wp-content/uploads/2022/06/resume-screening-software.png", caption="Resume Screening", use_column_width=True)

    uploaded_file = st.file_uploader('Upload your resume', type=['txt', 'pdf', 'docx'])

    if not uploaded_file:
        st.warning("Please upload a valid file.")
        return

    st.success("Resume uploaded successfully!")

    try:
        text = _extract_resume_text(uploaded_file)
    except Exception as e:
        # Corrupt or unsupported files should produce a message, not a traceback.
        st.error(f"Could not read the uploaded file: {e}")
        return

    if not text.strip():
        # E.g. a scanned PDF: predicting on empty text would be meaningless.
        st.error("No readable text was found in the uploaded file.")
        return

    # Same cleaning + vectorization that the model was trained with.
    cleaned_resume = resumeclean(text)
    cleaned_resume = tfidfd.transform([cleaned_resume])
    prediction_id = rf.predict(cleaned_resume)[0]

    # Encoded category ID -> human-readable category name.
    category_mapping = {
        15: "Java Developer",
        23: "Testing",
        8: "DevOps Engineer",
        20: "Python Developer",
        24: "Web Designing",
        12: "HR",
        13: "Hadoop",
        3: "Blockchain",
        10: "ETL Developer",
        18: "Operations Manager",
        6: "Data Science",
        22: "Sales",
        16: "Mechanical Engineer",
        1: "Arts",
        7: "Database",
        11: "Electrical Engineering",
        14: "Health and fitness",
        19: "PMO",
        4: "Business Analyst",
        9: "DotNet Developer",
        2: "Automation Testing",
        17: "Network Security Engineer",
        21: "SAP Developer",
        5: "Civil Engineer",
        0: "Advocate"
    }

    category_name = category_mapping.get(prediction_id, "Unknown")

    st.subheader("Prediction Result:")
    st.success(f"The predicted category is: {category_name}")

# Script entry point: build the page when this file is executed as the main module.
if __name__ == '__main__':
    main()
