<a href="https://colab.research.google.com/github/Sathvik-P/ResumeScanner/blob/main/ResumeScanner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Fetch NLTK's "popular" resource bundle; presumably this supplies the tokenizer,
# stop-word and WordNet data used further down — TODO confirm.
nltk.download("popular")

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import string
from wordcloud import WordCloud

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import re

In [None]:
# Load the resume dataset from a CSV given by a relative path and preview the first rows.
df = pd.read_csv('UpdatedResumeDataSet.csv')
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [None]:
# The rows appear to be grouped by category rather than shuffled, so count how many
# resumes each category has to get a better picture of the class balance.
category = df['Category'].value_counts().reset_index()
category

In [None]:
# There also seem to be some stray symbols in the resume text, so let's clean it up a bit.
def cleanResume(resume):
  """Return `resume` with links, hashtags, mentions, punctuation and non-ASCII removed.

  Every removed span is replaced by a space and runs of whitespace are then
  collapsed to a single space. The result is not stripped, so it may start
  or end with one space.

  Args:
    resume: Raw resume text (str).

  Returns:
    The cleaned text (str).
  """
  resume = re.sub(r'http\S+\s*', ' ', resume)  # removes links
  # Only drop RT / cc when they are standalone tokens; a bare 'RT|cc' pattern
  # also cuts the "cc" out of words such as "account" or "success".
  resume = re.sub(r'\b(?:RT|cc)\b', ' ', resume)
  resume = re.sub(r'#\S+', ' ', resume)  # removes hashtags
  resume = re.sub(r'@\S+', ' ', resume)  # removes mentions
  resume = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resume)  # removes punctuation
  resume = re.sub(r'[^\x00-\x7f]', ' ', resume)  # removes non-ASCII characters
  resume = re.sub(r'\s+', ' ', resume)  # collapses runs of whitespace
  return resume

In [None]:
# Store the cleaned text alongside the raw resume so both stay available.
df['clean'] = df['Resume'].apply(cleanResume)
df.head()

Unnamed: 0,Category,Resume,clean
0,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B E UIT...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Haryan...


In [None]:
# Concatenate all cleaned resumes into one string for corpus-level token statistics.
# Joining with a space keeps the last word of one resume from fusing with the first
# word of the next, and avoids repeated string concatenation and label-based row lookups.
corpus = " ".join(df["clean"])

In [None]:
# Split the combined corpus into word-level tokens with NLTK's word tokenizer.
tokens = word_tokenize(corpus)

In [None]:
# Lowercase every token so that differently-cased spellings count as the same word.
words = [token.lower() for token in tokens]
words[:5]

['skills', 'programming', 'languages', 'python', 'pandas']

In [None]:
# now time to encode the data
label = LabelEncoder()
df['new_category'] = label.fit_transform(df['Category'])
df.head()

Unnamed: 0,Category,Resume,clean,new_category
0,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...,6
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B E UIT...,6
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...,6
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...,6
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Haryan...,6


In [None]:
# Turn the cleaned resumes into TF-IDF input vectors and pull out the encoded targets.
# NOTE(review): the vectorizer is fitted on every row before the train/test split below,
# so the held-out rows also contribute to the vocabulary and IDF weights.
target = df['new_category'].values
text = df['clean'].values
word_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=1500)
WordFeatures = word_vectorizer.fit_transform(text)

In [None]:
# Dimensions of the TF-IDF matrix: one row per resume, one column per retained term.
WordFeatures.shape

(962, 1500)

In [None]:
# Print the string form of the TF-IDF matrix to eyeball its entries.
print(WordFeatures)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    WordFeatures,
    target,
    test_size=0.2,
    random_state=24,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((769, 1500), (193, 1500), (769,), (193,))

In [None]:
# Wrap a default k-nearest-neighbours classifier in a one-vs-rest scheme and fit it
# on the training split.
knn_estimator = KNeighborsClassifier()
model = OneVsRestClassifier(estimator=knn_estimator)
model.fit(x_train, y_train)

OneVsRestClassifier(estimator=KNeighborsClassifier())

In [None]:
# Predict encoded category labels for the held-out test features.
y_pred = model.predict(x_test)

In [None]:
# Accuracy on the data the model was fitted on versus the held-out split.
# The values are formatted with ':.2f' rather than by calling .round(2) on them:
# score() may return a plain Python float, which has no .round() method.
train_accuracy = model.score(x_train, y_train) * 100
validation_accuracy = model.score(x_test, y_test) * 100
print(f'---------------------------------\n| Training Accuracy   :- {train_accuracy:.2f}% |')
print(f'---------------------------------\n| Validation Accuracy :- {validation_accuracy:.2f}% |\n---------------------------------')

---------------------------------
| Training Accuracy   :- 98.96% |
---------------------------------
| Validation Accuracy :- 96.89% |
---------------------------------


In [None]:
# Per-class precision / recall / F1 on the held-out split: true labels first, then the
# model's predictions. Comparing y_test with itself would always report a perfect 1.00.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       1.00      1.00      1.00         7
           7       1.00      1.00      1.00         6
           8       1.00      1.00      1.00        10
           9       1.00      1.00      1.00         6
          10       1.00      1.00      1.00        10
          11       1.00      1.00      1.00         8
          12       1.00      1.00      1.00         6
          13       1.00      1.00      1.00         6
          14       1.00      1.00      1.00         8
          15       1.00      1.00      1.00        16
          16       1.00      1.00      1.00         7
          17       1.00    

In [None]:
# Remove English stop words from the lowercased token list.
# NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords` reader imported
# at the top of the notebook; the name is kept because later cells may rely on the list.
stopwords = nltk.corpus.stopwords.words('english')
# A set gives constant-time membership tests; scanning the list once per token is
# needlessly slow on a corpus this size.
stopword_set = set(stopwords)
new_words = [word for word in words if word not in stopword_set]

In [None]:
# Preview the first five tokens that remain after stop-word removal.
new_words[:5]

['skills', 'programming', 'languages', 'python', 'pandas']

In [None]:
# Reduce every remaining token to its WordNet lemma so inflected forms are counted together.
wn = WordNetLemmatizer()
lem_words = [wn.lemmatize(token) for token in new_words]

In [None]:
# Preview the first five lemmatized tokens.
lem_words[:5]

['skill', 'programming', 'language', 'python', 'panda']

In [None]:
# Count how often each lemma occurs across the whole corpus.
freq_dist = nltk.FreqDist(lem_words)
freq_dist

FreqDist({'project': 4071, 'exprience': 3829, 'company': 3635, 'month': 3344, 'detail': 3132, 'description': 3122, 'team': 2159, 'data': 2138, '1': 2134, 'management': 2024, ...})