In [22]:
from pyresparser import ResumeParser
import pandas as pd
import re
from ftfy import fix_text
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [23]:
# Widen pandas' console rendering so long/wide result frames are not truncated.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

In [24]:
# English stopwords held in a set so the per-token membership test is cheap.
stopw = set(stopwords.words('english'))

df = pd.read_csv('../../others/job_final.csv')

# Build the text that is later matched against the resume (column 'test').
# Missing descriptions are loaded as NaN; blank them first, otherwise str(NaN)
# would inject the literal token "nan" into the text as if it were real content.
# Tokens are kept only when longer than two characters and not a stopword
# (the stopword comparison is an exact, case-sensitive match).
df['test'] = df['Job_Description'].fillna('').apply(
    lambda description: ' '.join(
        token
        for token in str(description).split()
        if len(token) > 2 and token not in stopw
    )
)

In [39]:
def cleanResume(resumeText):
    """Strip noise from raw resume text and return the cleaned string.

    Steps, in order: remove URLs, remove the standalone tokens ``RT`` and
    ``cc``, remove hashtags, remove @-mentions, replace ASCII punctuation and
    non-ASCII characters with spaces, and finally collapse every run of
    whitespace into a single space.

    Args:
        resumeText: The resume contents as a ``str``.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Word boundaries matter here: without them "cc" is also cut out of
    # ordinary words such as "account" or "success".
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace

    return resumeText

In [25]:
def ngrams(string, n=3):
    """Normalise ``string`` and return its overlapping character n-grams.

    The text is passed through ``fix_text``, reduced to ASCII, lower-cased,
    stripped of bracket/quote/period/pipe characters, has ``&`` spelled out as
    ``and`` and commas/hyphens turned into spaces, is title-cased, has runs of
    spaces collapsed, and is padded with one space on each side before being
    cut into windows of ``n`` consecutive characters.

    Args:
        string: Text to tokenise.
        n: Window width in characters (default 3).

    Returns:
        A list of ``n``-character strings, one per window position. The list
        is empty when the normalised text is shorter than ``n``.
    """
    text = fix_text(string)  # fix text
    # Drop non-ASCII characters, then work in lower case.
    text = text.encode("ascii", errors="ignore").decode().lower()

    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    text = re.sub(rx, '', text)

    for old, new in (('&', 'and'), (',', ' '), ('-', ' ')):
        text = text.replace(old, new)

    # Normalise case: capital at the start of each word.
    text = text.title()
    # Collapse repeated spaces, then pad both ends so word edges form n-grams.
    text = ' ' + re.sub(' +', ' ', text).strip() + ' '
    # Inside the class, `,-.` is a range covering ',', '-' and '.'; the
    # alternative branch drops a whitespace character followed by "BD".
    text = re.sub(r'[,-./]|\sBD', r'', text)

    windows = zip(*[text[start:] for start in range(n)])
    return [''.join(window) for window in windows]

In [26]:
def getNearestN(vectorizer, nbrs, query):
    """Vectorise ``query`` and look up its nearest neighbours.

    Args:
        vectorizer: Object exposing ``transform(query)``.
        nbrs: Object exposing ``kneighbors(matrix)`` that yields a
            ``(distances, indices)`` pair.
        query: Iterable of documents handed to ``vectorizer.transform``.

    Returns:
        The ``(distances, indices)`` tuple produced by ``nbrs.kneighbors``.
    """
    query_matrix = vectorizer.transform(query)
    neighbour_distances, neighbour_indices = nbrs.kneighbors(query_matrix)
    return neighbour_distances, neighbour_indices

In [41]:
# Extract structured fields from the resume PDF.
data = ResumeParser("../resumes/Sarath_Resume.pdf").get_extracted_data()

# Either field may be missing or None when the parser finds nothing; fall back
# to empty values instead of failing on the string concatenation below.
resume = data.get('skills') or []
candidate_name = data.get('name') or ''

# One-document corpus: all skills followed by the extracted name.
skills = [' '.join(part for part in (' '.join(resume), candidate_name) if part)]
org_name_clean = skills
if not org_name_clean[0]:
    # Fail early with a readable message rather than inside the vectorizer.
    raise ValueError('No skills or name could be extracted from the resume')

print(org_name_clean)

# Fit TF-IDF on the resume only, using character n-grams from `ngrams`.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

tfidf = vectorizer.fit_transform(org_name_clean)
print('Vecorizing completed...')

# Unsupervised learner for implementing neighbor searches.
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
unique_org = (df['test'].values)

# Distance from every job description to the single resume vector.
distances, indices = getNearestN(vectorizer, nbrs, unique_org)
unique_org = list(unique_org)

# 'Match confidence' holds a distance, so smaller values mean closer matches.
matches = pd.DataFrame({'Match confidence': distances[:, 0].round(2)})

# Assign positionally: row k of `distances` belongs to row k of `df`, and
# index-aligned assignment would yield NaN if `df` had a non-default index.
df['match'] = matches['Match confidence'].values
df1 = df.sort_values('match')
df2 = df1[['Position', 'Company', 'Location']].head(10).reset_index()

print(df2)



['Sql Html5 Javascript Css Video Python Website Php Ui Ux Computer science R C++ C Ai Editing P Programming Computer Science']
Vecorizing completed...
   index                         Position                 Company      Location
0  1886   Lead Frontend Developer          Meaww                   Bengaluru   
1  532    Web Developer                    Netrovert Software       – Bengaluru
2  601    Web Designer                     VR CAREERZ               – Chennai  
3  1883   Full Stack Developer             Netrovert Software      Bengaluru   
4  1656   Front End Developer              Netrovert Software      Bengaluru   
5  1855   Senior UI Developer              Riversand Technologies  Bengaluru   
6  1401   Frontend Developer               Bengaluru               Bengaluru   
7  1384   Frontend Developer               Play Games24x7          Bengaluru   
8  1402   Front End Developer              Bengaluru               Bengaluru   
9  900    Data Scientist / Scala Engineer  IQLECT