In [2]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\varsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
resume_df = pd.read_csv("../data/Resume.csv")
job_df = pd.read_csv("../data/DataScientist.csv")

resume_df.head()


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)


In [7]:
resume_df['clean_resume'] = resume_df.iloc[:, 0].apply(clean_text)
job_df['clean_jd'] = job_df.iloc[:, 0].apply(clean_text)


In [8]:
skills_list = [
    'python','sql','machine learning','deep learning',
    'nlp','data analysis','data visualization',
    'tensorflow','pandas','numpy','scikit learn'
]

def extract_skills(text):
    return list(set([skill for skill in skills_list if skill in text]))

resume_df['skills'] = resume_df['clean_resume'].apply(extract_skills)


In [9]:
resume_text_col = resume_df.columns[0]
job_text_col = job_df.columns[0]

resume_df['clean_resume'] = resume_df[resume_text_col].astype(str).apply(clean_text)
job_df['clean_jd'] = job_df[job_text_col].astype(str).apply(clean_text)



In [10]:
resume_df['clean_resume'].head(10)


0    
1    
2    
3    
4    
5    
6    
7    
8    
9    
Name: clean_resume, dtype: object

In [11]:
print("Empty resumes:", (resume_df['clean_resume'].str.strip() == "").sum())
print("Empty job descriptions:", (job_df['clean_jd'].str.strip() == "").sum())


Empty resumes: 2484
Empty job descriptions: 3909


In [12]:
resume_df = resume_df[resume_df['clean_resume'].str.strip() != ""]
job_df = job_df[job_df['clean_jd'].str.strip() != ""]


In [13]:
combined_text = resume_df['clean_resume'].tolist() + job_df['clean_jd'].tolist()


In [14]:
resume_df.head()



Unnamed: 0,ID,Resume_str,Resume_html,Category,clean_resume,skills


In [15]:
job_df.head()


Unnamed: 0.1,Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply,clean_jd


In [16]:
job_df['clean_jd'].head(10)


Series([], Name: clean_jd, dtype: object)

In [17]:
resume_df = resume_df[resume_df['clean_resume'].str.len() > 3]
job_df = job_df[job_df['clean_jd'].str.len() > 3]

resume_df.reset_index(drop=True, inplace=True)
job_df.reset_index(drop=True, inplace=True)


In [18]:
print("Resume shape:", resume_df.shape)
print("Job shape:", job_df.shape)


Resume shape: (0, 6)
Job shape: (0, 18)


In [19]:
print("Resume columns:", resume_df.columns)
print("Job columns:", job_df.columns)


Resume columns: Index(['ID', 'Resume_str', 'Resume_html', 'Category', 'clean_resume',
       'skills'],
      dtype='object')
Job columns: Index(['Unnamed: 0', 'index', 'Job Title', 'Salary Estimate',
       'Job Description', 'Rating', 'Company Name', 'Location', 'Headquarters',
       'Size', 'Founded', 'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'Competitors', 'Easy Apply', 'clean_jd'],
      dtype='object')


In [20]:
# CHANGE column names if needed
resume_text_column = resume_df.columns[-1]
job_text_column = job_df.columns[-1]

resume_df['clean_resume'] = resume_df[resume_text_column].astype(str).apply(clean_text)
job_df['clean_jd'] = job_df[job_text_column].astype(str).apply(clean_text)


In [21]:
resume_df['clean_resume'].head(5)


Series([], Name: clean_resume, dtype: object)

In [22]:
job_df['clean_jd'].head(5)


Series([], Name: clean_jd, dtype: object)

In [23]:
job_df.head()


Unnamed: 0.1,Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply,clean_jd


In [24]:
job_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         0 non-null      int64  
 1   index              0 non-null      int64  
 2   Job Title          0 non-null      object 
 3   Salary Estimate    0 non-null      object 
 4   Job Description    0 non-null      object 
 5   Rating             0 non-null      float64
 6   Company Name       0 non-null      object 
 7   Location           0 non-null      object 
 8   Headquarters       0 non-null      object 
 9   Size               0 non-null      object 
 10  Founded            0 non-null      int64  
 11  Type of ownership  0 non-null      object 
 12  Industry           0 non-null      object 
 13  Sector             0 non-null      object 
 14  Revenue            0 non-null      object 
 15  Competitors        0 non-null      object 
 16  Easy Apply         0 non-null      obj

In [25]:
print(job_df.columns)


Index(['Unnamed: 0', 'index', 'Job Title', 'Salary Estimate',
       'Job Description', 'Rating', 'Company Name', 'Location', 'Headquarters',
       'Size', 'Founded', 'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'Competitors', 'Easy Apply', 'clean_jd'],
      dtype='object')


In [26]:
job_df.drop(columns=['clean_jd'], inplace=True)


In [27]:
job_df['clean_jd'] = job_df['Job Description'].astype(str).apply(clean_text)


In [28]:
job_df[['Job Description', 'clean_jd']].head(3)


Unnamed: 0,Job Description,clean_jd


In [29]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isna(text):
        return ""
    
    text = text.lower()
    text = re.sub(r'\n', ' ', text)          # keep line breaks safe
    text = re.sub(r'[^a-zA-Z ]', ' ', text)  # remove symbols
    text = re.sub(r'\s+', ' ', text)         # normalize spaces
    
    words = text.split()
    
    # ⚠️ IMPORTANT: do NOT remove all stopwords
    words = [
        lemmatizer.lemmatize(w)
        for w in words
        if len(w) > 2
    ]
    
    return " ".join(words)


In [30]:
job_df['clean_jd'] = job_df['Job Description'].apply(clean_text)


In [32]:
job_df[['Job Description', 'clean_jd']].head(3).to_string()


'Empty DataFrame\nColumns: [Job Description, clean_jd]\nIndex: []'

In [33]:
job_df = pd.read_csv("../data/DataScientist.csv")


In [34]:
import pandas as pd

resume_df = pd.read_csv("../data/Resume.csv")
job_df = pd.read_csv("../data/DataScientist.csv")


In [35]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [lemmatizer.lemmatize(w) for w in text.split() if len(w) > 2]
    return " ".join(words)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\varsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [36]:
resume_df['clean_resume'] = resume_df['Resume_str'].apply(clean_text)
job_df['clean_jd'] = job_df['Job Description'].apply(clean_text)


In [37]:
resume_df[['Resume_str', 'clean_resume']].head(2).to_string()


"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [38]:
job_df[['Job Description', 'clean_jd']].head(2).to_string()


'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [39]:
print(resume_df.columns)
print(job_df.columns)


Index(['ID', 'Resume_str', 'Resume_html', 'Category', 'clean_resume'], dtype='object')
Index(['Unnamed: 0', 'index', 'Job Title', 'Salary Estimate',
       'Job Description', 'Rating', 'Company Name', 'Location', 'Headquarters',
       'Size', 'Founded', 'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'Competitors', 'Easy Apply', 'clean_jd'],
      dtype='object')


In [40]:
resume_df['clean_resume'].str.len().describe()


count     2484.000000
mean      5272.511675
std       2387.353099
min          0.000000
25%       4289.750000
50%       4910.500000
75%       6098.750000
max      31222.000000
Name: clean_resume, dtype: float64

In [41]:
print("Resume rows:", resume_df.shape)
print("Job rows:", job_df.shape)


Resume rows: (2484, 5)
Job rows: (3909, 18)


In [42]:
job_df['clean_jd'].str.len().describe()


count     3909.000000
mean      3168.457150
std       1782.303135
min         41.000000
25%       1880.000000
50%       2935.000000
75%       4150.000000
max      17496.000000
Name: clean_jd, dtype: float64

In [43]:
combined_text = (
    resume_df['clean_resume'].astype(str).tolist() +
    job_df['clean_jd'].astype(str).tolist()
)

print("Total documents:", len(combined_text))


Total documents: 6393


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000
)

tfidf_matrix = vectorizer.fit_transform(combined_text)
tfidf_matrix.shape


(6393, 5000)

In [45]:
resume_vectors = tfidf_matrix[:len(resume_df)]
job_vectors = tfidf_matrix[len(resume_df):]


In [46]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(resume_vectors, job_vectors)
similarity_matrix.shape


(2484, 3909)

In [47]:
import pandas as pd

resume_index = 0  # first resume
top_jobs = similarity_matrix[resume_index].argsort()[-5:][::-1]

job_df.iloc[top_jobs][['Job Title', 'Company Name', 'Location']]


Unnamed: 0,Job Title,Company Name,Location
3220,Data Scientist (Marketing Analytics),Whole Foods Market\n3.6,"Austin, TX"
2037,Marketing & Data Analyst,Principle Auto\n4.1,"San Antonio, TX"
1077,Quantitative Analyst,"Numeric, LLC\n3.2","Houston, TX"
1330,"Data Analyst, Marketing",National Education Partners\n4.6,"Scottsdale, AZ"
3347,Marketing Data Analyst,Show Me Leads,"Austin, TX"


In [48]:
scores = similarity_matrix[resume_index][top_jobs] * 100

result = job_df.iloc[top_jobs][['Job Title', 'Company Name']]
result['Match %'] = scores.round(2)

result


Unnamed: 0,Job Title,Company Name,Match %
3220,Data Scientist (Marketing Analytics),Whole Foods Market\n3.6,25.04
2037,Marketing & Data Analyst,Principle Auto\n4.1,23.8
1077,Quantitative Analyst,"Numeric, LLC\n3.2",23.46
1330,"Data Analyst, Marketing",National Education Partners\n4.6,21.88
3347,Marketing Data Analyst,Show Me Leads,20.6


In [49]:
import pickle

with open("vectors.pkl", "wb") as f:
    pickle.dump((resume_vectors, job_vectors), f)
