In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

import warnings
# Ignore every UserWarning raised after this point.
# NOTE(review): presumably aimed at the warning BeautifulSoup emits when it is handed
# plain text instead of markup (preprocess_text parses every cell as HTML) — confirm.
# The filter is not limited to bs4: UserWarnings from any other library are hidden too.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Load the resume dataset into a DataFrame.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
data_resume = pd.read_csv(
    filepath_or_buffer='/Users/harshil/Desktop/Academic2023/BIA660/UpdatedResumeDataSet.csv'
)

In [3]:
# Preview the first five resume rows.
data_resume.head(n=5)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [4]:
# Load the scraped job postings; the first CSV column becomes the row index.
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
data_scraped = pd.read_csv(
    '/Users/harshil/Desktop/Academic2023/BIA660/ScrapedJobs.csv',
    index_col=0,
)

In [5]:
# Preview the first five scraped job postings.
data_scraped.head(n=5)

Unnamed: 0,Title,Company/Location,Description
0,Data Scientist,CommitRemote,SkillsDo you have experience in Web services?Y...
1,Data Analyst / Data Scientist,"DATSURAWashington, DC 20549 (NoMa area)Un Stat...",SkillsDo you have experience in Technical writ...
2,Senior Data Engineer (Machine Learning),Precision Systems4.8Remote,SkillsDo you have experience in Spark?YesNo&nb...
3,Data Scientist (Remote Eligible),Mathematica Policy ResearchRemote in Princeton...,SkillsDo you have experience in Tableau?YesNoE...
4,Data Scientist Positions,"Princeton UniversityPrinceton, NJ 08544",SkillsDo you have experience in Research?YesNo...


In [6]:
# Row counts of both datasets, one per line: resumes first, then job postings.
print(len(data_resume), len(data_scraped), sep='\n')

962
120


In [7]:
# Characters that count as punctuation for the ``remove_punctuation`` step.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Stopword sets keyed by language. Filled on first use so the NLTK corpus is read
# once per session instead of once per document.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def _is_punctuation_token(token):
    """Return True when every character of ``token`` is ASCII punctuation."""
    return all(char in _PUNCTUATION_CHARS for char in token)


def preprocess_text(text, remove_html=True, remove_punctuation=True, remove_special_chars=True,
                    remove_stopwords=True, stemming=False, lemmatization=False):
    """Normalise a raw text document into a space-separated string of tokens.

    The steps always run in this order: optional HTML stripping, lowercasing,
    NLTK word tokenisation, then the optional token filters/transforms below.

    Parameters
    ----------
    text : str
        Raw document text (may contain HTML markup).
    remove_html : bool, default True
        Parse ``text`` with BeautifulSoup's ``html.parser`` and keep only the text.
    remove_punctuation : bool, default True
        Drop tokens made up solely of ``string.punctuation`` characters
        (``","``, ``"..."``, ``"--"``, ...). Tokens that merely contain
        punctuation, such as ``"c++"``, are kept by this step.
    remove_special_chars : bool, default True
        Keep only tokens for which ``str.isalnum()`` is true. This drops any token
        containing a non-alphanumeric character (``"c++"``, ``"node.js"``).
    remove_stopwords : bool, default True
        Drop tokens found in NLTK's English stopword list.
    stemming : bool, default False
        Apply ``PorterStemmer`` to every remaining token.
    lemmatization : bool, default False
        Apply ``WordNetLemmatizer.lemmatize`` (called without a POS tag) to every
        remaining token. Runs after stemming when both flags are set.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` if none survive.

    Notes
    -----
    The NLTK resources used by the enabled steps (tokenizer models, stopword
    corpus, WordNet) must already be installed.
    """
    if remove_html:
        # Join text nodes with a space: with the default empty separator, text from
        # adjacent elements ("<p>Skills</p><p>Do you...") is fused into one token.
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Lowercase before filtering so the stopword comparison below is case-insensitive.
    text = text.lower()

    tokens = word_tokenize(text)

    if remove_punctuation:
        # Character-wise test. The former ``word not in string.punctuation`` was a
        # substring test, so tokens such as "..." or "--" were never removed.
        tokens = [word for word in tokens if not _is_punctuation_token(word)]

    if remove_special_chars:
        tokens = [word for word in tokens if word.isalnum()]

    if remove_stopwords:
        stop_words = _english_stopwords()
        tokens = [word for word in tokens if word not in stop_words]

    if stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

def preprocess_dataframe(df, columns, **kwargs):
    """Return a copy of ``df`` with the given text columns run through ``preprocess_text``.

    The input frame is left unmodified, so the raw data stays available and
    re-running the calling cell always starts from the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text columns.
    columns : str or iterable of str
        Column name(s) to clean. A single name may be passed as a plain string.
    **kwargs
        Forwarded unchanged to ``preprocess_text`` (``remove_html``,
        ``remove_punctuation``, ``remove_special_chars``, ``remove_stopwords``,
        ``stemming``, ``lemmatization``).

    Returns
    -------
    pandas.DataFrame
        A new frame; null cells in the selected columns become ``''``, all other
        cells in those columns are replaced by their preprocessed text.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    def _clean(value):
        # Missing cells (None/NaN) cannot be tokenised; map them to an empty document.
        return preprocess_text(value, **kwargs) if pd.notnull(value) else ''

    processed = df.copy()
    for col in columns:
        processed[col] = processed[col].apply(_clean)
    return processed

In [8]:
# Clean the resume text: strip HTML, drop punctuation, special characters and
# stopwords, then lemmatize the remaining tokens (no stemming).
resume_preprocessed_df = preprocess_dataframe(
    data_resume,
    columns=['Resume'],
    remove_html=True,
    remove_punctuation=True,
    remove_special_chars=True,
    remove_stopwords=True,
    stemming=False,
    lemmatization=True,
)

In [9]:
# Preview the first five cleaned resumes.
resume_preprocessed_df.head(n=5)

Unnamed: 0,Category,Resume
0,Data Science,skill programming language python panda numpy ...
1,Data Science,education detail may 2013 may 2017 data scient...
2,Data Science,area interest deep learning control system des...
3,Data Science,skill r python sap hana tableau sap hana sql s...
4,Data Science,education detail mca ymcaust faridabad haryana...


In [10]:
# Show the full cleaned text of the first resume to inspect the preprocessing result.
print(resume_preprocessed_df['Resume'].iloc[0])

skill programming language python panda numpy scipy matplotlib sql java machine learning regression svm bayes knn random forest decision tree boosting technique cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural net database visualization mysql sqlserver cassandra hbase elasticsearch plotly kibana matplotlib ggplot tableau others regular expression html cs angular 6 logstash kafka python flask git docker computer vision open cv understanding deep detail data science assurance associate data science assurance associate ernst young llp skill detail exprience 24 month exprience 24 month exprience 24 monthscompany detail company ernst young llp description fraud investigation dispute service assurance technology assisted review tar technology assisted review assist accelerating review process run analytics generate report core member team helped developing automated review platform tool scratch assistin

In [11]:
# Clean both free-text columns of the job postings with the same options used for
# the resumes: strip HTML, drop punctuation, special characters and stopwords,
# then lemmatize (no stemming).
scraped_preprocessed_df = preprocess_dataframe(
    data_scraped,
    columns=['Company/Location', 'Description'],
    remove_html=True,
    remove_punctuation=True,
    remove_special_chars=True,
    remove_stopwords=True,
    stemming=False,
    lemmatization=True,
)

In [12]:
# Display the cleaned job postings (the notebook shows the frame's rich table repr).
scraped_preprocessed_df

Unnamed: 0,Title,Company/Location,Description
0,Data Scientist,commitremote,skillsdo experience web service yesno job deta...
1,Data Analyst / Data Scientist,datsurawashington dc 20549 noma area un statio...,skillsdo experience technical writing yesno jo...
2,Senior Data Engineer (Machine Learning),precision,skillsdo experience spark yesno job detailsher...
3,Data Scientist (Remote Eligible),mathematica policy researchremote princeton nj...,skillsdo experience tableau yesnoeducationdo b...
4,Data Scientist Positions,princeton universityprinceton nj 08544,skillsdo experience research yesnoeducationdo ...
...,...,...,...
115,Help Desk Technician,oak integrated caremount holly nj,skillsdo experience window yesnoeducationdo as...
116,Service Desk Analyst - Level 1,wynne systemremote,skillsdo experience analysis skill yesno job d...
117,Information Technology Manager,harvard club bostonhybrid remote boston 02215,skillsdo experience software troubleshooting y...
118,System Administrator IV,chenhalls va,skillsdo experience visual basic yesno job det...


In [13]:
# Show the cleaned Company/Location value of the first job posting.
print(scraped_preprocessed_df['Company/Location'].iloc[0])

commitremote


In [14]:
# Show the full cleaned description of the first job posting.
print(scraped_preprocessed_df['Description'].iloc[0])

skillsdo experience web service yesno job detailshere job detail align yearjob schedule8 hour shift locationremotebenefitspulled full job description401 k dental insurancehealth insurancefull job descriptionresponsibilities collecting large set structured unstructured data various source developing algorithm analyze data apply machine learning technique cleaning validating data ensure accuracy completeness uniformity devising utilizing algorithm model mine big data store perform data error analysis improve model clean validate data uniformity accuracy analyzing data identify pattern trend interpreting data discover solution opportunity communicating finding stakeholder using visualization mean qualification advanced degree computer science statistic mathematics related field proficiency data mining mathematics statistical analysis advanced pattern recognition predictive modeling experience experience sql programming language python java familiarity machine learning framework like kera 