In [32]:
!pip install POT



In [33]:
import pandas as pd
import numpy as np
import gdown
import gensim.downloader as api
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.spatial.distance import jaccard, dice, hamming
from scipy.stats import pearsonr, entropy
from sklearn.cluster import KMeans
from gensim.corpora import Dictionary
from gensim.similarities import WmdSimilarity
from gensim.models import KeyedVectors
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

url = "https://drive.google.com/uc?id=1y9qBtV71aY6OtEnhfQAxMTWXBBmdv63i"
output = "dataset.csv"
gdown.download(url, output, quiet=False, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1y9qBtV71aY6OtEnhfQAxMTWXBBmdv63i
To: /content/dataset.csv
100%|██████████| 93.0M/93.0M [00:00<00:00, 175MB/s]


'dataset.csv'

In [34]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
glove_vectors = api.load("glove-wiki-gigaword-300")

In [36]:
input_text = "At the age of 28, I, John Smith, am an enthusiastic and highly skilled data analyst seeking to advance my career by applying for a position in the data field. With over five years of experience in data analysis and a proven track record of successfully implementing data-driven strategies, I have honed my expertise in various analytical tools and methodologies. I hold a Bachelor's degree in Computer Science from the University of California, Berkeley, and a Master's degree in Data Science from Stanford University. Throughout my career, I have demonstrated exceptional analytical capabilities, strong problem-solving skills, and a deep understanding of statistical models and machine learning algorithms. My passion for uncovering insights from complex datasets and my commitment to continuous learning make me a valuable candidate for any data-driven organization."
input_text = input_text.lower()
input_text = re.sub(r'[^\w\s]', '', input_text)
tokens = word_tokenize(input_text)
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]
input_text = ' '.join(tokens)

In [37]:
def get_average_glove(tokens, model, num_features):
    valid_words = [word for word in tokens if word in model]
    if not valid_words:
        return np.zeros(num_features)
    return np.mean([model[word] for word in valid_words], axis=0)

In [38]:
dataset = pd.read_csv(output)

In [39]:
corpus_tokens = [word_tokenize(re.sub(r'[^\w\s]', '', doc.lower())) for doc in dataset["deskripsi_keterampilan_final"]]
corpus_tokens = [[lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] for tokens in corpus_tokens]
num_features = 300
corpus_embeddings = np.array([get_average_glove(tokens, glove_vectors, num_features) for tokens in corpus_tokens])

In [40]:
input_tokens = word_tokenize(input_text)
input_embedding = get_average_glove(input_tokens, glove_vectors, num_features)

In [41]:
cosine_similarities = cosine_similarity([input_embedding], corpus_embeddings).flatten()
dataset["cosine_similarity"] = cosine_similarities

In [42]:
euclidean_distance = euclidean_distances([input_embedding], corpus_embeddings).flatten()
dataset["euclidean_distance"] = euclidean_distance

In [43]:
manhattan_distance = manhattan_distances([input_embedding], corpus_embeddings).flatten()
dataset["manhattan_distance"] = manhattan_distance

In [44]:
def jaccard_similarity(s1, s2):
    s1 = set(s1.split())
    s2 = set(s2.split())
    return len(s1 & s2) / len(s1 | s2)
jaccard_similarities = np.array([jaccard_similarity(' '.join(input_tokens), ' '.join(tokens)) for tokens in corpus_tokens])
dataset["jaccard_similarity"] = jaccard_similarities

In [45]:
def dice_coefficient(s1, s2):
    s1 = set(s1.split())
    s2 = set(s2.split())
    return 2 * len(s1 & s2) / (len(s1) + len(s2))
dice_coefficients = np.array([dice_coefficient(' '.join(input_tokens), ' '.join(tokens)) for tokens in corpus_tokens])
dataset["dice_coefficient"] = dice_coefficients

In [46]:
def pearson_corr(x, y):
    if np.std(x) == 0 or np.std(y) == 0:
        return 0
    return pearsonr(x, y)[0]
pearson_correlations = np.array([pearson_corr(input_embedding, emb) for emb in corpus_embeddings])
dataset["pearson_correlation"] = pearson_correlations

In [47]:
def wmd_distance(s1, s2, model):
    s1_words = s1.split()
    s2_words = s2.split()
    s1_words = [word for word in s1_words if word in model]
    s2_words = [word for word in s2_words if word in model]
    if not s1_words or not s2_words:
        return float('inf')
    return model.wmdistance(s1_words, s2_words)
wmd_distances = np.array([wmd_distance(' '.join(input_tokens), ' '.join(tokens), glove_vectors) for tokens in corpus_tokens])
dataset["wmd_distance"] = wmd_distances

In [48]:
def bhattacharyya_distance(p, q):
    p = np.sqrt(np.abs(p))
    q = np.sqrt(np.abs(q))
    return -np.log(np.sum(p * q) + 1e-10)
def bhattacharyya_distance_from_vector(x, y):
    p = x / (np.sum(x) + 1e-10)
    q = y / (np.sum(y) + 1e-10)
    return bhattacharyya_distance(p, q)
bhattacharyya_distances = np.array([bhattacharyya_distance_from_vector(input_embedding, emb) for emb in corpus_embeddings])
dataset["bhattacharyya_distance"] = bhattacharyya_distances

In [49]:
def kl_divergence(p, q):
    epsilon = 1e-10
    p = p + epsilon
    q = q + epsilon
    p = p / np.sum(p)
    q = q / np.sum(q)
    return entropy(p, q)
kl_divergences = np.array([kl_divergence(input_embedding, emb) for emb in corpus_embeddings])
dataset["kl_divergence"] = kl_divergences

# Hasil

In [50]:
pd.set_option("display.max_colwidth",None)

In [51]:
top_similar = dataset.sort_values(by='cosine_similarity', ascending=False).head(20)
top_similar[["skill","job title","cosine_similarity"]]

Unnamed: 0,skill,job title,cosine_similarity
13111,Information Technology,data science artificial intelligence machine learning internship,0.935107
9203,Other,senior bioinformatics research scientist bioinformatic research scientist,0.931558
22397,Research,group summer research program intern advanced concept technology,0.930636
19613,Analyst,data scientist,0.928304
29493,Education,researcher doctor of philosophy student,0.927795
4277,Science,senior data scientist,0.927549
4486,Science,senior data scientist,0.927549
19252,Information Technology,data scientist,0.927438
1784,Project Management,senior manager,0.927376
19401,Engineering,research development civil engineering,0.927259


In [52]:
top_similar['cosine_similarity']

13111    0.935107
9203     0.931558
22397    0.930636
19613    0.928304
29493    0.927795
4277     0.927549
4486     0.927549
19252    0.927438
1784     0.927376
19401    0.927259
1142     0.924037
5171     0.923820
6180     0.923580
23306    0.923451
7319     0.923152
28867    0.922734
23588    0.922225
30445    0.921715
4711     0.921482
30759    0.921435
Name: cosine_similarity, dtype: float64

In [53]:
top_similar = dataset.sort_values(by='euclidean_distance', ascending=True).head(10)
top_similar

Unnamed: 0,id_pekerjaan,id_perusahaan,skill,job title,jenis_pekerjaan,registration_type,category sponsor,city,state,county,deskripsi_keterampilan_final,cosine_similarity,euclidean_distance,manhattan_distance,jaccard_similarity,dice_coefficient,pearson_correlation,wmd_distance,bhattacharyya_distance,kl_divergence
13111,3701314004,71049830,Information Technology,data science artificial intelligence machine learning internship,Internship,ComplexOnsiteApply,sponsored,san francisco bay area,california,united states,company agile data pro digital lab provider data science solution harness power machine learning generative ai integrated platform help capture data multiple source analyze use create model predict business issue outcomes team phd hold multiple patent leverage latest technology deliver reliable efficient solution value customer job role data science ai ml internship unpaid location onsite san jose ca pm min day week unpaid internship extraordinary opportunity learn experienced professional provide mentorship guidance throughout internship internship gain valuable experience artificial intelligence ai machine leaning ml natural language processing nlp teamwork essential skill successful career field suggestive project base industry real life data develop gcp azure cloud also give chance practice work data scientist real life work datasets seek highly motivate analytical individual join team intern intern opportunity work project gain hand experience data analysis machine learning experimentation statistical modeling collaborate team data scientist help drive business growth requirement familiarity programming language python prefer java c basic understanding sql ability identify interpret domain heavily nuanced data able work multiple project meet quality standard deadline ability work independently part team education qualification look highly talented motivate undergraduate student pursue degree computer science computer engineering related technical field interested pursue opportunity please send updated resume sam agile data pro com,0.935107,1.00712,13.712642,0.099502,0.180995,0.935447,0.902333,-2.52175,inf
22397,3757722520,1504,Research,group summer research program intern advanced concept technology,Full-time,OffsiteApply,not sponsored,lexington,massachusetts,united states,advanced concept technology group look exceptional engineer scientist mathematician excite work challenge problem join team group develop technology protect deployed sailor soldier asset advanced missile threat enable operation rapidly evolve complex electronic warfare ew environment group research development activity include concept design advance algorithm development software hardware prototyping experimental field testing data analysis team focus create new capability rapid prototyping software hardware particular interest group development advanced electronic warfare system prototype advance sensor prototype novel signal processing technique resource allocation mathematical optimization approach artificial intelligence enable development next generation system fleet surface asset defense due wide ranging nature research dynamic group comprise diverse team scientist engineer expertise physic mathematics computer science various engineering background group value inclusiveness foster mentor level promote critical innovative thinking address need nation candidate part highly collaborative team thrive solve complex challenge foster innovation support career growth dedicate make positive impact nation job description summer research program intern need investigate design algorithm area classical machine learning bayesian inference work involve explore implementing compare performance use data simulation well measure data candidate comfortable read academic paper turn material high quality software tool current enrollment program working towards b phd degree mathematics computer science electrical engineering demonstrate effective organizational skill ability work independently team environment demonstrate strong write oral communication skills strong foundation probability linear algebra work knowledge statistical inference digital signal processing information theory prefer python julia programming experience strongly desire select candidate subject pre employment background investigation must able obtain maintain secret level dod security clearance mit lincoln laboratory equal employment opportunity eeo employer qualify applicant receive consideration employment discriminate basis race color religion sex sexual orientation gender identity national origin age veteran status disability status genetic information u citizenship require requisition id,0.930636,1.041231,14.18144,0.098113,0.178694,0.930595,0.947397,-2.730896,inf
9203,3699053978,15223686,Other,senior bioinformatics research scientist bioinformatic research scientist,Full-time,OffsiteApply,not sponsored,memphis,tennessee,united states,epigenetics group center applied bioinformatics st jude child research hospital look highly motivate creative senior bioinformatics research scientist research scientist develop apply innovative analytical approach understand underlying mechanism drive pediatric cancer human disease work different project together different pi employee could make valuable difference science also pediatric cancer patient curation survival besides various analysis task also encourage explore newly available method pipeline development could result first author high profile publication long align ultimate goal finding cure save child center provide highly interactive environment collaborative opportunity within center include group genomics group genetics group across basic clinical department access high performance compute cluster cloud compute environment novel sequence platform long read sequence technology single cell sequence dna rna level spatial innovative visualization tool highly automated analytical pipeline power gpu technology mentorship scientist deep experience data analysis data management delivery high quality result highly competitive project member center could work broad area bioinformatics gain valuable experience member center also opportunity publish high profile paper prominent authorship file patent novel bio marker discovery novel method clinical diagnosis disease treatment good candidate experience building optimize analysis pipeline use available software instal pipeline high throughput next generation sequence data analysis chip seq cut run atac seq rna seq hi c hi chip linux unix environment successful candidate expect good programming skill bash python r also understand basic epigenetics gene regulation mechanism e g histone modification chromatin accessibility dna methyl ation genome experience deep learning next flow plus st jude frequently make great place work list fortune magazine best workplace health care list glass door best place work list salary highly competitive comparable industry employee benefit outstanding position locate memphis tn relocation assistance available responsibility specific responsibility may involve ng data quality control integrative analysis data visualization develop evaluate analytic tools excellent communication skill essential you work closely st jude pi provide standardized customized analysis use st jude high performance compute cluster cloud computing environment minimum bachelor degree bioinformatics chem informatics statistic computer science background biological science chemistry relate field master degree phd preferred senior bioinformatics research scientist minimum requirement bachelor degree year relevant post degree work bioinformatics chem informatics statistic computer science background biological science chemistry experience exception master degree year relevant post degree experience phd year relevant post degree experience significant experience least one programming scripting language least one statistical package r preferred bioinformatics research scientist minimum requirement bachelor degree year relevant post degree work bioinformatics chem informatics statistic computer science background biological science chemistry experience exception master degree year relevant post degree experience phd experience significant experience least one programming scripting language least one statistical package python prefer information st jude equal opportunity employer search firm st jude child research hospital accept unsolicited assistance search firm employment opportunity please call email resume submit search firm employee representative st jude via email internet form method without valid write search agreement place approve hr result fee pay event candidate hire st jude,0.931558,1.044008,14.292675,0.072368,0.134969,0.932351,0.933437,-2.426454,inf
19613,3757709060,23099,Analyst,data scientist,Contract,ComplexOnsiteApply,not sponsored,houston,texas,united states,become essential member nrg data scientist offer opportunity view big picture retail operation marketing well develop specialized knowledge specific area nrg aim promote customized offering right product offer right channel right message right time current prospective customer accomplish leverage data via predictive modeling statistical analysis optimization love data programming creative problem solving communicate result fit right responsibility apply statistical modeling machine learn algorithm optimize marketing effort respect customer acquisition retention customer experience margin enhancement take leadership role mentor develop junior analysts collaborate closely marketing team plan analyze b tests utilize advance quantitative technique accurately quantify evaluate impact different marketing strategy lever produce model predict consumer behavior response various marketing levers help executive monitor pulse business automate report key performance indicators create impact ful meaningful visualization effectively communicate complex data insights translate communicate present result recommendation non technical audience requirement bachelor degree statistic computer science economics engineering mathematics operation research require advanced degree phd quantitative field strongly prefer statistic computer science economics engineering mathematics operation research year statistical modeling quantitative analysis industry full time academic research strong causal inference skill b testing propensity score matching etc proficiency bayesian statistic include experience apply bayesian method data analysis inference previous retail electricity market experience preferred previous marketing experience preferred experience analyze weblog data competition experience also plus technical skill requirement posse strong statistical modeling analysis skill demonstrate solid understanding graduate level multivariate statistical technique sample method include limited multivariate regression factor analysis cluster analysis principal component analysis exhibit strong general programming skill proficiency python include experience panda sci kit learn data analysis machine learning task possess strong sql skill understanding relational database excel proficiency ability effectively utilize advanced function feature r program plus survival model plus,0.928304,1.056395,14.012262,0.099138,0.180392,0.92891,0.899465,-2.478128,inf
19401,3749350135,9247671,Engineering,research development civil engineering,Full-time,ComplexOnsiteApply,not sponsored,vicksburg,mississippi,united states,job detail qualification strong knowledge experience gi geoscience geology cfd computational fluid dynamic proficiency program language python rr gi familiarity research mathematics research physical science atmospheric science experience high performance computing hpc parallel program understanding machine learn technique data analysis prediction excellent problem solve skill ability work large complex datasets strong analytical numerical computation skill ability collaborate effectively interdisciplinary team stakeholder excellent write verbal communication skill qualification write technical report journal article disseminate knowledge gain corp engineer project collaborate engineer scientist execution research reimbursable project determine exist future potential coastal engineering navigation research need foster collaboration engineer research development center academia write proposal document obtain research reimbursable fund assemble organize data narrative form presentation publication deliver research product report software analysis database u citizenship require unofficial job post,0.927259,1.06424,14.265179,0.070513,0.131737,0.927988,0.970092,-2.473662,inf
4277,3693581630,2276,Science,senior data scientist,Full-time,OffsiteApply,sponsored,south san francisco,california,united states,position strategic analytics intelligence sai team decipher data help solve world complex healthcare challenge improve life patient mix competitive intelligence market research data science advanced analytics access forecasting sai unlocks key insight internal partner ultimately benefit healthcare provider patient even never work biotech establish expert alongside specialist plus gain new experience across marketing discipline therapeutic area commercial operation entire time surround diverse inclusive team aim reflect world serve cd cmg team within sai exist help cmg commercial medical government affair organization achieve vision unlock value data quickly effectively center excellence cd team three primary remit owning advance data science strategy innovation sai cmg application data science cmg enterprise priority drive data science capability behalf sai department team leverage advance data science machine learning capability work across cmg develop strategic holistic solution identify lead innovative analytics project pilot enable deliver customer look generalist senior data scientist strong track record building machine learning ml natural language processing nlp driven application product ideal candidate also someone significant hand experience develop predictive model cd team commit deliver production level work build ml system enterprise mindset cd involve digital transformation build integrated customer experience icx utilize large range data source structure unstructured well ai ml algorithm find pattern deliver automated experience end user excellent senior data scientist cd team strong bias execution able operate independently great communicator innovator visionary major responsibility manage guide data science data engineering element ai ml projects drive development deployment industrialization enterprise application use advanced nlp technique generate value unstructured data commercial medical organization develop deep understanding customer market dynamic apply technical skill advance knowledge guide organization deliver product help create integrated customer experience drive impact help stakeholder define clear impact ful business priority possible use expertise exist analysis research influence guide apply ml applied statistic approach answer variety complex business question use multiple data source technical tool act subject matter expert applicable data science advanced analytical methodology program project particularly nlp collaborate within cross functional team develop efficient machine learn base application gain alignment deliver impact ful business insight engage necessary stakeholder enable better decision making openly share perspective insight elevate team thinking drive balanced holistic point view effectively weigh communicate trade consideration take enterprise mindset link individual responsibility broader organization focus outcome provide business value demonstrate bias execution self accountability result ownership look opportunity continuous improvement engage manager peer group regularly coaching assistance advocacy act thought partner advisor relevant team stakeholder look establish opportunity peer mentorship minimum qualification bachelor degree relevant discipline data science computer science apply mathematics statistic economics engineering related field focus artificial intelligence ai machine learning ml year relevant industry work experience year experience application nlp text analytics enterprise data demonstrated ability drive tangible results expertise following quantitative field nlp deep learning applied statistic causal inference b testing design experiments demonstrated proficiency leverage open source library package spark tensor flow kera stats models etc solve enterprise problem demonstrate proficiency leverage open source nlp library package gen sim spacy nltk hugging face etc demonstrate work knowledge recent advancement nlp model open source framework like llm prompt engineering lang chain etc solid foundational knowledge deep learning open source framework like tensor flow kera application focus nlp transformer sequence sequence learn task experience work large complex data use had oop spark big data platform proficiency industry standard program language r python experience data science cloud computing tool platform aws gcp etc strong knowledge secondary data source include syndicate sale promotional marketing data longitudinal patient level data experience payer data preferred proficiency use ml variety context revenue attribution optimization text classification cluster etc across cmg use case medical commercial digital experience data visualization tool tableau qlik data studio etc experience act strategic thought partner team demonstrate ability solve problem think outside box proven track record leadership time management project management teamwork strong attention detail experience translate research analysis communicate presentation write concise compelling business story influence decision strategy expect salary range position base primary location california actual pay determine base experience qualification geographic location job related factor permit law discretionary annual bonus may available base individual company performance position also qualify benefit detail link provide benefit position base south san francisco office offer hybrid work schedule work office majority time relocation benefit available posting equal opportunity employer embrace increasingly diverse world around u prohibits unlawful discrimination base race color religion gender sexual orientation gender identity expression national origin ancestry age disability marital status veteran status,0.927549,1.064837,14.070134,0.081448,0.150628,0.927613,0.927432,-2.596236,inf
4486,3693584453,2276,Science,senior data scientist,Full-time,OffsiteApply,not sponsored,south san francisco,california,united states,position strategic analytics intelligence sai team decipher data help solve world complex healthcare challenge improve life patient mix competitive intelligence market research data science advanced analytics access forecasting sai unlocks key insight internal partner ultimately benefit healthcare provider patient even never work biotech establish expert alongside specialist plus gain new experience across marketing discipline therapeutic area commercial operation entire time surround diverse inclusive team aim reflect world serve cd cmg team within sai exist help cmg commercial medical government affair organization achieve vision unlock value data quickly effectively center excellence cd team three primary remit owning advance data science strategy innovation sai cmg application data science cmg enterprise priority drive data science capability behalf sai department team leverage advance data science machine learning capability work across cmg develop strategic holistic solution identify lead innovative analytics project pilot enable deliver customer look generalist senior data scientist strong track record building machine learning ml natural language processing nlp driven application product ideal candidate also someone significant hand experience develop predictive model cd team commit deliver production level work build ml system enterprise mindset cd involve digital transformation build integrated customer experience icx utilize large range data source structure unstructured well ai ml algorithm find pattern deliver automated experience end user excellent senior data scientist cd team strong bias execution able operate independently great communicator innovator visionary major responsibility manage guide data science data engineering element ai ml projects drive development deployment industrialization enterprise application use advanced nlp technique generate value unstructured data commercial medical organization develop deep understanding customer market dynamic apply technical skill advance knowledge guide organization deliver product help create integrated customer experience drive impact help stakeholder define clear impact ful business priority possible use expertise exist analysis research influence guide apply ml applied statistic approach answer variety complex business question use multiple data source technical tool act subject matter expert applicable data science advanced analytical methodology program project particularly nlp collaborate within cross functional team develop efficient machine learn base application gain alignment deliver impact ful business insight engage necessary stakeholder enable better decision making openly share perspective insight elevate team thinking drive balanced holistic point view effectively weigh communicate trade consideration take enterprise mindset link individual responsibility broader organization focus outcome provide business value demonstrate bias execution self accountability result ownership look opportunity continuous improvement engage manager peer group regularly coaching assistance advocacy act thought partner advisor relevant team stakeholder look establish opportunity peer mentorship minimum qualification bachelor degree relevant discipline data science computer science apply mathematics statistic economics engineering related field focus artificial intelligence ai machine learning ml year relevant industry work experience year experience application nlp text analytics enterprise data demonstrated ability drive tangible results expertise following quantitative field nlp deep learning applied statistic causal inference b testing design experiments demonstrated proficiency leverage open source library package spark tensor flow kera stats models etc solve enterprise problem demonstrate proficiency leverage open source nlp library package gen sim spacy nltk hugging face etc demonstrate work knowledge recent advancement nlp model open source framework like llm prompt engineering lang chain etc solid foundational knowledge deep learning open source framework like tensor flow kera application focus nlp transformer sequence sequence learn task experience work large complex data use had oop spark big data platform proficiency industry standard program language r python experience data science cloud computing tool platform aws gcp etc strong knowledge secondary data source include syndicate sale promotional marketing data longitudinal patient level data experience payer data preferred proficiency use ml variety context revenue attribution optimization text classification cluster etc across cmg use case medical commercial digital experience data visualization tool tableau qlik data studio etc experience act strategic thought partner team demonstrate ability solve problem think outside box proven track record leadership time management project management teamwork strong attention detail experience translate research analysis communicate presentation write concise compelling business story influence decision strategy expect salary range position base primary location california actual pay determine base experience qualification geographic location job related factor permit law discretionary annual bonus may available base individual company performance position also qualify benefit detail link provide benefit position base south san francisco office offer hybrid work schedule work office majority time relocation benefit available posting equal opportunity employer embrace increasingly diverse world around u prohibits unlawful discrimination base race color religion gender sexual orientation gender identity expression national origin ancestry age disability marital status veteran status,0.927549,1.064837,14.070134,0.081448,0.150628,0.927613,0.927432,-2.596236,inf
19252,3757450438,871133,Information Technology,data scientist,Full-time,ComplexOnsiteApply,not sponsored,new york,new york,united states,henderson harbor group premier executive search consult firm deep experience technology finance accounting tax search division recruit highly skilled professional direct hire basis service client primarily tri state area well nationally aafa npa network data scientist forefront data drive decision make process work closely cross functional team extract valuable insight data build predictive model provide actionable recommendation drive business forward key responsibilities collect pre process data various source ensure data quality accuracy explore analyze visualize data identify trend pattern insight develop machine learning model algorithm predictive prescriptive analytics collaborate domain expert stakeholder define business problem objective design experiment conduct statistical analysis test hypothesis validate model evaluate select appropriate tool framework library data analysis model create data drive report dashboard communicate finding recommendation stay date latest data science technique technology participate development data science strategy road maps contribute data related project mentor junior data scientist provide technical guidance maintain strong focus data privacy ethic compliance qualifications bachelor master degree quantitative field computer science statistic data science proven experience data scientist similar role proficiency data analysis visualization tool python r sql strong knowledge machine learning statistic data mining experience data manipulation modeling library e g panda sci kit learn tensor flow strong problem solving analytical skill excellent communication presentation skill ability work effectively collaborative team environment knowledge big data technology distribute computing plus industry specific experience may require base role e g healthcare finance e commerce etc,0.927438,1.072455,14.617533,0.125,0.222222,0.927703,0.851335,-2.548778,inf
1784,3693049684,18581793,Project Management,senior manager,Full-time,ComplexOnsiteApply,sponsored,san francisco bay area,california,united states,description group leader provide project management technology advisory primarily public agency fortune entity group manage various construction technology project range size small project multi billion dollar capital program enjoy excellence found intersection technology construction group set hire best personnel industry combination eq iq culture promote humility top tier work product level customer service best class group seek highly motivate detail orient senior manager key responsibility lead manage team data analyst data scientist business intelligence professional provide guidance mentorship performance management collaborate client consultant stakeholder understand business objective identify opportunity leverage data intelligence drive strategic decision making operational efficiency develop execute comprehensive data strategy include data collection storage processing analysis enable effective data driven insight solution oversee design development implementation advanced analytics model predictive model machine learning algorithm data visualization technique extract insight drive actionable outcome guide team utilize advanced analytics tool platform python r sql machine learning library perform complex data analysis modeling stay date emerge trend advancement data intelligence artificial intelligence machine learning big data technology apply solve complex business challenge collaborate technical team ensure data integration data governance data quality standard meet throughout project lifecycle work closely business development consult team identify new business opportunity develop proposal deliver client presentation provide strategic guidance client data drive decision making data governance best practice data intelligence develop maintain strong relationship client act trusted advisor thought leader data intelligence requirement bachelor master degree computer science data science statistic related field minimum year experience data analytics business intelligence data intelligence least year leadership managerial role strong expertise advanced analytics machine learning statistical modeling data visualization technique proficiency program language python r hand experience utilizing machine learn library framework experience managing lead team include hiring training performance management solid understanding data governance data management data integration principles strong business acumen ability translate complex data insight actionable business recommendation excellent communication presentation skill ability effectively convey complex concept technical non technical stakeholder prove track record deliver successful data intelligence project drive business outcome strong problem solve skill ability think strategically analytically self motivate proactive strong sense ownership accountability preferred qualification advanced degree ph computer science data science related field experience management consulting firm work consult client familiarity big data technology had oop spark cloud platform aws azure gcp knowledge data visualization tool tableau power bi etc data warehouse concept group equal opportunity organization allow discrimination base upon age ethnicity ancestry gender national origin disability race size religion sexual orientation socioeconomic background status prohibit applicable law,0.927376,1.075436,14.314301,0.108014,0.194969,0.927726,0.881151,-2.514683,inf
1142,3693046569,10577525,Project Management,data scientist remote,Full-time,ComplexOnsiteApply,not sponsored,new york,new york,united states,data scientist grow proud woman management consulting firm provide program project management capital project delivery consulting technology solution customer renewable energy power utility infrastructure transit transportation sectors data scientist advocate evangelize build data fuel product help customer improve capital project delivery dig become expert energy sector datasets provide insight lead analytic practice design lead iterative learning development cycle ultimately produce new creative analytic solution become part core data scientist work cross functional team member identify prioritize actionable high impact insight across variety core business area lead applied analytics initiative leverage across breadth solution technology sector research design implement validate cutting edge algorithm analyze diverse source data achieve target outcome provide expertise mathematical concept broader applied analytics team inspire adoption advanced analytics data science across entire breadth organization candidate requirement bachelor degree operation research apply statistic data mining machine learning physic related quantitative discipline deep understanding statistical predictive modeling concept machine learning approach cluster classification technique recommendation optimization algorithm year experience deliver world class data science outcome data scientist solve complex analytical problem use quantitative approach unique blend analytical mathematical technical skill passionate ask answer question large datasets communicate passion product manager engineer keen desire solve business problem live find pattern insight within structure unstructured data propose analytics strategy solution challenge expand thinking everyone around expert analyze large complex multi dimensional datasets variety tool expert use statistical analysis environment r mat lab spss sa experience bi tool experience relational database had oop base data mining framework experience sql python java c education bachelor degree higher take pride equal opportunity employer discriminate employee applicant employment race color sex age national origin religion sexual orientation citizenship gender expression identity status veteran basis disability federal state local legally protected class wbe sbe dbe commit provide employment opportunity woman veteran underrepresented minority,0.924037,1.085602,14.942498,0.098425,0.179211,0.924083,0.900602,-2.641717,inf


In [54]:
top_similar = dataset.sort_values(by='manhattan_distance', ascending=True).head(10)
top_similar["manhattan_distance"]

13111    13.712642
19613    14.012262
4277     14.070134
4486     14.070134
22397    14.181440
19401    14.265179
9203     14.292675
1784     14.314301
23306    14.598262
19252    14.617533
Name: manhattan_distance, dtype: float64

In [55]:
top_similar = dataset.sort_values(by='jaccard_similarity', ascending=False).head(20)
top_similar[["skill","job title","jaccard_similarity"]]

Unnamed: 0,skill,job title,jaccard_similarity
22536,Engineering,senior software scientist medical imaging,0.152174
9283,Information Technology,tableau architect,0.144578
12904,Engineering,financial planning consultant,0.137931
24110,Engineering,data scientist,0.134021
17502,Engineering,tech lead data analytics,0.132184
3405,Product Management,product manager,0.131455
2243,Information Technology,business intelligence lead,0.128571
24805,Research,business analyst,0.128571
2312,Information Technology,natural language processing engineer java,0.126506
13387,Engineering,senior director technology,0.126087


In [56]:
top_similar = dataset.sort_values(by='dice_coefficient', ascending=False).head(10)
top_similar["dice_coefficient"]

22536    0.264151
9283     0.252632
12904    0.242424
24110    0.236364
17502    0.233503
3405     0.232365
2243     0.227848
24805    0.227848
2312     0.224599
13387    0.223938
Name: dice_coefficient, dtype: float64

In [57]:
top_similar = dataset.sort_values(by='pearson_correlation', ascending=False).head(10)
top_similar["pearson_correlation"]

13111    0.935447
9203     0.932351
22397    0.930595
19613    0.928910
19401    0.927988
29493    0.927784
1784     0.927726
19252    0.927703
4486     0.927613
4277     0.927613
Name: pearson_correlation, dtype: float64

In [58]:
filtered_dataset = dataset[dataset["wmd_distance"] != float('inf')]
top_similar = filtered_dataset.sort_values(by='wmd_distance', ascending=True).head(10)
top_similar["wmd_distance"]

19252    0.851335
18373    0.858288
1885     0.869854
12904    0.873789
17502    0.876112
9283     0.876529
28867    0.877625
1784     0.881151
30759    0.886625
30445    0.889444
Name: wmd_distance, dtype: float64

In [59]:
top_similar = dataset.sort_values(by='bhattacharyya_distance', ascending=True).head(10)
top_similar["bhattacharyya_distance"]

11720   -5.856735
15245   -5.764600
6985    -5.466965
24799   -5.385907
490     -5.354042
20477   -5.311746
10681   -5.286391
24857   -5.234039
11325   -5.226479
31511   -5.180870
Name: bhattacharyya_distance, dtype: float64

In [60]:
filtered_dataset = dataset[dataset["kl_divergence"] != float('inf')]
top_similar = filtered_dataset.sort_values(by='kl_divergence', ascending=True).head(10)
top_similar["kl_divergence"]

Series([], Name: kl_divergence, dtype: float64)