In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Candidates

- Position - manual written position title
- Moreinfo - manual candidate's info
- Looking For - manual written what people search (not useful info for us)
- Highlights - manual written candidate's highlights
- Primary Keyword - job profile type
- English Level - candidate's English level
- Experience Years - candidate's experience in years

## Preprocess Logic:
- Drop duplicates
- Drop rows with empty Position after cleaning
- Drop 0.05 quantile of CV length
- Drop CV duplicates

In [2]:
df_candidates = pd.read_csv('../data/djinni/candidates.csv')

In [3]:
df_candidates.head()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years
0,=,Маю досвід роботи рекрутером більше 1 року. \r...,,"Пройдений курс ""Introduction to Machine Learni...",Marketing,upper,0.0
1,_,_____,_,_,Other,upper,11.0
2,_,Studied EPAM 'IT switch course' 2022 which inc...,,,Java,intermediate,0.0
3,_,Наразі досвід є лише в якихось університетськи...,,"Навчаюся в університеті на ""відмінно"", легко д...",Sales,pre,0.0
4,__,__,,,,intermediate,6.0


In [4]:
df_candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295094 entries, 0 to 295093
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Position          295091 non-null  object 
 1   Moreinfo          295093 non-null  object 
 2   Looking For       150551 non-null  object 
 3   Highlights        148498 non-null  object 
 4   Primary Keyword   294888 non-null  object 
 5   English Level     291682 non-null  object 
 6   Experience Years  295094 non-null  float64
dtypes: float64(1), object(6)
memory usage: 15.8+ MB


## Possible candidates

In [5]:
df_candidates['Position'].value_counts().head(20)

Position
QA Engineer                   8987
Project Manager               8415
Front-end developer           7142
Junior QA Engineer            6925
Junior Front-end Developer    5076
Java Developer                4754
UI/UX Designer                4649
Manual QA Engineer            4364
Front-End Developer           4134
UX/UI Designer                3318
IT Recruiter                  3314
Разработчик PHP               2862
Python Developer              2739
Frontend Developer            2676
Business Analyst              2495
PHP Developer                 2423
Full Stack Web Developer      2253
iOS Developer                 2092
HR manager                    2059
Graphic Designer              2045
Name: count, dtype: int64

In [6]:
df_candidates['Position'].value_counts().tail(20)

Position
IT Recruitment Consultant                             1
IT Recruitment | Freelance                            1
IT Recruitment Lead                                   1
IT Recruitment Lead, HRM, HRD                         1
IT Recruitment Manager                                1
IT-Recruter                                           1
IT Recruitment Manager or IT Recruitmen Consultant    1
IT Recruitment/PowerBi analyst                        1
IT Recruitment researcher                             1
IT recruitment researcher, HR                         1
IT Recruitment Researcher, People Partner             1
IT Recruitment Researcher/Recruiter                   1
IT Recruitment Researcher (Technical Sourcer)         1
IT Recruitment Sourcer                                1
IT RECRUITMENT SPECIALIST                             1
IT Recruitment specialist/HR                          1
IT Recruitment Team Lead                              1
IT Recruteir/Researcher                

In [8]:
# number of unique positions
print('Number of unique positions: ', len(df_candidates['Position'].unique()))

Number of unique positions:  70646


In [10]:
# quartiles of positions count
df_candidates['Position'].value_counts().quantile([0.05,0.25, 0.5, 0.75, 0.95])

0.05    1.0
0.25    1.0
0.50    1.0
0.75    1.0
0.95    4.0
Name: count, dtype: float64

In [24]:
# clean all possible symbols from positions
df_candidates['Position_cleaned'] = df_candidates['Position'].str.replace('[^a-zA-Zа-яА-Я0-9 ]', '', regex=True).str.strip()

# empty positions equal to None
df_candidates['Position_cleaned'] = df_candidates['Position_cleaned'].replace('', None)

# number of unique positions
print('Number of unique positions: ', len(df_candidates['Position_cleaned'].unique()))

# number of empty positions
print('Number of empty positions: ', len(df_candidates[df_candidates['Position_cleaned'].isnull()]))

# show empty positions
df_candidates[df_candidates['Position_cleaned'].isnull()]

Number of unique positions:  67187
Number of empty positions:  9


Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned
0,=,Маю досвід роботи рекрутером більше 1 року. \r...,,"Пройдений курс ""Introduction to Machine Learni...",Marketing,upper,0.0,
1,_,_____,_,_,Other,upper,11.0,
2,_,Studied EPAM 'IT switch course' 2022 which inc...,,,Java,intermediate,0.0,
3,_,Наразі досвід є лише в якихось університетськи...,,"Навчаюся в університеті на ""відмінно"", легко д...",Sales,pre,0.0,
4,__,__,,,,intermediate,6.0,
10712,ᴀɴᴅʀᴏɪᴅ ᴅᴇᴠᴇʟᴏᴘᴇʀ,Driven with a passion for crafting innovativ...,,CERTIFICATES:\r\n- Diploma of Software Enginee...,Android,upper,2.0,
156864,,,,,,,0.0,
162373,,08.2022 - 12.2022: internship in Epam as DevOp...,,,Other,upper,0.0,
162374,,In progress,,,HR,intermediate,1.0,


In [29]:
df_candidates = df_candidates[~df_candidates['Position_cleaned'].isnull()]

## Candidates Looking For 

In [30]:
# Looking for 
df_candidates['Looking For'].value_counts().head(20)

Looking For
Профессиональный рост.                                                    1050
Профессиональный рост. Адекватный менеджмент.                              356
Профессиональный рост. Сложные задачи.                                     325
-                                                                          158
Возможность удаленной работы.                                               97
Профессиональный рост. Адекватный менеджмент. Сложные задачи.               96
Профессиональный рост. Интересный проект.                                   80
Профессиональный рост. Белая зарплата.                                      79
Адекватный менеджмент. Профессиональный рост.                               78
Профессиональный рост. Сложные задачи. Адекватный менеджмент.               76
Professional growth                                                         72
Профессиональный рост. Неформальная обстановка.                             71
Профессиональный рост                   

In [31]:
# Looking for 
df_candidates['Looking For'].value_counts().tail(20)

Looking For
Decrease lack of experience. Work on project with own ideas, especially in {network monitoring, data analysis, VR, embedded} domain. Not interested in any of Windows-pure proposals.                                                                                                                                                                                                                                                                                          1
Привлекает разработка игр                                                                                                                                                                                                                                                                                                                                                                                                                                                      1
Obtaining experience, further careers and personal increas

## Primary Keyword

In [32]:
df_candidates['Primary Keyword'].value_counts().head(20)

Primary Keyword
JavaScript          49520
QA                  32877
Design              21540
Java                16316
Project Manager     15655
PHP                 14651
.NET                13656
Marketing           13409
Python              10717
Other                8121
HR                   6853
QA Automation        6852
Recruiter            5928
Node.js              5838
Support              5710
Business Analyst     5620
DevOps               5311
C++                  5105
Sales                4933
Android              4786
Name: count, dtype: int64

In [33]:
df_candidates['Primary Keyword'].value_counts().tail(20)

Primary Keyword
iOS                  4569
Artist               3747
Sysadmin             3273
Unity                3098
Lead                 2986
Data Science         2977
Data Analyst         2579
Ruby                 2328
SQL                  1816
Golang               1646
SEO                  1350
Flutter              1294
Lead Generation      1010
Security              921
Data Engineer         876
Technical Writing     805
Scala                 448
Salesforce            425
Scrum Master          382
Rust                  184
Name: count, dtype: int64

In [34]:
# number of nulls in primary keyword
print('Number of nulls in primary keyword: ', df_candidates['Primary Keyword'].isnull().sum())

Number of nulls in primary keyword:  204


In [35]:
df_candidates[df_candidates['Primary Keyword'].isna()].head(10)

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned
297,2D animator/ Technical designer,2.5 years of 2d animation and integration of a...,My intention is to grow both artistically and ...,"Fast learning, eager to innovate.\r\n\r\nBroad...",,upper,2.5,2D animator Technical designer
367,2d artist,"character design, concept art, visual development",,,,fluent,5.0,2d artist
531,2d artist,Навыки работы с графическим планшетом.\r\nПроф...,Белая зарплата. Профессиональный рост.,Иллюстрации для Slumber Worlds\r\nХудожник ком...,,intermediate,0.5,2d artist
1552,3d artist,"3d modeling, 3ds max, texturing, 3d artist, lo...","Хочу развиваться дальше, улучшать навыки в сво...","Изучаю Autodesk Mudbox, скульптинг, моделирова...",,basic,2.0,3d artist
3294,3D modeler,"Autodesk Maya, Adobe Photoshop","Ищу стабильную работу, рассматриваю переезд.","Создание low-poly моделей согласно чертежам, ф...",,basic,1.0,3D modeler
3332,3d modeller,Для своих работ использую Maya и SketchBookPro...,Хочу найти работу в сфере game dev.,,,intermediate,0.0,3d modeller
3456,3d моделлер,"Autodesk Maya, Adobe Photoshop, uvLayout, Zbru...","Профессиональный рост, дружный коллектив","8 лет работал поваром, но понял что хочу заним...",,no english,0.0,3d моделлер
3464,3d модельер,"3d max, Zbrush, photoshop, cinema 4d, mudbox.",Повышения левела скилов и дружная компания. К...,"Быстро учусь, прошел много курсов по созданию ...",,intermediate,0.5,3d модельер
3497,ABAP разработчик,Опыт свыше 8 лет.\r\nРазработка под модули: SD...,,,,basic,7.0,ABAP разработчик
3503,Access,"База по Excel, Access, Word. Студент колледжа(...",Такого места пока что нету,"Работал ""менеджером"" настольных игор, проводил...",,,0.0,Access


## Moreinfo, Looking For, Highlights

In [39]:
df_candidates.head()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned
5,"13 years of exp || Solidity, C#, JavaScript ||...",Who am I:\r\n- 13 years of commercial experien...,I am interested in:\r\n- part-time engagement;...,Landed a role of Director of Blockchain Develo...,Lead,fluent,11.0,13 years of exp Solidity C JavaScript CTO V...
6,1500,I have accumulated over a decade of experience...,,1. Boosting e-commerce sales for online food s...,Marketing,fluent,11.0,1500
7,1c,"Розробник 1с/BAS, всі українські конфігурації,...",,,Other,intermediate,11.0,1c
8,1C Architect,Опыт работы с «1С:Предприятие 8» — более 12 ле...,,Имею 50% в бизнес-проекте в Лондоне связанного...,Other,upper,11.0,1C Architect
9,1C-Bitrix разработчик,"Less, Bootstrap, Adobe Photoshop, JavaScript, ...",,,PHP,basic,7.0,1CBitrix разработчик


In [47]:
print(df_candidates.Moreinfo[5])

Who am I:
- 13 years of commercial experience as a software engineer (web projects, customers from Europe and US);
- 5 years in roles of team lead, tech lead, architect (including coding);
- constant learner (books, courses, youtube);
- C1 (Advanced) level of English (IELTS General = 7/9);
- tech languages: C#, JavaScript, Solidity;
- ready to learn Rust/Go.

What can I bring in:
- develop your web/blockchain project from gathering requirements stage to deployment and maintenance;
- build a team of highly qualified and responsible professionals;
- build processes or improve existing ones;
- design architectures, code features, perform code reviews and so on.

What can I technically (in short):
- С#: 12 years of exp; .Net Core, .Net 6, MS SQL, EF, Clean Architecture, anything related to web platforms;
- JavaScript: 7 years of exp; Node.js, React.js, Angular;
- Solidity: since March 2021; upgradeable, secure, metamorphic, ERC20, ERC721, ERC1155, EIP1967, diamonds, proxies, clones, beacon

In [42]:
print(df_candidates['Looking For'][5])

I am interested in:
- part-time engagement;
- blockchain projects (DeFi, NFT, Gaming, Metaverse);
- high salary (I apply to jobs paid $120+ per hour);
- roles of a researcher and/or a leader rather than just coder;
- ability to work remotely from Bali (with that said, I'm ok with visiting office during initial months to earn your respect and trust);
- freedom in taking decisions;
- time zone of Europe or Asia;
- 1-2 interviews as a max, with tech specialists and managers.

I am not interested in:
- US time zone (I'd prefer morning shifts starting at 7am rather than coding till midnight);
- strict "fabric" schedule like "9-to-6", "10-to-7", "11-to-8" and so on (since in IT this is an indicator of the unprofessional nature of the team and management);
- jobs with a requirement of overall experience in software development of just a few years (I have 12, and the job offered should be my next challenge);
- calls with HRs for "just to have a 30 minutes blah-blah-blah" (I have a prepared lis

In [45]:
print(df_candidates['Highlights'][5])

Landed a role of Director of Blockchain Development in a metaverse project in Feb, 2022


In [48]:
# create columns with name CV and Structure as Highlights+Moreinfo+Looking For
df_candidates['CV'] = df_candidates['Highlights'].fillna('') + '\n' + df_candidates['Moreinfo'].fillna('') + '\n' + df_candidates['Looking For'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_candidates['CV'] = df_candidates['Highlights'].fillna('') + '\n' + df_candidates['Moreinfo'].fillna('') + '\n' + df_candidates['Looking For'].fillna('')


In [49]:
df_candidates.head()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
5,"13 years of exp || Solidity, C#, JavaScript ||...",Who am I:\r\n- 13 years of commercial experien...,I am interested in:\r\n- part-time engagement;...,Landed a role of Director of Blockchain Develo...,Lead,fluent,11.0,13 years of exp Solidity C JavaScript CTO V...,Landed a role of Director of Blockchain Develo...
6,1500,I have accumulated over a decade of experience...,,1. Boosting e-commerce sales for online food s...,Marketing,fluent,11.0,1500,1. Boosting e-commerce sales for online food s...
7,1c,"Розробник 1с/BAS, всі українські конфігурації,...",,,Other,intermediate,11.0,1c,"\nРозробник 1с/BAS, всі українські конфігураці..."
8,1C Architect,Опыт работы с «1С:Предприятие 8» — более 12 ле...,,Имею 50% в бизнес-проекте в Лондоне связанного...,Other,upper,11.0,1C Architect,Имею 50% в бизнес-проекте в Лондоне связанного...
9,1C-Bitrix разработчик,"Less, Bootstrap, Adobe Photoshop, JavaScript, ...",,,PHP,basic,7.0,1CBitrix разработчик,"\nLess, Bootstrap, Adobe Photoshop, JavaScript..."


In [50]:
# number of nulls in CV
print('Number of nulls in CV: ', df_candidates['CV'].isnull().sum())

Number of nulls in CV:  0


In [51]:
# min, max andmean lean of CV
print('Min length of CV: ', df_candidates['CV'].str.len().min())
print('Max length of CV: ', df_candidates['CV'].str.len().max())
print('Mean length of CV: ', df_candidates['CV'].str.len().mean())

Min length of CV:  3
Max length of CV:  7372
Mean length of CV:  856.1896775505363


In [53]:
# quarties of CV length
df_candidates['CV'].str.len().quantile([0.05, 0.25, 0.5, 0.75, 0.95])

0.05     175.0
0.25     389.0
0.50     663.0
0.75    1123.0
0.95    2232.0
Name: CV, dtype: float64

In [58]:
# show all which length of CV less than 0.05 quantile
df_candidates[df_candidates['CV'].str.len() < df_candidates['CV'].str.len().quantile(0.05)]

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
12,1c Developer,Worked on a mobile application for tracking trips,,,Other,intermediate,3.0,1c Developer,\nWorked on a mobile application for tracking ...
20,1C програміст,Шукаю роботу по напрямку junior С++. Маю досві...,"Цікаві проекти, саморозвиток.",,Data Engineer,intermediate,6.0,1C програмст,\nШукаю роботу по напрямку junior С++. Маю дос...
28,1С,Доработка и сопровождение существующих конфигу...,,,Other,intermediate,1.5,1С,\nДоработка и сопровождение существующих конфи...
31,1С BAS ERP - внедрение всех участков производс...,Более 20 лет опыта работы: внедрение BAS ERP /...,,,Project Manager,basic,11.0,1С BAS ERP внедрение всех участков производст...,\nБолее 20 лет опыта работы: внедрение BAS ERP...
39,1с Консультант (Админстратор 1с),"Есть опыт работы с конфигурациями ЗУП, УТ, БУХ...",,,Support,basic,2.0,1с Консультант Админстратор 1с,"\nЕсть опыт работы с конфигурациями ЗУП, УТ, Б..."
...,...,...,...,...,...,...,...,...,...
295039,Художник - ілюстратор,Є досвід роботи у видавництві над книжковими п...,,,Java,intermediate,1.0,Художник люстратор,\nЄ досвід роботи у видавництві над книжковими...
295049,щщщ,щрщ,,,,,0.0,щщщ,\nщрщ\n
295071,Юрист,Досвід роботи юристом 2 роки.\r\nВичитка догов...,,,Other,pre,2.0,Юрист,\nДосвід роботи юристом 2 роки.\r\nВичитка дог...
295078,Юрист,Опыт руководителя отдела правового сопровожден...,Интересный проект. Неформальная обстановка. Ад...,,Other,intermediate,9.0,Юрист,\nОпыт руководителя отдела правового сопровожд...


In [60]:
# show all which length of CV higher than than 0.95 quantile
df_candidates[df_candidates['CV'].str.len() > df_candidates['CV'].str.len().quantile(0.95)]

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
5,"13 years of exp || Solidity, C#, JavaScript ||...",Who am I:\r\n- 13 years of commercial experien...,I am interested in:\r\n- part-time engagement;...,Landed a role of Director of Blockchain Develo...,Lead,fluent,11.0,13 years of exp Solidity C JavaScript CTO V...,Landed a role of Director of Blockchain Develo...
10,1c developer,Еко - маркет Торгівля роздрібна 1 рік 9 місяці...,,Hillel IT school Java Basic and Java Pro 2022\...,SQL,pre,11.0,1c developer,Hillel IT school Java Basic and Java Pro 2022\...
16,1C team lead / 1C Senior / 1C developer,Разработка и внедрение:\r\n\r\n12 лет программ...,"НЕ интересует работа аналитиком, консультантом...",- 5 лет работы руководителем направления разра...,Lead,pre,11.0,1C team lead 1C Senior 1C developer,- 5 лет работы руководителем направления разра...
19,1C програміст,Шукаю роботу з метою розвитку та вдосконалення...,В першу чергу мене цікавить можливість розвитк...,"Пройшов курси: «1С за 21 день, 1С 8.3» та «1С ...",SQL,intermediate,0.5,1C програмст,"Пройшов курси: «1С за 21 день, 1С 8.3» та «1С ..."
45,1С Програміст,"1С Програміст - 7 років у франчайзі, 1 рік - у...",,,Other,intermediate,8.0,1С Програмст,"\n1С Програміст - 7 років у франчайзі, 1 рік -..."
...,...,...,...,...,...,...,...,...,...
294880,Фахівець з інформаційної безпеки,Information Security Specialist (2021 to the p...,,,Security,upper,2.0,Фахвець з нформацйно безпеки,\nInformation Security Specialist (2021 to the...
294915,финансовый менеджер/ финансовый директор,Financial support and controlling function for...,I would like to receive interesting and challe...,Financial support and controlling function for...,Other,fluent,11.0,финансовый менеджер финансовый директор,Financial support and controlling function for...
294953,"Фінансовій аналітик, Аналітик, Ресечер, Маркет...",May 2019 – Present time\r\nMarketing Consultan...,Definitely I do not want to do useless work :)...,I won't tell about increasing sales rate for e...,Other,upper,11.0,Фнансовй аналтик Аналтик Ресечер Маркетолог ан...,I won't tell about increasing sales rate for e...
295057,Юрисконсульт/адвокат,Маю 15 років безперервної роботи в галузі прав...,"Прагну займатися улюбленою справою, приймати у...",1. Розробка і втілення в життя правової схеми ...,Other,intermediate,11.0,Юрисконсультадвокат,1. Розробка і втілення в життя правової схеми ...


In [64]:
df_candidates = df_candidates[df_candidates['CV'].str.len() >= df_candidates['CV'].str.len().quantile(0.05)]

In [65]:
len(df_candidates)

280372

In [66]:
# length unique CV
len(df_candidates['CV'].unique())

280067

In [67]:
# show column duplicates
df_candidates[df_candidates['CV'].duplicated(keep=False)].sort_values('CV').head(20)

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
279743,Разработчик C#,"""Lorem ipsum dolor sit amet, consectetur adipi...",,,.NET,upper,2.0,Разработчик C,"\n""Lorem ipsum dolor sit amet, consectetur adi..."
178873,Project Manager,"""Lorem ipsum dolor sit amet, consectetur adipi...",,,Project Manager,intermediate,6.0,Project Manager,"\n""Lorem ipsum dolor sit amet, consectetur adi..."
232855,Senior Software Engineer,* Applying Python to extend functionality of c...,,,Python,fluent,11.0,Senior Software Engineer,\n* Applying Python to extend functionality of...
232854,Senior Software Engineer,* Applying Python to extend functionality of c...,,,Python,fluent,6.0,Senior Software Engineer,\n* Applying Python to extend functionality of...
76627,HR Assistant / IT Recruiter,"- Full cycle of recruitment (searching, prescr...",,,HR,intermediate,4.0,HR Assistant IT Recruiter,"\n- Full cycle of recruitment (searching, pres..."
78206,HR manager,"- Full cycle of recruitment (searching, prescr...",,,HR,intermediate,4.0,HR manager,"\n- Full cycle of recruitment (searching, pres..."
9793,Automation QA Engineer,- Implementing and improvements UI and API aut...,,,QA Automation,upper,5.0,Automation QA Engineer,\n- Implementing and improvements UI and API a...
10372,Automation QA Engineer (Java),- Implementing and improvements UI and API aut...,,,QA Automation,upper,4.0,Automation QA Engineer Java,\n- Implementing and improvements UI and API a...
249405,Team Lead/Tech lead/Engineering manager,- Over 20 years in IT including 11 years in co...,Big interesting project with modern and wide t...,,PHP,intermediate,11.0,Team LeadTech leadEngineering manager,\n- Over 20 years in IT including 11 years in ...
248637,Team Lead/Engineering manager,- Over 20 years in IT including 11 years in co...,Big interesting project with modern and wide t...,,PHP,intermediate,11.0,Team LeadEngineering manager,\n- Over 20 years in IT including 11 years in ...


# Jobs

- 

## Preprocess Logic:
- 