In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Candidates

- Position - manual written position title
- Moreinfo - manual candidate's info
- Looking For - manual written what people search (not useful info for us)
- Highlights - manual written candidate's highlights
- Primary Keyword - job profile type
- English Level - candidate's English level
- Experience Years - candidate's experience in years

## Preprocess Logic:
- Drop duplicates
- Drop rows with empty Position after cleaning
- Drop 0.05 quantile of CV length
- Drop CV duplicates
- Filter by languages: only Ukraine and English
- Filter by embedding similarity

In [2]:
df_candidates = pd.read_csv('../data/djinni/candidates.csv')

In [3]:
df_candidates.head()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years
0,=,Маю досвід роботи рекрутером більше 1 року. \r...,,"Пройдений курс ""Introduction to Machine Learni...",Marketing,upper,0.0
1,_,_____,_,_,Other,upper,11.0
2,_,Studied EPAM 'IT switch course' 2022 which inc...,,,Java,intermediate,0.0
3,_,Наразі досвід є лише в якихось університетськи...,,"Навчаюся в університеті на ""відмінно"", легко д...",Sales,pre,0.0
4,__,__,,,,intermediate,6.0


In [4]:
df_candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295094 entries, 0 to 295093
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Position          295091 non-null  object 
 1   Moreinfo          295093 non-null  object 
 2   Looking For       150551 non-null  object 
 3   Highlights        148498 non-null  object 
 4   Primary Keyword   294888 non-null  object 
 5   English Level     291682 non-null  object 
 6   Experience Years  295094 non-null  float64
dtypes: float64(1), object(6)
memory usage: 15.8+ MB


## Possible candidates

In [5]:
df_candidates['Position'].value_counts().head(20)

Position
QA Engineer                   8987
Project Manager               8415
Front-end developer           7142
Junior QA Engineer            6925
Junior Front-end Developer    5076
Java Developer                4754
UI/UX Designer                4649
Manual QA Engineer            4364
Front-End Developer           4134
UX/UI Designer                3318
IT Recruiter                  3314
Разработчик PHP               2862
Python Developer              2739
Frontend Developer            2676
Business Analyst              2495
PHP Developer                 2423
Full Stack Web Developer      2253
iOS Developer                 2092
HR manager                    2059
Graphic Designer              2045
Name: count, dtype: int64

In [6]:
df_candidates['Position'].value_counts().tail(20)

Position
IT Recruitment Consultant                             1
IT Recruitment | Freelance                            1
IT Recruitment Lead                                   1
IT Recruitment Lead, HRM, HRD                         1
IT Recruitment Manager                                1
IT-Recruter                                           1
IT Recruitment Manager or IT Recruitmen Consultant    1
IT Recruitment/PowerBi analyst                        1
IT Recruitment researcher                             1
IT recruitment researcher, HR                         1
IT Recruitment Researcher, People Partner             1
IT Recruitment Researcher/Recruiter                   1
IT Recruitment Researcher (Technical Sourcer)         1
IT Recruitment Sourcer                                1
IT RECRUITMENT SPECIALIST                             1
IT Recruitment specialist/HR                          1
IT Recruitment Team Lead                              1
IT Recruteir/Researcher                

In [7]:
# number of unique positions
print('Number of unique positions: ', len(df_candidates['Position'].unique()))

Number of unique positions:  70646


In [8]:
# quartiles of positions count
df_candidates['Position'].value_counts().quantile([0.05,0.25, 0.5, 0.75, 0.95])

0.05    1.0
0.25    1.0
0.50    1.0
0.75    1.0
0.95    4.0
Name: count, dtype: float64

In [9]:
# clean all possible symbols from positions
df_candidates['Position_cleaned'] = df_candidates['Position'].str.replace('[^a-zA-Zа-яА-Я ]', '', regex=True).str.strip()

# empty positions equal to None
df_candidates['Position_cleaned'] = df_candidates['Position_cleaned'].replace('', None)

# number of unique positions
print('Number of unique positions: ', len(df_candidates['Position_cleaned'].unique()))

# number of empty positions
print('Number of empty positions: ', len(df_candidates[df_candidates['Position_cleaned'].isnull()]))

# show empty positions
df_candidates[df_candidates['Position_cleaned'].isnull()]

Number of unique positions:  66960
Number of empty positions:  15


Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned
0,=,Маю досвід роботи рекрутером більше 1 року. \r...,,"Пройдений курс ""Introduction to Machine Learni...",Marketing,upper,0.0,
1,_,_____,_,_,Other,upper,11.0,
2,_,Studied EPAM 'IT switch course' 2022 which inc...,,,Java,intermediate,0.0,
3,_,Наразі досвід є лише в якихось університетськи...,,"Навчаюся в університеті на ""відмінно"", легко д...",Sales,pre,0.0,
4,__,__,,,,intermediate,6.0,
6,1500,I have accumulated over a decade of experience...,,1. Boosting e-commerce sales for online food s...,Marketing,fluent,11.0,
69,2,0rwestrdytfhukjweeeeeeeeeeeeeeei6s7urtkgchugyf...,,I am studying programmer and study english lan...,Other,pre,0.5,
3473,4000,"HTML, CSS, QA, Agile, Scrum, Kanban, Waterfall...","Looking for good team, interesting, complicate...",PM:\r\n- Complicated web-platforms and e-comme...,Project Manager,upper,6.0,
3474,489498489489,Kyiv Academy of Media Arts - successfully grad...,,,Marketing,fluent,0.5,
3480,7000,Position: Java developer (full stack)\r\nCusto...,,,Java,fluent,11.0,


In [10]:
df_candidates = df_candidates[~df_candidates['Position_cleaned'].isnull()]

## Candidates Looking For 

In [11]:
# Looking for 
df_candidates['Looking For'].value_counts().head(20)

Looking For
Профессиональный рост.                                                    1050
Профессиональный рост. Адекватный менеджмент.                              356
Профессиональный рост. Сложные задачи.                                     325
-                                                                          158
Возможность удаленной работы.                                               97
Профессиональный рост. Адекватный менеджмент. Сложные задачи.               96
Профессиональный рост. Интересный проект.                                   80
Профессиональный рост. Белая зарплата.                                      79
Адекватный менеджмент. Профессиональный рост.                               78
Профессиональный рост. Сложные задачи. Адекватный менеджмент.               76
Professional growth                                                         72
Профессиональный рост. Неформальная обстановка.                             71
Профессиональный рост                   

In [12]:
# Looking for 
df_candidates['Looking For'].value_counts().tail(20)

Looking For
Получение практического опыта разработки проекта\r\nПрофессиональный рост. Сложные задачи.                                                                                                                                                                                                                                                                                                                                                                                     1
Decrease lack of experience. Work on project with own ideas, especially in {network monitoring, data analysis, VR, embedded} domain. Not interested in any of Windows-pure proposals.                                                                                                                                                                                                                                                                                          1
Привлекает разработка игр                                 

## Primary Keyword

In [13]:
df_candidates['Primary Keyword'].value_counts().head(20)

Primary Keyword
JavaScript          49520
QA                  32877
Design              21540
Java                16315
Project Manager     15654
PHP                 14651
.NET                13656
Marketing           13407
Python              10717
Other                8120
HR                   6853
QA Automation        6852
Recruiter            5928
Node.js              5838
Support              5709
Business Analyst     5620
DevOps               5311
C++                  5105
Sales                4933
Android              4786
Name: count, dtype: int64

In [14]:
df_candidates['Primary Keyword'].value_counts().tail(20)

Primary Keyword
iOS                  4569
Artist               3747
Sysadmin             3273
Unity                3098
Lead                 2986
Data Science         2977
Data Analyst         2579
Ruby                 2328
SQL                  1816
Golang               1646
SEO                  1350
Flutter              1294
Lead Generation      1010
Security              921
Data Engineer         876
Technical Writing     805
Scala                 448
Salesforce            425
Scrum Master          382
Rust                  184
Name: count, dtype: int64

In [15]:
# number of nulls in primary keyword
print('Number of nulls in primary keyword: ', df_candidates['Primary Keyword'].isnull().sum())

Number of nulls in primary keyword:  204


In [16]:
df_candidates[df_candidates['Primary Keyword'].isna()].head(10)

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned
297,2D animator/ Technical designer,2.5 years of 2d animation and integration of a...,My intention is to grow both artistically and ...,"Fast learning, eager to innovate.\r\n\r\nBroad...",,upper,2.5,D animator Technical designer
367,2d artist,"character design, concept art, visual development",,,,fluent,5.0,d artist
531,2d artist,Навыки работы с графическим планшетом.\r\nПроф...,Белая зарплата. Профессиональный рост.,Иллюстрации для Slumber Worlds\r\nХудожник ком...,,intermediate,0.5,d artist
1552,3d artist,"3d modeling, 3ds max, texturing, 3d artist, lo...","Хочу развиваться дальше, улучшать навыки в сво...","Изучаю Autodesk Mudbox, скульптинг, моделирова...",,basic,2.0,d artist
3294,3D modeler,"Autodesk Maya, Adobe Photoshop","Ищу стабильную работу, рассматриваю переезд.","Создание low-poly моделей согласно чертежам, ф...",,basic,1.0,D modeler
3332,3d modeller,Для своих работ использую Maya и SketchBookPro...,Хочу найти работу в сфере game dev.,,,intermediate,0.0,d modeller
3456,3d моделлер,"Autodesk Maya, Adobe Photoshop, uvLayout, Zbru...","Профессиональный рост, дружный коллектив","8 лет работал поваром, но понял что хочу заним...",,no english,0.0,d моделлер
3464,3d модельер,"3d max, Zbrush, photoshop, cinema 4d, mudbox.",Повышения левела скилов и дружная компания. К...,"Быстро учусь, прошел много курсов по созданию ...",,intermediate,0.5,d модельер
3497,ABAP разработчик,Опыт свыше 8 лет.\r\nРазработка под модули: SD...,,,,basic,7.0,ABAP разработчик
3503,Access,"База по Excel, Access, Word. Студент колледжа(...",Такого места пока что нету,"Работал ""менеджером"" настольных игор, проводил...",,,0.0,Access


## Moreinfo, Looking For, Highlights

In [17]:
df_candidates.head()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned
5,"13 years of exp || Solidity, C#, JavaScript ||...",Who am I:\r\n- 13 years of commercial experien...,I am interested in:\r\n- part-time engagement;...,Landed a role of Director of Blockchain Develo...,Lead,fluent,11.0,years of exp Solidity C JavaScript CTO VP ...
7,1c,"Розробник 1с/BAS, всі українські конфігурації,...",,,Other,intermediate,11.0,c
8,1C Architect,Опыт работы с «1С:Предприятие 8» — более 12 ле...,,Имею 50% в бизнес-проекте в Лондоне связанного...,Other,upper,11.0,C Architect
9,1C-Bitrix разработчик,"Less, Bootstrap, Adobe Photoshop, JavaScript, ...",,,PHP,basic,7.0,CBitrix разработчик
10,1c developer,Еко - маркет Торгівля роздрібна 1 рік 9 місяці...,,Hillel IT school Java Basic and Java Pro 2022\...,SQL,pre,11.0,c developer


In [18]:
print(df_candidates.Moreinfo[5])

Who am I:
- 13 years of commercial experience as a software engineer (web projects, customers from Europe and US);
- 5 years in roles of team lead, tech lead, architect (including coding);
- constant learner (books, courses, youtube);
- C1 (Advanced) level of English (IELTS General = 7/9);
- tech languages: C#, JavaScript, Solidity;
- ready to learn Rust/Go.

What can I bring in:
- develop your web/blockchain project from gathering requirements stage to deployment and maintenance;
- build a team of highly qualified and responsible professionals;
- build processes or improve existing ones;
- design architectures, code features, perform code reviews and so on.

What can I technically (in short):
- С#: 12 years of exp; .Net Core, .Net 6, MS SQL, EF, Clean Architecture, anything related to web platforms;
- JavaScript: 7 years of exp; Node.js, React.js, Angular;
- Solidity: since March 2021; upgradeable, secure, metamorphic, ERC20, ERC721, ERC1155, EIP1967, diamonds, proxies, clones, beacon

In [19]:
print(df_candidates['Looking For'][5])

I am interested in:
- part-time engagement;
- blockchain projects (DeFi, NFT, Gaming, Metaverse);
- high salary (I apply to jobs paid $120+ per hour);
- roles of a researcher and/or a leader rather than just coder;
- ability to work remotely from Bali (with that said, I'm ok with visiting office during initial months to earn your respect and trust);
- freedom in taking decisions;
- time zone of Europe or Asia;
- 1-2 interviews as a max, with tech specialists and managers.

I am not interested in:
- US time zone (I'd prefer morning shifts starting at 7am rather than coding till midnight);
- strict "fabric" schedule like "9-to-6", "10-to-7", "11-to-8" and so on (since in IT this is an indicator of the unprofessional nature of the team and management);
- jobs with a requirement of overall experience in software development of just a few years (I have 12, and the job offered should be my next challenge);
- calls with HRs for "just to have a 30 minutes blah-blah-blah" (I have a prepared lis

In [20]:
print(df_candidates['Highlights'][5])

Landed a role of Director of Blockchain Development in a metaverse project in Feb, 2022


In [21]:
# create columns with name CV and Structure as Highlights+Moreinfo+Looking For
df_candidates['CV'] = df_candidates['Highlights'].fillna('') + '\n' + df_candidates['Moreinfo'].fillna('') + '\n' + df_candidates['Looking For'].fillna('')

In [22]:
df_candidates.head()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
5,"13 years of exp || Solidity, C#, JavaScript ||...",Who am I:\r\n- 13 years of commercial experien...,I am interested in:\r\n- part-time engagement;...,Landed a role of Director of Blockchain Develo...,Lead,fluent,11.0,years of exp Solidity C JavaScript CTO VP ...,Landed a role of Director of Blockchain Develo...
7,1c,"Розробник 1с/BAS, всі українські конфігурації,...",,,Other,intermediate,11.0,c,"\nРозробник 1с/BAS, всі українські конфігураці..."
8,1C Architect,Опыт работы с «1С:Предприятие 8» — более 12 ле...,,Имею 50% в бизнес-проекте в Лондоне связанного...,Other,upper,11.0,C Architect,Имею 50% в бизнес-проекте в Лондоне связанного...
9,1C-Bitrix разработчик,"Less, Bootstrap, Adobe Photoshop, JavaScript, ...",,,PHP,basic,7.0,CBitrix разработчик,"\nLess, Bootstrap, Adobe Photoshop, JavaScript..."
10,1c developer,Еко - маркет Торгівля роздрібна 1 рік 9 місяці...,,Hillel IT school Java Basic and Java Pro 2022\...,SQL,pre,11.0,c developer,Hillel IT school Java Basic and Java Pro 2022\...


In [23]:
# number of nulls in CV
print('Number of nulls in CV: ', df_candidates['CV'].isnull().sum())

Number of nulls in CV:  0


In [24]:
# min, max andmean lean of CV
print('Min length of CV: ', df_candidates['CV'].str.len().min())
print('Max length of CV: ', df_candidates['CV'].str.len().max())
print('Mean length of CV: ', df_candidates['CV'].str.len().mean())

Min length of CV:  3
Max length of CV:  7372
Mean length of CV:  856.1899491322663


In [25]:
# quarties of CV length
df_candidates['CV'].str.len().quantile([0.05, 0.25, 0.5, 0.75, 0.95])

0.05     175.0
0.25     389.0
0.50     663.0
0.75    1123.0
0.95    2232.0
Name: CV, dtype: float64

In [26]:
# show all which length of CV less than 0.05 quantile
df_candidates[df_candidates['CV'].str.len() < df_candidates['CV'].str.len().quantile(0.05)]

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
12,1c Developer,Worked on a mobile application for tracking trips,,,Other,intermediate,3.0,c Developer,\nWorked on a mobile application for tracking ...
20,1C програміст,Шукаю роботу по напрямку junior С++. Маю досві...,"Цікаві проекти, саморозвиток.",,Data Engineer,intermediate,6.0,C програмст,\nШукаю роботу по напрямку junior С++. Маю дос...
28,1С,Доработка и сопровождение существующих конфигу...,,,Other,intermediate,1.5,С,\nДоработка и сопровождение существующих конфи...
31,1С BAS ERP - внедрение всех участков производс...,Более 20 лет опыта работы: внедрение BAS ERP /...,,,Project Manager,basic,11.0,С BAS ERP внедрение всех участков производств...,\nБолее 20 лет опыта работы: внедрение BAS ERP...
39,1с Консультант (Админстратор 1с),"Есть опыт работы с конфигурациями ЗУП, УТ, БУХ...",,,Support,basic,2.0,с Консультант Админстратор с,"\nЕсть опыт работы с конфигурациями ЗУП, УТ, Б..."
...,...,...,...,...,...,...,...,...,...
295039,Художник - ілюстратор,Є досвід роботи у видавництві над книжковими п...,,,Java,intermediate,1.0,Художник люстратор,\nЄ досвід роботи у видавництві над книжковими...
295049,щщщ,щрщ,,,,,0.0,щщщ,\nщрщ\n
295071,Юрист,Досвід роботи юристом 2 роки.\r\nВичитка догов...,,,Other,pre,2.0,Юрист,\nДосвід роботи юристом 2 роки.\r\nВичитка дог...
295078,Юрист,Опыт руководителя отдела правового сопровожден...,Интересный проект. Неформальная обстановка. Ад...,,Other,intermediate,9.0,Юрист,\nОпыт руководителя отдела правового сопровожд...


In [27]:
# show all which length of CV higher than than 0.95 quantile
df_candidates[df_candidates['CV'].str.len() > df_candidates['CV'].str.len().quantile(0.95)]

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
5,"13 years of exp || Solidity, C#, JavaScript ||...",Who am I:\r\n- 13 years of commercial experien...,I am interested in:\r\n- part-time engagement;...,Landed a role of Director of Blockchain Develo...,Lead,fluent,11.0,years of exp Solidity C JavaScript CTO VP ...,Landed a role of Director of Blockchain Develo...
10,1c developer,Еко - маркет Торгівля роздрібна 1 рік 9 місяці...,,Hillel IT school Java Basic and Java Pro 2022\...,SQL,pre,11.0,c developer,Hillel IT school Java Basic and Java Pro 2022\...
16,1C team lead / 1C Senior / 1C developer,Разработка и внедрение:\r\n\r\n12 лет программ...,"НЕ интересует работа аналитиком, консультантом...",- 5 лет работы руководителем направления разра...,Lead,pre,11.0,C team lead C Senior C developer,- 5 лет работы руководителем направления разра...
19,1C програміст,Шукаю роботу з метою розвитку та вдосконалення...,В першу чергу мене цікавить можливість розвитк...,"Пройшов курси: «1С за 21 день, 1С 8.3» та «1С ...",SQL,intermediate,0.5,C програмст,"Пройшов курси: «1С за 21 день, 1С 8.3» та «1С ..."
45,1С Програміст,"1С Програміст - 7 років у франчайзі, 1 рік - у...",,,Other,intermediate,8.0,С Програмст,"\n1С Програміст - 7 років у франчайзі, 1 рік -..."
...,...,...,...,...,...,...,...,...,...
294880,Фахівець з інформаційної безпеки,Information Security Specialist (2021 to the p...,,,Security,upper,2.0,Фахвець з нформацйно безпеки,\nInformation Security Specialist (2021 to the...
294915,финансовый менеджер/ финансовый директор,Financial support and controlling function for...,I would like to receive interesting and challe...,Financial support and controlling function for...,Other,fluent,11.0,финансовый менеджер финансовый директор,Financial support and controlling function for...
294953,"Фінансовій аналітик, Аналітик, Ресечер, Маркет...",May 2019 – Present time\r\nMarketing Consultan...,Definitely I do not want to do useless work :)...,I won't tell about increasing sales rate for e...,Other,upper,11.0,Фнансовй аналтик Аналтик Ресечер Маркетолог ан...,I won't tell about increasing sales rate for e...
295057,Юрисконсульт/адвокат,Маю 15 років безперервної роботи в галузі прав...,"Прагну займатися улюбленою справою, приймати у...",1. Розробка і втілення в життя правової схеми ...,Other,intermediate,11.0,Юрисконсультадвокат,1. Розробка і втілення в життя правової схеми ...


In [28]:
df_candidates = df_candidates[df_candidates['CV'].str.len() >= df_candidates['CV'].str.len().quantile(0.05)]

In [29]:
len(df_candidates)

280366

In [30]:
# length unique CV
len(df_candidates['CV'].unique())

280061

In [31]:
# show column duplicates
df_candidates[df_candidates['CV'].duplicated(keep=False)].sort_values('CV').head(20)

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV
279743,Разработчик C#,"""Lorem ipsum dolor sit amet, consectetur adipi...",,,.NET,upper,2.0,Разработчик C,"\n""Lorem ipsum dolor sit amet, consectetur adi..."
178873,Project Manager,"""Lorem ipsum dolor sit amet, consectetur adipi...",,,Project Manager,intermediate,6.0,Project Manager,"\n""Lorem ipsum dolor sit amet, consectetur adi..."
232855,Senior Software Engineer,* Applying Python to extend functionality of c...,,,Python,fluent,11.0,Senior Software Engineer,\n* Applying Python to extend functionality of...
232854,Senior Software Engineer,* Applying Python to extend functionality of c...,,,Python,fluent,6.0,Senior Software Engineer,\n* Applying Python to extend functionality of...
76627,HR Assistant / IT Recruiter,"- Full cycle of recruitment (searching, prescr...",,,HR,intermediate,4.0,HR Assistant IT Recruiter,"\n- Full cycle of recruitment (searching, pres..."
78206,HR manager,"- Full cycle of recruitment (searching, prescr...",,,HR,intermediate,4.0,HR manager,"\n- Full cycle of recruitment (searching, pres..."
9793,Automation QA Engineer,- Implementing and improvements UI and API aut...,,,QA Automation,upper,5.0,Automation QA Engineer,\n- Implementing and improvements UI and API a...
10372,Automation QA Engineer (Java),- Implementing and improvements UI and API aut...,,,QA Automation,upper,4.0,Automation QA Engineer Java,\n- Implementing and improvements UI and API a...
249405,Team Lead/Tech lead/Engineering manager,- Over 20 years in IT including 11 years in co...,Big interesting project with modern and wide t...,,PHP,intermediate,11.0,Team LeadTech leadEngineering manager,\n- Over 20 years in IT including 11 years in ...
248637,Team Lead/Engineering manager,- Over 20 years in IT including 11 years in co...,Big interesting project with modern and wide t...,,PHP,intermediate,11.0,Team LeadEngineering manager,\n- Over 20 years in IT including 11 years in ...


### Embedding duplication filtering

In [32]:
from src.helpers import concurrent_processor
from src.lang_detector import lang_detection_func

  from .autonotebook import tqdm as notebook_tqdm
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [33]:
# get 10% samples for testing
df_candidates_test = df_candidates.sample(frac=0.1, random_state=42)
df_candidates_test.shape

(28037, 9)

In [34]:
# language detection
column_name = 'CV'

lang_detect_df = pd.DataFrame(concurrent_processor(df_candidates_test[column_name].unique(), lang_detection_func))
lang_detect_df = lang_detect_df.rename(columns={
    'item': column_name,
    'lang_detect': f'{column_name}_lang'
})
df_candidates_test = df_candidates_test.merge(lang_detect_df, on=column_name, how='left')

2023-09-30 11:29:39,245 - INFO - Processing items...
  0%|          | 0/28031 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max lengt

In [36]:
# checkpoint
df_candidates_test.to_csv('data/test_candidates_with_lang.csv', index=False)

In [38]:
df_candidates_test = pd.read_csv('data/test_candidates_with_lang.csv')

In [40]:
df_candidates_test.head()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,Position_cleaned,CV,CV_lang
0,Senior Java Developer,Software engineer with 7 years of commercial e...,I enjoy to work with backend and application d...,Work on performance issues and concurrency pro...,Java,upper,7.0,Senior Java Developer,Work on performance issues and concurrency pro...,en
1,Front-end developer,Elbrus-bootcamp\r\nПрограммист-стажер\r\nВ Эль...,Профессиональный рост ...,Постигаю js ),JavaScript,basic,1.0,Frontend developer,Постигаю js )\nElbrus-bootcamp\r\nПрограммист-...,ru
2,Разработчик PHP,"PHP, MySQL, JavaScript","Загрузка на полдня (для начала, потом будет ви...","Образование - математик, Новосибирский гос. ун...",PHP,pre,0.5,Разработчик PHP,"Образование - математик, Новосибирский гос. ун...",ru
3,Software Engineer,T-Systems International GmbH Project: Telekom ...,"Creating server-based application, work with S...","Certified SAFe® 4 Practitioner, Oracle Certifi...",Java,upper,2.0,Software Engineer,"Certified SAFe® 4 Practitioner, Oracle Certifi...",en
4,Junior QA Engineer,I have recently completed a QA Engineer course...,"As I'm relatively new to this field, I welcome...",,QA,intermediate,0.5,Junior QA Engineer,\nI have recently completed a QA Engineer cour...,en


In [41]:
# persantage of not uk and en languages
df_candidates_test[f'{column_name}_lang'].value_counts(normalize=True)

CV_lang
en    0.759746
ru    0.143097
uk    0.085137
he    0.005029
tr    0.003495
pl    0.000963
pt    0.000535
ca    0.000392
mt    0.000285
sv    0.000250
da    0.000178
la    0.000143
nl    0.000143
et    0.000107
zh    0.000107
it    0.000107
fr    0.000071
cs    0.000071
se    0.000036
ar    0.000036
de    0.000036
sl    0.000036
Name: proportion, dtype: float64

In [42]:
# filter not ua and en languages
df_candidates_test = df_candidates_test[df_candidates_test['CV_lang'].isin(['uk', 'en'])]

In [43]:
# perseantage of ua and en languages
df_candidates_test[f'{column_name}_lang'].value_counts(normalize=True)

CV_lang
en    0.899232
uk    0.100768
Name: proportion, dtype: float64

#### English embedding check

In [71]:
df_canidates_test_en = df_candidates_test[df_candidates_test['CV_lang'] == 'en']
df_canidates_test_en.shape

(21301, 10)

In [51]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-base-en-v1.5') # use base to increase computation speed due to lack of computational resources

2023-09-30 13:40:24,830 - INFO - Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
2023-09-30 13:40:25,998 - INFO - Use pytorch device: cpu


In [79]:
# syntatic similar CVs
cv = [
    "\nI have 9 years of experience building web applications.\r\n\r\nIn my last position I worked with Node.js, JavaScript/TypeScript and NestJS in cloud environments (AWS) and microservices. During that time, I implemented solutions that helped to increase the traffic of the web platform INX One, while working closely with the DevOps department to deploy new versions.\r\n\r\nI also made use of tools like Elasticsearch for logging the flow of features and catching errors on production, and Puppeteer, to automate the QA process in the deployment pipeline.\n",
    "\nI have 9 years of experience building web applications.\r\n\r\nIn my last position I worked with Node.js, JavaScript/TypeScript and NestJS in cloud environments (AWS) and microservices. During that time, I implemented solutions that helped to increase the traffic of the web platform INX One, while working closely with the DevOps department to deploy new versions.\r\n\r\nI also made use of tools like Elasticsearch for logging the flow of features and catching errors on  production, and Puppeteer, to automate the QA process in the deployment pipeline.\n",
    "In my last position I worked with Node.js, JavaScript and NestJS in cloud environments (AWS, GCP) and microservices. During that time, I implemented solutions that helped to increase the traffic of the web platform INX One, while working closely with the DevOps department to deploy new versions.\r\n\r\nI also made use of tools like Elasticsearch for logging the flow of features and catching errors on production, and Puppeteer, to automate the QA process in the deployment pipeline.\n",
    "In my last position I worked with Node.js, JavaScript and NestJS in cloud environments (AWS, GCP) and microservices. During that time, I implemented solutions that helped to increase the traffic of the web platform INX One, while working closely with the DevOps department to deploy new versions.\r\n\r\nI also made use of tools like Elasticsearch for logging the flow of features and catching errors on production, and Puppeteer, to automate the QC process in the pipeline.\n",
]

In [80]:
embeddings = model.encode(cv, show_progress_bar=True)
embeddings @ embeddings.T

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]


array([[1.0000002 , 1.0000002 , 0.95259833, 0.9312769 ],
       [1.0000002 , 1.0000002 , 0.95259833, 0.9312769 ],
       [0.95259833, 0.95259833, 0.9999998 , 0.97855633],
       [0.9312769 , 0.9312769 , 0.97855633, 1.        ]], dtype=float32)

In [68]:
# calculate embeddings for all CVs
embeddings = model.encode(
    df_canidates_test_en[column_name].tolist(), 
    show_progress_bar=True, 
    device="cuda" if torch.cuda.is_available() else "cpu", 
    normalize_embeddings=True
)

Batches: 100%|██████████| 4/4 [02:06<00:00, 31.54s/it]


In [69]:
df_canidates_test_en.reset_index(drop=True, inplace=True)
# threshold for similarity. If similarity between two CVs higher than threshold, then we consider that they are similar.
# Use 0.9 because it help to mostly filter all similar refrases CVs from one person.
threshold = 0.9 
i = 0
while i < len(df_canidates_test_en):
    scores = embeddings[i] @ embeddings.T
    indexes = np.where(scores >= threshold)[0]

    indexes = indexes[indexes != i]
    if indexes.tolist():
        indexes = np.unique(indexes)
        df_canidates_test_en.drop(indexes, inplace=True)
        df_canidates_test_en.reset_index(drop=True, inplace=True)
        embeddings = np.delete(embeddings, indexes, axis=0)
    i += 1

df_canidates_test_en.shape

(100, 10)

#### Ukraine embedding check

In [98]:
df_canidates_test_uk_main = df_candidates_test[df_candidates_test['CV_lang'] == 'uk'].sample(500)
df_canidates_test_uk_main.shape

(500, 10)

In [82]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/multilingual-e5-large')

2023-09-30 15:12:17,863 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large
Downloading (…)9f719/.gitattributes: 100%|██████████| 1.63k/1.63k [00:00<00:00, 2.75MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 201/201 [00:00<00:00, 647kB/s]
Downloading (…)316e29f719/README.md: 100%|██████████| 160k/160k [00:00<00:00, 842kB/s]
Downloading (…)6e29f719/config.json: 100%|██████████| 690/690 [00:00<00:00, 1.02MB/s]
Downloading model.safetensors: 100%|██████████| 2.24G/2.24G [03:52<00:00, 9.64MB/s]
Downloading (…)719/onnx/config.json: 100%|██████████| 688/688 [00:00<00:00, 566kB/s]
Downloading model.onnx: 100%|██████████| 546k/546k [00:00<00:00, 784kB/s]
Downloading model.onnx_data: 100%|██████████| 2.24G/2.24G [04:45<00:00, 7.83MB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 11.5MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 280/280 [00:00<00:00, 556kB/s]
Downloading tokenizer.json: 100%|██████████| 17.1M/

In [99]:
# create similarity matrix
embeddings_main = model.encode(
    df_canidates_test_uk_main[column_name].tolist(), 
    show_progress_bar=True, 
    device="cuda" if torch.cuda.is_available() else "cpu", 
    normalize_embeddings=True
)

Batches: 100%|██████████| 16/16 [31:03<00:00, 116.44s/it]


In [100]:
embeddings = embeddings_main.copy()
df_canidates_test_uk = df_canidates_test_uk_main.copy()

In [103]:
df_canidates_test_uk.reset_index(drop=True, inplace=True)
# threshold for similarity. If similarity between two CVs higher than threshold, then we consider that they are similar.
# Use 0.95 because it help to mostly filter all similar refrases CVs from one person.
# For ukrainian language it is better to use higher threshold because of multilingual model which not so good as specialize model for specific language.
threshold = 0.95 
i = 0
while i < len(df_canidates_test_uk):
    scores = embeddings[i] @ embeddings.T
    indexes = np.where(scores >= threshold)[0]

    indexes = indexes[indexes != i]
    if indexes.tolist():
        print("Query", i, ":", df_canidates_test_uk[column_name][i])
        print("Similar CVs: ")
        for index in indexes:
            print(df_canidates_test_uk[column_name][index])
        print("========================================")
        indexes = np.unique(indexes)
        df_canidates_test_uk.drop(indexes, inplace=True)
        df_canidates_test_uk.reset_index(drop=True, inplace=True)
        embeddings = np.delete(embeddings, indexes, axis=0)
    i += 1

df_canidates_test_uk.shape

(328, 10)

# Jobs

- Position - manual written position title
- Long Description - manual written job description
- Company Name - company name which hires
- Exp Years - experience years
- Primary Keyword - job profile type
- English Level - English level needed for this job
- Published - date when job was published

## Preprocess Logic:
- Drop duplicates
- Drop empty long description
- Drop 0.05 quantile of long description length (really short descriptions)
- Drop Long Description duplicates
- Drop rows with empty Company Name (to have more accurate list of jobs)
- Filter by languages: only Ukraine and English
- Filter by embedding similarity (use the same logic as for candidates)

In [104]:
jobs = pd.read_csv('../data/djinni/jobs.csv')
jobs.head()

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published
0,104.ua приглашает в команду разработки Платфор...,104.ua приглашает в команду разработки Платфор...,Региональная газовая компания,2y,QA Automation,,2022-03-01T00:00:00+02:00
1,10 + Blockchain Nodes / Masternodes to set up,*Requirements*\r\n\r\nWe're looking for a long...,MyCointainer,2y,Sysadmin,intermediate,2020-10-01T00:00:00+03:00
2,10 .NET Developers (Middle and Senior level),"Greetings! My name is Maria, I am in urgent ne...",TechScout.tech,2y,.NET,intermediate,2022-03-01T00:00:00+02:00
3,"10X Engineer (co-founder, #4 employee, USD 11-...",**Product**\r\nThe product is a live video cha...,Innoteka,5y,JavaScript,fluent,2021-07-01T00:00:00+03:00
4,16 - Amazon Brand Manager,"Currently, TCM expanding its activities to Ukr...",FirstFive,2y,Marketing,upper,2022-01-01T00:00:00+02:00


In [105]:
df_candidates.shape, jobs.shape

((280366, 9), (443458, 7))

## Postion

In [106]:
len(df_candidates['Position_cleaned'].unique()), len(jobs['Position'].unique())

(64694, 160827)

In [107]:
jobs['Position'].value_counts().head(20)

Position
DevOps Engineer            3744
Project Manager            3513
Senior Java Developer      3058
Business Analyst           2700
QA Engineer                1959
Java Developer             1945
Senior DevOps Engineer     1932
IT Recruiter               1547
PHP Developer              1518
Manual QA Engineer         1464
Android Developer          1319
Python Developer           1256
Senior Python Developer    1219
Sales Manager              1207
Middle Java Developer      1148
UI/UX Designer             1140
Data Engineer              1094
React Native Developer     1060
Product Manager            1029
Node.js Developer           953
Name: count, dtype: int64

In [112]:
jobs['Position'].value_counts().tail(20)

Position
Magento Developer (Копенгаген)                  1
Magento developer (Sweden)                      1
Magento developer Senior                        1
Magento Developer (Saudi Arabia)                1
Magento Developer (Poland / Ukraine)            1
Magento Developer, Poland                       1
Magento Developer (Poland)                      1
Magento Developer needed                        1
Magento Developer (Middle to Senior)            1
Magento Developer (Middle-Senior)               1
Magento developer (Middle)                      1
Magento developer (marketplace)                 1
Magento developer (In-house)                    1
Magento Developer (Full stack)                  1
Magento Developer Frontend/Backend              1
Magento developer for US company                1
Magento Developer - ERP Product Integrations    1
Magento Developer - English Speaking            1
Magento Developer (backend)                     1
Юрист (формат часткової зайнятості)      

In [113]:
# show empty positions
jobs[jobs['Position'].isnull()]

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published


## Long Description

In [114]:
# number of nulls in Long Description
print('Number of nulls in Long Description: ', jobs['Long Description'].isnull().sum())

Number of nulls in Long Description:  8


In [115]:
jobs[jobs['Long Description'].isnull()]

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published
75270,Front-end developer,,Екстракод,1y,JavaScript,,2022-05-01T00:00:00+03:00
81322,Front-end Developer в Par Soft,,Par Soft,1y,JavaScript,,2021-05-01T00:00:00+03:00
96421,Full stack with Laravel or Symfony,,Make it in UA,3y,PHP,intermediate,2021-04-01T00:00:00+03:00
108689,HR менеджер,,CyberBionic Systematics,no_exp,HR,intermediate,2022-01-01T00:00:00+02:00
145562,Junior QA Engineer,,QA Madness,no_exp,QA,intermediate,2021-11-01T00:00:00+02:00
424619,Unity3D developer for console porting (Switch ...,,Restless Corp,1y,Unity,upper,2020-08-01T00:00:00+03:00
426499,User Acquisition Manager для iOS продукта,,ABC Mobile,1y,Product Manager,,2022-11-01T00:00:00+02:00
430843,Web/Frontend (Angular) Software Engineer,,Smartbox,3y,JavaScript,intermediate,2021-03-01T00:00:00+02:00


In [116]:
# min, max and mean lean of Long Description
print('Min length of Long Description: ', jobs['Long Description'].str.len().min())
print('Max length of Long Description: ', jobs['Long Description'].str.len().max())
print('Mean length of Long Description: ', jobs['Long Description'].str.len().mean())

Min length of Long Description:  51.0
Max length of Long Description:  14182.0
Mean length of Long Description:  1833.3905085127974


In [132]:
# quarties of Long Description length
jobs['Long Description'].str.len().quantile([0.05, 0.25, 0.5, 0.75, 0.95, 0.999])

0.050     606.000
0.250    1144.000
0.500    1653.000
0.750    2319.000
0.950    3670.000
0.999    6782.551
Name: Long Description, dtype: float64

In [126]:
# show all which length of Long Description less than 0.05 quantile
jobs[jobs['Long Description'].str.len() < jobs['Long Description'].str.len().quantile(0.05)]

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published
1,10 + Blockchain Nodes / Masternodes to set up,*Requirements*\r\n\r\nWe're looking for a long...,MyCointainer,2y,Sysadmin,intermediate,2020-10-01T00:00:00+03:00
40,1c8 programmer,У нас устойчивый диверсифицированный бизнес - ...,Днепропластавтомат,3y,Project Manager,,2022-02-01T00:00:00+02:00
53,1C developer,**Обязанности:**\r\n- разработка на платформе ...,Новая почта,1y,Other,,2022-02-01T00:00:00+02:00
114,1C программист,Функциональные обязанности:\r\nРазработка и по...,ad.ua,1y,Business Analyst,,2021-12-01T00:00:00+02:00
136,1 to 4 Senior Backend .NET Developers,What we definitely expect from you:\r\nBack En...,AOG.jobs,5y,.NET,upper,2021-09-01T00:00:00+03:00
...,...,...,...,...,...,...,...
443391,"Шукаємо ReactNative розробника (Node.js, Rest ...",**Завдання:**\r\n- Кодити.\r\n\r\n**Додаткові ...,SimpleSmart,1y,JavaScript,pre,2022-08-01T00:00:00+03:00
443394,"Шукаємо Yii2 розробника (PHP, Yii2, JS, jQuery)","Шукаємо на проект - PR Motion, сайт для розкру...",inDev,2y,PHP,pre,2023-08-01T00:00:00+03:00
443395,Шукаємо гуру мобільного дизайна,Шукаємо крутого дизайнера по мобільним додатка...,Raccoon.Recovery,3y,Design,,2022-10-01T00:00:00+03:00
443399,Шукаємо С# розробника,**Основне завдання:**\r\n- Створення та підтри...,Etnocode,1y,.NET,,2023-02-01T00:00:00+02:00


In [138]:
# show all which length of Long Description higher than 0.99 quantile
jobs[jobs['Long Description'].str.len() > jobs['Long Description'].str.len().quantile(0.99)]

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published
39,1940 Middle Java Developer + Welcome Bonus!,"Work at Exadel - Who We Are:\r\nSince 1998, Ex...",Exadel,2y,Java,intermediate,2021-11-01T00:00:00+02:00
717,2D Artist,Мы давно рассматривали в компании развите Gami...,LeaSoft,1y,Other,intermediate,2021-10-01T00:00:00+03:00
1602,"3D Artist (""Instant War"" game) / 3D Художник (...",We are looking for a 3D Artist to join the dev...,Playwing Ukraine,3y,Artist,upper,2022-11-01T00:00:00+02:00
1603,"3D Artist (""Instant War"" game) / 3D Художник (...",We are looking for a 3D Artist to join the dev...,Playwing Ukraine,3y,Artist,upper,2022-12-01T00:00:00+02:00
1798,3D Environment Artist,We are looking for a 3D Environment Artist to ...,Playwing,1y,Artist,upper,2022-09-01T00:00:00+03:00
...,...,...,...,...,...,...,...
442917,Технічний керівник / Python Tech Lead,Enestech розробила SaaS рішення - SENET - для...,Techiia,5y,Python,intermediate,2023-01-01T00:00:00+02:00
443118,Фахівець з фінансового моніторингу (відповідал...,"Привіт, ми FinTech компанія **MyCredit**. Зара...",MyCredit,3y,Other,,2023-04-01T00:00:00+03:00
443126,Фахівець по роботі з клієнтами / Customer Succ...,Наша команда “Customer Success” (3 фахівці) пр...,TECHIIA Holding,no_exp,Support,fluent,2023-01-01T00:00:00+02:00
443447,Юрист-международник (Netpeak Group),"Наша вакансия юриста-международника для тех, к...",Netpeak Group,1y,Other,upper,2021-09-01T00:00:00+03:00


In [139]:
# length of all Long Description
print('Length of all Long Description: ', len(jobs['Long Description']))

# length unique Long Description
print('Length unique Long Description: ', len(jobs['Long Description'].unique()))


Length of all Long Description:  443458
Length unique Long Description:  401848


In [140]:
# drop Long Description duplicates
jobs.drop_duplicates(subset=['Long Description'])

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published
0,104.ua приглашает в команду разработки Платфор...,104.ua приглашает в команду разработки Платфор...,Региональная газовая компания,2y,QA Automation,,2022-03-01T00:00:00+02:00
1,10 + Blockchain Nodes / Masternodes to set up,*Requirements*\r\n\r\nWe're looking for a long...,MyCointainer,2y,Sysadmin,intermediate,2020-10-01T00:00:00+03:00
2,10 .NET Developers (Middle and Senior level),"Greetings! My name is Maria, I am in urgent ne...",TechScout.tech,2y,.NET,intermediate,2022-03-01T00:00:00+02:00
3,"10X Engineer (co-founder, #4 employee, USD 11-...",**Product**\r\nThe product is a live video cha...,Innoteka,5y,JavaScript,fluent,2021-07-01T00:00:00+03:00
4,16 - Amazon Brand Manager,"Currently, TCM expanding its activities to Ukr...",FirstFive,2y,Marketing,upper,2022-01-01T00:00:00+02:00
...,...,...,...,...,...,...,...
443453,Юрист-міжнародник,LIGA ZAKON — продуктова IT-компанія — українсь...,LIGA ZAKON,2y,Other,upper,2023-03-01T00:00:00+02:00
443454,Юрист-міжнародник,"Компанія, що займається юридичним супроводом б...",,1y,Other,upper,2023-04-01T00:00:00+03:00
443455,Юрист по договорной работе,Мы ищем в команду Юриста по договорной работе....,C4R,2y,Other,,2021-04-01T00:00:00+03:00
443456,"Юрист у сфері It, nft","Ми створюємо новий NFT проєкт, де колекції з т...",HYPELABS,3y,Other,upper,2022-06-01T00:00:00+03:00


In [None]:
# show column duplicates
df_candidates[df_candidates['CV'].duplicated(keep=False)].sort_values('CV').head(20)

## Company Name

In [141]:
# uniqu company names
print('Unique company names: ', len(jobs['Company Name'].unique()))

Unique company names:  15132


In [142]:
# check nulls in Company Name
print('Number of nulls in Company Name: ', jobs['Company Name'].isnull().sum())

Number of nulls in Company Name:  1881


In [143]:
# show all which Company Name is null
jobs[jobs['Company Name'].isnull()]

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published
454,2D Animator,Обязанности:\r\n- создание 2d анимации (персон...,,2y,Unity,,2021-04-01T00:00:00+03:00
699,2D Artist,Задачи:\r\n\r\n-разработка 2D-графики для виде...,,1y,Other,,2021-06-01T00:00:00+03:00
700,2D Artist,Задачи:\r\n- создание графики для мобильных иг...,,2y,Other,,2020-10-01T00:00:00+03:00
701,2D Artist,Задачи:\r\n- создание графики для мобильных иг...,,2y,Other,,2021-06-01T00:00:00+03:00
736,2D Artist,Необходимые навыки:\r\nУверенное знание графич...,,3y,Design,intermediate,2022-03-01T00:00:00+02:00
...,...,...,...,...,...,...,...
442462,Стажер-аналітик облікових систем,Необхідні навички\r\n• повна вища освіта або с...,,no_exp,SQL,intermediate,2021-08-01T00:00:00+03:00
443254,Фінансовий менеджер (у відділ маркетингу),Готові взяти людину з мінімальним досвідом та ...,,1y,Marketing,intermediate,2023-06-01T00:00:00+03:00
443255,Фінансовий менеджер (у відділ маркетингу),Готові взяти людину з мінімальним досвідом та ...,,1y,Other,,2023-06-01T00:00:00+03:00
443401,Шукаю розробника з досвідом в WebRTC\RingRTC,Ми розробляэм мессенджер на базі Сигнал мессен...,,3y,Rust,intermediate,2023-04-01T00:00:00+03:00


In [12]:
import pandas as pd

df = pd.read_csv('../data/prepared_jobs/intermediate_jobs_prepared.csv')
df.head()

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published,Long Description_lang,id
0,10 + Blockchain Nodes / Masternodes to set up,*Requirements*\r\n\r\nWe're looking for a long...,MyCointainer,2y,Sysadmin,intermediate,2020-10-01T00:00:00+03:00,en,c0ca96e7-85df-50df-a64e-d934cd02a170
1,10 .NET Developers (Middle and Senior level),"Greetings! My name is Maria, I am in urgent ne...",TechScout.tech,2y,.NET,intermediate,2022-03-01T00:00:00+02:00,en,64f4b7ea-36e4-5bdd-a8b1-185f32f7dc7f
2,"10X Engineer (co-founder, #4 employee, USD 11-...",**Product**\r\nThe product is a live video cha...,Innoteka,5y,JavaScript,fluent,2021-07-01T00:00:00+03:00,en,b9a1303e-dd0c-5ed1-8f62-be2bc4c7da4f
3,16 - Amazon Brand Manager,"Currently, TCM expanding its activities to Ukr...",FirstFive,2y,Marketing,upper,2022-01-01T00:00:00+02:00,en,99cb3f4a-9b4b-53d9-9a3b-bab2c22da346
4,16 - Amazon Brand Manager,"Currently, TCM expanding its activities to Ukr...",FirstFive,3y,SEO,upper,2022-01-01T00:00:00+02:00,en,ae75d54a-9fbd-5b1c-8901-41e3e656b64e


In [14]:
df[df['Long Description_lang'] == 'uk']#.head()

Unnamed: 0,Position,Long Description,Company Name,Exp Years,Primary Keyword,English Level,Published,Long Description_lang,id
30,1c developer,Компанія співпрацює з такими лідерами фарма і...,AboutHR,2y,Other,,2021-06-01T00:00:00+03:00,uk,0e8d1ddb-b002-53c2-a761-20406026f153
31,1C developer,"IT компанія “Нова Пошта Діджитал”, що входить ...",Nova Poshta Digital,3y,Other,,2023-03-01T00:00:00+02:00,uk,3f65951b-c5c3-5472-a39b-6bbf516addc1
32,1C developer,Вимоги до успішного кандидата:\r\n— Програміст...,Zakaz.ua,3y,Other,,2022-02-01T00:00:00+02:00,uk,741d9555-9324-54ca-af5b-de4887b0f978
33,1C developer,ОККО запрошує стати частиною потужної команди ...,OKKO Group,3y,Other,,2021-07-01T00:00:00+03:00,uk,5002f118-575e-5325-83c5-8075e5554bdf
34,1C Developer,**Everad** — міжнародна CPA-мережа з прямим ре...,Everad,3y,Other,,2023-04-01T00:00:00+03:00,uk,70a9a45b-302f-59a5-a4e1-ee0da427a5f1
...,...,...,...,...,...,...,...,...,...
358486,"Юрист (ІТ, супровід діяльності міжнародної ком...",Вимоги:\r\n- досвід роботи з договорами у сфер...,Wallet Factory,3y,Other,upper,2021-09-01T00:00:00+03:00,uk,5521dd76-3cfa-5a93-a0f7-5a228d53c3da
358487,Юрист-міжнародник,LIGA ZAKON — продуктова IT-компанія — українсь...,LIGA ZAKON,3y,Other,fluent,2023-02-01T00:00:00+02:00,uk,92de0520-f910-5b82-b958-d3e439e325fa
358488,Юрист-міжнародник,LIGA ZAKON — продуктова IT-компанія — українсь...,LIGA ZAKON,2y,Other,upper,2023-03-01T00:00:00+02:00,uk,7a673fb2-778d-5321-8c3b-4fb9bfa994c6
358489,"Юрист у сфері It, nft","Ми створюємо новий NFT проєкт, де колекції з т...",HYPELABS,3y,Other,upper,2022-06-01T00:00:00+03:00,uk,821f47e4-1264-5d2a-b4c9-49467ec86045


In [4]:
df.tail()

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,CV,CV_lang,id
234891,Юрист,-Providing legal support to management to ensu...,,,Other,fluent,5.0,\n-Providing legal support to management to en...,en,e15bc2c9-62b4-55aa-a283-7f94bf8d1262
234892,Юрист,Supported IT startups in establishing legal an...,,,Other,fluent,1.5,\nSupported IT startups in establishing legal ...,en,83f0a284-238a-5d53-86b2-b2b1c6278a01
234893,"юрист, адвокат,Lawyer,Legal Counsel, Barrister",Description of duties and related accomplishme...,"Мy greetings, colleagues!!\r\nI will be glad t...","I’m a Lawyer, who is interested in developing ...",Other,intermediate,11.0,"I’m a Lawyer, who is interested in developing ...",en,bf451c72-6bb0-5543-a53d-5fe7a423132d
234894,Юрист з інтелектуальної власності,I am an IP Lawyer with a three years’ experien...,,I have experience in various intellectual prop...,Lead,upper,3.0,I have experience in various intellectual prop...,en,ca3b4b86-7d78-5154-b940-cea1f5aedb94
234895,Юрист/Старший юрист/Керівник відділу,I am international lawyer with 9+ years of exp...,,"Two years ago, I decided to upgrade myself in ...",Other,fluent,9.0,"Two years ago, I decided to upgrade myself in ...",en,0fbd5951-0d4c-5df9-84f4-9df674842c30


In [10]:
df[df.CV_lang == 'uk']

Unnamed: 0,Position,Moreinfo,Looking For,Highlights,Primary Keyword,English Level,Experience Years,CV,CV_lang,id
0,1c,"Розробник 1с/BAS, всі українські конфігурації,...",,,Other,intermediate,11.0,"\nРозробник 1с/BAS, всі українські конфігураці...",uk,dcaeffc4-1d1f-5754-a567-f478f0c8aa6f
1,1c developer,Еко - маркет Торгівля роздрібна 1 рік 9 місяці...,,Hillel IT school Java Basic and Java Pro 2022\...,SQL,pre,11.0,Hillel IT school Java Basic and Java Pro 2022\...,uk,2767ca2d-4bca-5b41-b9e0-e42ad5700b6f
2,1C developer (purpose: switch to Golang/Ruby),Займаюся автоматизацією підприємств вже понад ...,"Найголовніше - це атмосфера в колективі, також...","Самостійно приймати рішення, брати відповідаль...",Ruby,upper,7.0,"Самостійно приймати рішення, брати відповідаль...",uk,ff07df1f-5268-5865-8f30-1cc660786494
3,1C програміст,Шукаю роботу з метою розвитку та вдосконалення...,В першу чергу мене цікавить можливість розвитк...,"Пройшов курси: «1С за 21 день, 1С 8.3» та «1С ...",SQL,intermediate,0.5,"Пройшов курси: «1С за 21 день, 1С 8.3» та «1С ...",uk,da4b79b1-fbc5-5dfe-bba0-58339f262d11
4,1C програміст,Шукаю роботу по напрямку junior С++. Маю досві...,"Цікаві проекти, саморозвиток.",,Data Engineer,intermediate,6.0,\nШукаю роботу по напрямку junior С++. Маю дос...,uk,b118a952-cbe6-58ac-824a-10de5322570c
...,...,...,...,...,...,...,...,...,...,...
24283,Юрист/Lawyer (Адвокат),8 років досвіду в юридичному секторі. Адвокат ...,,,Other,pre,8.0,\n8 років досвіду в юридичному секторі. Адвока...,uk,a8a3aa6c-894a-5390-a076-ff1ae9f2abcb
24284,Юрист з договірного права,Маю досвід роботи у сфері договірного права. Д...,,,Other,intermediate,11.0,\nМаю досвід роботи у сфері договірного права....,uk,fa145b67-9fb4-5898-a0f4-0f90367a0202
24285,"Юрист, Помощник Юриста","Бухгалтер з первинної документації, міжнародна...",В мене є досвід співпраці з адвокатською орган...,Диплом магістра з правознавства з відзнакою;\r...,Other,intermediate,0.0,Диплом магістра з правознавства з відзнакою;\r...,uk,196c8d39-b227-5519-b57c-3a08668bded0
24286,"Юрист, юрисконсульт",Проводила державну реєсуацію юридичних фактів ...,,,Other,no_english,6.0,\nПроводила державну реєсуацію юридичних факті...,uk,e3d21e73-e3f4-51e2-8344-01323c125863


In [9]:
df.CV[234891]

"\n-Providing legal support to management to ensure the Company’s activity complies with the relevant legislation and its rights are protected;\r\n- Developing policies and procedures of the Company;\r\n- Developing, reviewing and examining commercial contracts and relevant documents;\r\n- Managing intellectual property issues;\r\n- Legal expertise of marketing and sales departments' documents to ensure compliance with antitrust legislation;\r\n- Managing corporate procedures: decrease of share capital, preparation of necessary documents for General Meetings of Shareholders, preparation of Supervisory Council documents;\r\n- Conducting training lectures for company employees;\r\n- Managing the activity of the Legal department.\n"