In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import re
from string import punctuation
from pymystem3 import Mystem
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [None]:
# Download the NLTK "stopwords" corpus; the stop-word list below is read from it.
nltk.download("stopwords")

# Russian text-normalisation helpers: pymystem3 analyser, Snowball stemmer
# and the NLTK stop-word list.
mystem = Mystem()
snowball = SnowballStemmer(language="russian")
russian_stopwords = stopwords.words("russian")

In [None]:
# Location of the hackathon data set.
#
# The original author's folder stays the default, so the notebook behaves
# exactly as before on that machine. Anyone else can point the notebook at
# their own copy by setting the HACK_DATA_DIR environment variable instead of
# editing this cell.
_DEFAULT_DATA_PATH = 'C:\\Users\\zer0nu11\\Documents\\grad\\skolkovohack2022\\data\\for_hack_2022\\'
data_path = os.environ.get('HACK_DATA_DIR') or _DEFAULT_DATA_PATH

# Later cells build file names with plain string concatenation
# (data_path + 'file.csv'), so the folder must end with a path separator.
if not data_path.endswith(('\\', '/')):
    data_path += os.sep

# The test files live in the "test" sub-folder of the data folder; reuse the
# separator style already present in data_path so the default value is unchanged.
test_path = data_path + 'test' + data_path[-1]

In [None]:
# Column names passed to pd.read_csv(names=...) for the files read below.
jobs_labels = [
    'JobId',
    'Status',
    'Name',
    'Region',
    'Description',
]
# The test jobs file is read with three extra columns, named nan1..nan3 here.
test_jobs_labels = jobs_labels + ['nan1', 'nan2', 'nan3']

candidates_labels = [
    'CandidateId', 'Position', 'Sex', 'Citizenship', 'Age', 'Salary',
    'Langs', 'DriverLicense', 'Subway', 'Skills', 'Employment', 'Schedule',
    'CandidateRegion', 'DateCreated', 'JobId', 'CandidateStatusId', 'Status',
]
candidates_education_labels = [
    'CandidateId',
    'University',
    'Faculty',
    'GraduateYear',
]
candidates_workplaces_labels = [
    'CandidateId',
    'Position',
    'FromYear',
    'FromMonth',
    'ToYear',
    'ToMonth',
]

# Training tables: ';'-separated files, column names supplied explicitly.
data_jobs = pd.read_csv(
    data_path+'data_jobs.csv',
    sep=';',
    names=jobs_labels,
)
data_candidates_workplaces = pd.read_csv(
    data_path+'data_candidates_work_places.csv',
    sep=';',
    names=candidates_workplaces_labels,
)
data_candidates_education = pd.read_csv(
    data_path+'data_candidates_education.csv',
    sep=';',
    names=candidates_education_labels,
)
data_candidates = pd.read_csv(
    data_path+'data_candidates.csv',
    sep=';',
    names=candidates_labels,
)

# Test tables: only the jobs file gets explicit names; the three candidate
# files are read without names=, so pandas takes their column names from the
# files themselves.
test_jobs = pd.read_csv(
    test_path+'test_jobs.csv',
    sep=';',
    names=test_jobs_labels,
)
test_candidates_workplaces = pd.read_csv(
    test_path+'test_candidates_workplaces.csv',
    sep=';',
)
test_candidates_education = pd.read_csv(
    test_path+'test_candidates_education.csv',
    sep=';',
)
test_candidates = pd.read_csv(
    test_path+'test_candidates.csv',
    sep=';',
)

In [None]:
# Missing data: replace every NaN with an empty string, then drop the listed
# columns from each table.
# NOTE: each frame is overwritten in place, so re-running this cell on the
# already-trimmed frames raises KeyError (the dropped columns are gone).
data_jobs = data_jobs.fillna('').drop(columns=['Status'])
data_candidates_workplaces = (
    data_candidates_workplaces
    .fillna('')
    .drop(columns=['FromYear','FromMonth','ToYear','ToMonth'])
)
data_candidates_education = (
    data_candidates_education
    .fillna('')
    .drop(columns=['GraduateYear','University'])
)
data_candidates = data_candidates.fillna('')

# Same treatment for the test tables; test_jobs also loses its nan1..nan3 columns.
test_jobs = test_jobs.fillna('').drop(columns=['Status','nan1','nan2','nan3'])
test_candidates_workplaces = (
    test_candidates_workplaces
    .fillna('')
    .drop(columns=['FromYear','FromMonth','ToYear','ToMonth'])
)
test_candidates_education = (
    test_candidates_education
    .fillna('')
    .drop(columns=['GraduateYear','University'])
)
test_candidates = test_candidates.fillna('')

In [None]:
# Sanity check: the train and test tables must expose the same columns in the
# same order.
# Column lists are compared (rather than `(a.columns == b.columns).all()`)
# because the element-wise comparison raises ValueError when the two frames
# have a different number of columns, instead of failing the assertion.
assert list(data_jobs.columns) == list(test_jobs.columns), \
    'jobs: train/test columns differ'
# The candidates tables are deliberately not asserted equal: the training frame
# has more columns than the test frame because of its relation to job_id.
# The extra training-only columns are printed instead.
print('Train/Test columns difference:\n',list(data_candidates.columns[~data_candidates.columns.isin(test_candidates.columns)]))
assert list(data_candidates_education.columns) == list(test_candidates_education.columns), \
    'candidates education: train/test columns differ'
assert list(data_candidates_workplaces.columns) == list(test_candidates_workplaces.columns), \
    'candidates workplaces: train/test columns differ'

In [None]:
def preprocess_signs(text):
    """Lower-case ``text`` and replace markup and punctuation signs with spaces.

    Steps, in order:
      1. lower-case the string;
      2. replace every ``<...>`` span (shortest match, within one line) with a space;
      3. replace each of the signs ``_ + * # № " - = ? & ^ . ; , > ( ) / : \\``
         with a space;
      4. collapse runs of two or more spaces into a single space;
      5. strip leading and trailing whitespace.

    Other characters (for example ``!``, ``|`` or tabs) are left untouched.
    Lemmatisation, stemming and stop-word removal are NOT applied here.

    Args:
        text: the string to clean; must support ``str.lower()``.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', " ", lowered)
    # Inside a character class "+" is a literal plus sign, not a quantifier.
    without_signs = re.sub(r'[\_+\*+\#+\№\"\-+\+\=+\?+\&\^\.+\;\,+\>+\(\)\/+\:\\+]', " ", without_tags)
    single_spaced = re.sub(r'[ ]{2,}',' ',without_signs)
    return single_spaced.strip()

In [None]:
# Clean every free-text column of the train and test tables with
# preprocess_signs.
#
# Series.map is used column by column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases. A single (frame, columns) table also
# replaces the copy-pasted column lists, so each list is written only once.
_candidate_text_columns = ['Position', 'Citizenship', 'Langs', 'DriverLicense',
                           'Subway', 'Skills', 'CandidateRegion']

for _frame, _text_columns in (
    (data_jobs, ['Name', 'Region', 'Description']),
    (data_candidates_workplaces, ['Position']),
    (data_candidates_education, ['Faculty']),
    # 'Status' is cleaned only in the training candidates table.
    (data_candidates, _candidate_text_columns + ['Status']),
    (test_jobs, ['Name', 'Region', 'Description']),
    (test_candidates_workplaces, ['Position']),
    (test_candidates_education, ['Faculty']),
    (test_candidates, _candidate_text_columns),
):
    for _column in _text_columns:
        # Column assignment updates the frame object itself, exactly like the
        # original `frame[[...]] = frame[[...]].applymap(...)` statements.
        _frame[_column] = _frame[_column].map(preprocess_signs)

In [None]:
# Inspect the distinct job descriptions of the training jobs table.
data_jobs['Description'].unique().tolist()

# Alternatives that were inspected from this cell (kept disabled):
# list(test_jobs.Description.unique())
# data_candidates_workplaces / test_candidates_workplaces
# data_candidates_education / test_candidates_education
# data_candidates / test_candidates

In [None]:
# Display the training candidates frame (the bare last expression is the cell output).
# Disabled experiment: merge the education rows onto the candidates by CandidateId.
# data_candidates = data_candidates.merge(data_candidates_education, left_on='CandidateId', right_on='CandidateId')
data_candidates
# Other frames that can be displayed from this cell instead:
# data_candidates_education
# data_candidates_workplaces

In [None]:
# Exploration: gather all Faculty values recorded for each candidate.
df = data_candidates_education
# Distinct values of "number of education rows per candidate".
# NOTE(review): x is only referenced by the disabled filter below, and it is
# the result of .unique(), not the value_counts() Series, so `x.index` /
# `x.gt` would need the Series instead. Confirm before re-enabling.
x = df['CandidateId'].value_counts().unique()
# One row per candidate, with Faculty turned into a Python list.
df = df.groupby('CandidateId')['Faculty'].apply(list).reset_index()
# Show the faculty list(s) of one sample candidate.
df.loc[df['CandidateId'] == 432, 'Faculty'].tolist()
# Disabled attempts to list candidates with several education rows:
# df[df.Faculty.shape[0]>1]
# df[df.CandidateId.isin(x.index[x.gt(2)])]

In [None]:
# Distinct candidate regions present in the training candidates table.
data_candidates['CandidateRegion'].unique()

In [128]:
# Display the training candidates frame; the table printed below this cell is its output.
data_candidates

Unnamed: 0,CandidateId,Position,Sex,Citizenship,Age,Salary,Langs,DriverLicense,Subway,Skills,Employment,Schedule,CandidateRegion,DateCreated,JobId,CandidateStatusId,Status
0,7435,водитель экспедитор,2,россия,21,0,,,,,Full,Full,санкт петербург,2014-01-15 00:00:00.0000000,163,1425,отклонен
1,7445,водитель экспедитор,2,россия,23,0,,,,,Full,Full,санкт петербург,2014-01-15 00:00:00.0000000,163,1425,отклонен
2,7450,водитель экспедитор,2,россия,31,0,,,,,Full,Full,санкт петербург,2014-01-15 00:00:00.0000000,163,1425,отклонен
3,7452,водитель экспедитор,2,россия,23,0,,,,,Full,Full,санкт петербург,2014-01-15 00:00:00.0000000,163,1425,отклонен
4,7453,водитель экспедитор,2,россия,27,0,,,,,Full,Full,санкт петербург,2014-01-15 00:00:00.0000000,163,1425,отклонен
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141884,24207796,водитель погрузчика штабелера,2,россия,36,65000,русский родной||,,,,Part,Flex,москва,2022-08-25 10:04:23.0000000,370097,126165,телефонное интервью состоялось
141885,24207796,водитель погрузчика штабелера,2,россия,36,65000,русский родной||,,,,Part,Flex,москва,2022-08-25 10:04:23.0000000,370097,126162,телефонное интервью
141886,24228268,водитель электропогрузчика,2,россия,0,60000,русский родной||,,,водитель погрузчика || водитель электроштабелё...,Part,Flex,москва,2022-08-26 16:10:07.0000000,370097,126165,телефонное интервью состоялось
141887,24228268,водитель электропогрузчика,2,россия,0,60000,русский родной||,,,водитель погрузчика || водитель электроштабелё...,Part,Flex,москва,2022-08-26 16:10:07.0000000,370097,126162,телефонное интервью
