In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data_analytic_description = ''
with open('job_descriptions/job_desc_data_analyst.txt') as f:
    data_analytic_description = f.read()

In [3]:
data_science_description = ''
with open('job_descriptions/job_desc_data_scientist.txt') as f:
    data_science_description = f.read()

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
def stemmer(text):
    stemmer = SnowballStemmer(language='russian')
    tokenizer = nltk.tokenize.WhitespaceTokenizer()

    tokens = [stemmer.stem(w) for w in tokenizer.tokenize(text)]

    return ' '.join(tokens)

In [6]:
def clean_text(text):

    text = text.lower() 

    text = re.sub(r'\n', '', text)

    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    text = text.split()
    useless_words = stopwords.words("english") + stopwords.words("russian") + ['добрый', 'день', 'hi', 'также']

    text_filtered = [word for word in text if not word in useless_words]

    final_string = ' '.join(text_filtered)

    return final_string

In [7]:
def cosine_similarity_score(df, column_name, contents, new_column_name):
    df_desc = pd.DataFrame()
    df_desc[column_name] = df[column_name].values.astype('str')
    contents = clean_text(contents)
    contents = stemmer(contents)
    df_desc = df_desc.append({column_name: contents}, ignore_index=True)
    df_desc = df_desc.fillna(' ')

    descriptions = df_desc[column_name].values
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(descriptions)

    co_sim = cosine_similarity(X, X)

    df[new_column_name] = co_sim[-1][:-1]
    
    return df

In [8]:
data = pd.read_csv('data_preprocessed.csv')

In [9]:
data = cosine_similarity_score(data, 'LastWorkDesc', data_analytic_description, 'Cosine_DA_LastWorkDesc')
data = cosine_similarity_score(data, 'LastWorkDesc2', data_analytic_description, 'Cosine_DA_LastWorkDesc2')
data = cosine_similarity_score(data, 'Description', data_analytic_description, 'Cosine_DA_Description')

  df_desc = df_desc.append({column_name: contents}, ignore_index=True)
  df_desc = df_desc.append({column_name: contents}, ignore_index=True)
  df_desc = df_desc.append({column_name: contents}, ignore_index=True)


In [10]:
data = cosine_similarity_score(data, 'LastWorkDesc', data_science_description, 'Cosine_DS_LastWorkDesc')
data = cosine_similarity_score(data, 'LastWorkDesc2', data_science_description, 'Cosine_DS_LastWorkDesc2')
data = cosine_similarity_score(data, 'Description', data_science_description, 'Cosine_DS_Description')

  df_desc = df_desc.append({column_name: contents}, ignore_index=True)
  df_desc = df_desc.append({column_name: contents}, ignore_index=True)
  df_desc = df_desc.append({column_name: contents}, ignore_index=True)


In [11]:
data = data.drop(['LastWorkDesc', 'LastWorkDesc2', 'Description'], axis=1)

In [12]:
data.head()

Unnamed: 0,ExpPeriod,Salary,Age,Gender,WorkType,WorkSchedule,N_places,Top 10 work,N_changeWork,N_Langs,...,Top 10 Uni,URL,top_city,EduLevelCat,Cosine_DA_LastWorkDesc,Cosine_DA_LastWorkDesc2,Cosine_DA_Description,Cosine_DS_LastWorkDesc,Cosine_DS_LastWorkDesc2,Cosine_DS_Description
0,-1.018449,-0.032825,-0.997974,2.0,1.0,0.0,-0.643175,0.0,-0.228412,-0.290421,...,1.0,https://hh.ru/resume/c107054800051cb16b0039ed1...,1.0,1.0,0.026102,0.082078,0.030898,0.030294,0.062736,0.065384
1,-0.51429,-0.032825,-0.483565,2.0,1.0,0.0,0.350588,0.0,-0.228412,2.157769,...,0.0,https://hh.ru/resume/bebf6a630008069d8c0039ed1...,1.0,1.0,0.03775,0.050829,0.131274,0.031396,0.042304,0.105672
2,-1.34467,-0.032825,0.324791,3.0,1.0,0.0,-0.974429,0.0,-0.228412,-0.290421,...,0.0,https://hh.ru/resume/8c6333c200090aa8b60039ed1...,1.0,1.0,0.09863,0.145485,0.133424,0.062048,0.090375,0.151029
3,-0.099101,-0.032825,-0.189618,2.0,1.0,0.0,1.013097,0.0,-0.228412,0.933674,...,0.0,https://hh.ru/resume/993bd0980002dd6cff0039ed1...,1.0,1.0,0.039134,0.061517,0.036506,0.044509,0.067402,0.037339
4,-1.186502,-0.032825,-1.071461,3.0,1.0,0.0,-0.643175,1.0,3.064879,-0.290421,...,1.0,https://hh.ru/resume/628596ac000657935b0039ed1...,1.0,1.0,0.037656,0.066113,0.079162,0.054239,0.074761,0.104707


In [13]:
data.to_csv('cosine_similarity_df.csv', index=False)