In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import joblib

In [24]:
queries_df = pd.read_csv('dataset_users_query.csv')
answers_df = pd.read_csv('answers.csv')

queries_df['query'] = queries_df['query'].str.lower().str.strip()
answers_df['query'] = answers_df['query'].str.lower().str.strip()

employment_df = answers_df.drop(['Unnamed: 0', 'по должности-лемме', 'по дополнительному признаку', 'по условиям', 'общие фразы'], axis = 1)
position_df = answers_df.drop(['Unnamed: 0', 'занятость', 'по дополнительному признаку', 'по условиям', 'общие фразы'], axis = 1)
additance_df = answers_df.drop(['Unnamed: 0', 'занятость', 'по должности-лемме', 'по условиям', 'общие фразы'], axis = 1)
conditions_df = answers_df.drop(['Unnamed: 0', 'занятость', 'по должности-лемме', 'по дополнительному признаку', 'общие фразы'], axis = 1)
phrase_df = answers_df.drop(['Unnamed: 0', 'занятость', 'по должности-лемме', 'по дополнительному признаку', 'по условиям'], axis = 1)

In [25]:
employment_keywords = list(answers_df['занятость'].unique()[1:])
position_keywords = list(answers_df['по должности-лемме'].unique()[1:])
additance_keywords = list(answers_df['по дополнительному признаку'].unique()[1:])
conditions_keywords = list(answers_df['по условиям'].unique()[1:])
common_phrases = ['работа', 'вакансии']

In [26]:
encoder = LabelEncoder()

position_df['category_encoded'] = encoder.fit_transform(position_df['по должности-лемме'])

In [27]:
label_dict = {i: class_name for i, class_name in enumerate(encoder.classes_)}

In [28]:
vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(position_df['query'])
y = position_df['category_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
model_query = LogisticRegression(max_iter=1000)
model_query.fit(X_train, y_train)

In [30]:
y_pred = model_query.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       1.00      0.33      0.50         9
           6       0.00      0.00      0.00         1
           7       1.00      0.79      0.88        24
           8       0.00      0.00      0.00         2
           9       1.00      0.25      0.40         4
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         1
          13       1.00      0.75      0.86         8
          14       0.00      0.00      0.00         1
          15       1.00      0.20      0.33         5
          16       1.00      0.80      0.89         5
          17       0.00      0.00      0.00         1
          18       1.00      0.17      0.29         6
          19       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         3
          21       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
joblib.dump(model_query, 'position_model.pkl')

['position_model.pkl']

In [33]:
joblib.dump(vectorizer, 'position_vectorizer.pkl')

['position_vectorizer.pkl']