In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from LogisticRegression_task import *

In [63]:
# Загружаем данные
train = pd.read_csv('binary_clf_data.csv')
test = pd.read_csv('dataset_527992_9.txt')

# Оставляем только нужные колонки
train = train[['category_name', 'subcategory_name', 'param1', 'param2', 'gender', 'user_id']]
test = test[['category_name', 'subcategory_name', 'param1', 'param2', 'user_id']]

# Переводим категории в OHE,
# handle_unknown='ignore' помогает решить проблема появления новых категорий в тесте 
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
train_enc = enc.fit_transform(train[['category_name', 'subcategory_name', 'param1', 'param2']])
test_enc = enc.transform(test[['category_name', 'subcategory_name', 'param1', 'param2']])

train = pd.concat([train[['gender', 'user_id']], pd.DataFrame(train_enc, columns=[name[3:] for name in enc.get_feature_names()])], axis=1)
test = pd.concat([test['user_id'], pd.DataFrame(test_enc, columns=[name[3:] for name in enc.get_feature_names()])], axis=1)

# Агрегируем по user_id и gender, а reset_index() поможет оставить эти столбцы в нормальном виде
train = train.groupby(['user_id', 'gender']).sum().reset_index()
test = test.groupby(['user_id']).sum().reset_index()

# Закодируем gender
train['gender'] = train['gender'].apply(lambda x: 1 if x == 'male' else 0)

# Делим выборку
X_train, X_val, y_train, y_val = train_test_split(train.drop(['gender', 'user_id'], axis=1), train['gender'], test_size=0.25)

In [70]:
clf = LogisticRegression(max_iter=24e3, lr=0.015, tol=0.001, reg_coef=6e-4, print_every=3000)
clf.fit(np.array(X_train), np.array(y_train).reshape(-1, 1), np.array(X_val), np.array(y_val).reshape(-1, 1))

3000 completed. accuracy on train: 0.7063326374391092,             val: 0.6784968684759917,  grad norm: 0.02568150394257666
6000 completed. accuracy on train: 0.7265135699373695,             val: 0.6889352818371608,  grad norm: 0.018255402031112856
9000 completed. accuracy on train: 0.7383437717466945,             val: 0.7118997912317327,  grad norm: 0.014558050932321558
12000 completed. accuracy on train: 0.7432150313152401,             val: 0.7098121085594989,  grad norm: 0.012283506915207845
15000 completed. accuracy on train: 0.7494780793319415,             val: 0.7139874739039666,  grad norm: 0.010705204699119498
18000 completed. accuracy on train: 0.7543493389004872,             val: 0.7181628392484343,  grad norm: 0.009520174976905068
21000 completed. accuracy on train: 0.756437021572721,             val: 0.7202505219206681,  grad norm: 0.008581111967909668
24000 completed. accuracy on train: 0.7543493389004872,             val: 0.7118997912317327,  grad norm: 0.0078083693329071

<LogisticRegression_task.LogisticRegression at 0x213265ca640>

In [67]:
predictions = clf.predict(np.array(test.drop('user_id',axis=1)))
test['gender'] = predictions
test['gender'] = test['gender'].apply(lambda x: 'male' if x == 1 else 'female')
test[['user_id', 'gender']].to_csv('test_predictions.csv', index=False)