In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics import accuracy_score
import numbers


class LogisticRegression:
    '''
    Binary logistic regression trained with full-batch gradient descent.

    Each step uses an exponential moving average of the gradients (see ``beta``).
    The weight gradient additionally contains an L1 term (``l1_coef * sign(w)``)
    and an L2 term (``l2_coef * w``); the bias is not regularised.
    '''

    def __init__(self, max_iter=1e4, lr=0.002, beta=0.5, tol=0.001, print_every=100, l2_coef=0.5, l1_coef=0.1):

        '''
        max_iter – maximum number of gradient steps
        lr – learning rate (step size)
        beta – smoothing factor of the gradient moving average
               (with beta = 0 the raw gradient is used)
        tol – training stops once the gradient norm is no longer greater than tol
        print_every – metrics are printed every ``print_every`` iterations
        l2_coef – coefficient of the L2 term in the weight gradient
        l1_coef – coefficient of the L1 term in the weight gradient
        '''

        self.max_iter = max_iter
        self.lr = lr
        self.tol = tol
        self.print_every = print_every
        self.l2_coef = l2_coef
        self.l1_coef = l1_coef
        self.beta = beta

        self.weights = None
        self.bias = None

    def fit(self, X_train, y_train, X_val, y_val):

        '''
        Train the model.

        X_train – feature matrix used for training
        y_train – training targets, shape (n, 1), values 0/1

        X_val – feature matrix used for validation (only for printed metrics)
        y_val – validation targets, shape (n, 1), values 0/1

        Returns the fitted model (self).
        Raises ValueError if the inputs fail validation.
        '''

        self.check_binary_clf_X_y(X_train, y_train)
        self.check_binary_clf_X_y(X_val, y_val)

        n, m = X_train.shape
        self.weights = np.zeros((m, 1))
        # The bias starts from the share of positive answers in the training set.
        self.bias = np.mean(y_train)

        # Moving averages of the gradients.
        v_w, v_b = 0, 0

        n_iter = 0
        gradient_norm = np.inf

        while n_iter < self.max_iter and gradient_norm > self.tol:

            dJdw, dJdb = self.grads(X_train, y_train)
            # Norm of the full gradient (weights and bias together) drives the stopping rule.
            gradient_norm = np.linalg.norm(np.hstack([dJdw.flatten(), [dJdb]]))

            v_w = self.beta * v_w + (1 - self.beta) * dJdw
            self.weights = self.weights - self.lr * v_w

            v_b = self.beta * v_b + (1 - self.beta) * dJdb
            self.bias = self.bias - self.lr * v_b

            n_iter += 1

            if n_iter % self.print_every == 0:
                self.print_metrics(X_train, y_train, X_val, y_val, n_iter, gradient_norm)

        return self

    def predict(self, X):

        '''
        Return the predicted class of the objects X as a boolean array
        (True where the probability of class 1 is greater than 0.5).
        '''

        return self.predict_proba(X) > 0.5


    def predict_proba(self, X):

        '''
        Return the probability of class 1 for the objects X.
        '''
        return self.sigmoid(X @ self.weights + self.bias)

    def grads(self, X, y):

        '''
        Compute the gradients with respect to the weights and the bias.

        Returns a tuple (dJdw, dJdb); dJdw has the same shape as self.weights.
        Raises ValueError if the computed gradients have an unexpected type/shape.
        '''
        y_hat = self.predict_proba(X)

        # np.sign is exactly -1/0/+1, so no epsilon is needed to avoid 0/0 at w = 0.
        sign = np.sign(self.weights)
        dJdw = np.mean(X * (y_hat - y), axis=0, keepdims=True).T + self.l1_coef * sign + self.l2_coef * self.weights
        dJdb = np.mean(y_hat - y)

        self.check_grads(dJdw, dJdb)

        return dJdw, dJdb

    @staticmethod
    def sigmoid(x):
        '''
        Sigmoid of x, computed without overflow.

        exp is only evaluated at -|x|, so it never exceeds 1; the two algebraically
        equal forms 1 / (1 + e^-x) and e^x / (1 + e^x) are selected by the sign of x.
        '''
        x = np.asarray(x, dtype=float)
        e = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + e), e / (1 + e))

    def print_metrics(self, X_train, y_train, X_val, y_val, n_iter, gradient_norm):
        '''
        Print train/validation accuracy and the current gradient norm.
        '''

        train_preds = self.predict(X_train)
        val_preds = self.predict(X_val)

        train_acc = accuracy_score(train_preds, y_train)
        val_acc = accuracy_score(val_preds, y_val)

        print(f'{n_iter} completed. accuracy_score on train: {train_acc}, val: {val_acc}, grad_norm: {gradient_norm}')

    def check_grads(self, dJdw, dJdb):
        '''
        Validate the gradients: dJdb must be a real number and dJdw must have
        the same shape as self.weights. Raises ValueError otherwise.
        '''

        if not isinstance(dJdb, numbers.Real):
            raise ValueError(f'Производная по параметру b должна быть действительным'
                             f' числом, как и сам параметр b, а у нас {dJdb} типа {type(dJdb)}')

        if dJdw.shape != self.weights.shape:
            raise ValueError(f'Размерность градиента по параметрам w должна совпадать с самим вектором w, '
                             f'а у нас dJdw.shape = {dJdw.shape} не совпадает с weight.shape = {self.weights.shape}')

    @staticmethod
    def check_binary_clf_X_y(X, y):
        '''
        Validate a feature matrix X and a target column y for binary classification.

        Raises ValueError when X is empty, X or y contains NaN, the lengths differ,
        y is not of shape (n, 1), or the set of values in y is not exactly {0, 1}.
        '''

        if X.shape[0] == 0:
            raise ValueError(f'X и y не должны быть пустыми, а у нас X.shape = {X.shape} и y.shape = {y.shape}')

        if np.isnan(X).any():
            raise ValueError(f'X не должен содержать "not a number" (np.nan)')

        if np.isnan(y).any():
            raise ValueError(f'y не должен содержать "not a number" (np.nan)')

        if X.shape[0] != y.shape[0]:
            raise ValueError(f'Длина X и y должна быть одинаковой, а у нас X.shape = {X.shape}, y.shape = {y.shape}')

        # Checking ndim first turns a 1-D y into the intended ValueError
        # instead of an IndexError from y.shape[1].
        if y.ndim != 2 or y.shape[1] != 1:
            raise ValueError(f'y - вектор ответов должен быть размерности (m, 1), а у нас y.shape = {y.shape}')


        if sorted(np.unique(y)) != [0, 1]:
            raise ValueError(f'Ответы на объектах должны быть только 0 или 1, а у нас np.unique(y) = {np.unique(y)}')


In [None]:
# Load the raw training data from a CSV file.
df = pd.read_csv('binary_clf_data.csv')

# Display the loaded frame for inspection.
df

In [None]:
def refactor_subcat_name(row):
    """Pick the subcategory label for a row.

    Returns ``row.param1`` when it is a string that contains 'муж' or 'жен'
    (case-insensitively); otherwise returns ``row.subcategory_name``.
    """
    detail = row.param1
    if not isinstance(detail, str):
        return row.subcategory_name

    lowered = detail.lower()
    if any(marker in lowered for marker in ('муж', 'жен')):
        return detail
    return row.subcategory_name

def gender_to_num(row):
    """Encode ``row.gender`` as a number: 1 for 'male', 0 for anything else."""
    return 1 if row.gender == 'male' else 0

In [None]:
# Overwrite `subcategory_name` row by row with the value chosen by refactor_subcat_name.
df['subcategory_name'] = df.apply(refactor_subcat_name, axis=1)

In [None]:
# Show only the rows whose `gender` equals 'female'.
df[df['gender'] == 'female']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Dense one-hot encoder that ignores categories unseen during fitting.
# Newer scikit-learn releases name the dense-output switch `sparse_output`
# and no longer accept `sparse`; fall back to the old keyword on older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# Fit the encoder on the `subcategory_name` column and get its one-hot encoded columns.
df_ohe = ohe.fit_transform(df[['subcategory_name']])

In [None]:
# Put the original columns and the one-hot columns side by side;
# the resulting frame has positional integer column labels.
df_extended = pd.DataFrame(np.hstack([df, df_ohe]))
# Unique pairs of columns 0 and 1 (a later cell renames them to 'gender' and 'user_id').
gender = df_extended[[0, 1]].drop_duplicates()

In [None]:
# Inspect the combined frame.
df_extended

In [None]:
# Keep column 1 and the columns after 11, sum the remaining columns for each
# value of column 1, then attach the (column 0, column 1) pairs and give both
# columns readable names.
df_extended = (
    df_extended
    .drop(columns=[0, *range(2, 12)])
    .groupby(1)
    .sum()
    .reset_index()
    .merge(gender, on=1)
    .rename(columns={1: 'user_id', 0: 'gender'})
)

In [None]:
# Split the features (everything except `user_id` and `gender`) and the `gender`
# target into train and validation parts; the fixed random_state makes the split repeatable.
train, val, train_gender, val_gender = train_test_split(df_extended.drop(['user_id', 'gender'], axis=1), df_extended[['gender']], random_state=260401)

In [None]:
# Encode the target as 1 for 'male' and 0 for anything else (same rule as gender_to_num).
# `assign` builds new frames instead of writing into the frames returned by
# train_test_split, and the comparison is vectorised instead of a row-wise apply.
train_gender = train_gender.assign(gender=(train_gender['gender'] == 'male').astype(int))
val_gender = val_gender.assign(gender=(val_gender['gender'] == 'male').astype(int))
train_gender

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the training features; the scaler is fitted on the training split only.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train)

In [None]:
# Grid search over learning rate, gradient smoothing and L1/L2 coefficients.
# The model is trained on standardised features, so the validation features
# must pass through the same (already fitted) scaler before they are scored.
val_scaled = scaler.transform(val)

score = 0  # best validation accuracy seen so far
res = []   # hyperparameters [lr, beta, l1_coef, l2_coef] that produced `score`

for lr in [0.001, 0.002, 0.005]:
    for beta in [0, 0.1, 0.15, 0.25]:
        for l1_coef in [0, 0.001, 0.01, 0.1]:
            for l2_coef in [0, 0.01, 0.1, 0.25]:
                logistic_regression = LogisticRegression(max_iter=3e4, lr=lr, beta=beta, l1_coef=l1_coef, l2_coef=l2_coef, print_every=30000)

                logistic_regression.fit(train_scaled, train_gender.values, val_scaled, val_gender.values)

                # Score each candidate once and reuse the value.
                candidate_score = accuracy_score(val_gender.values, logistic_regression.predict(val_scaled))

                if candidate_score > score:
                    res = [lr, beta, l1_coef, l2_coef]
                    score = candidate_score
                    if score > 0.75:
                        print(f'score = {score}, parameters: {res}')

In [None]:
# Best validation accuracy found by the grid search.
score

In [None]:
# Model with fixed hyperparameters; uses a larger iteration budget and a
# tighter stopping tolerance than the class defaults.
logistic_regression = LogisticRegression(max_iter=1e5, tol=1e-6, lr=0.002, beta=0.15, l1_coef=0, l2_coef=0.01)

In [None]:
# Train on the standardised training features. The validation features go
# through the same fitted scaler so the printed validation accuracy is meaningful.
logistic_regression.fit(train_scaled, train_gender.values, scaler.transform(val), val_gender.values)

In [None]:
# Load the test data (the file is read as CSV).
test = pd.read_csv('dataset_527992_9.txt')

In [None]:
# The encoder was fitted on a single column, the refactored `subcategory_name`,
# so the test data has to be encoded from that same single column; passing two
# columns does not match what the encoder was fitted on.
# NOTE(review): assumes the test file has the same `param1` column as the
# training file; when it is absent the raw `subcategory_name` is used.
if 'param1' in test.columns:
    test_subcategory = test.apply(refactor_subcat_name, axis=1)
else:
    test_subcategory = test['subcategory_name']

test_ohe = ohe.transform(test_subcategory.to_frame('subcategory_name'))
# Original test columns followed by the one-hot columns (positional layout).
test_extended = np.hstack([test, test_ohe])

In [None]:
# Drop columns 1-10, name column 0 `user_id`, and sum the remaining columns
# per user.
test_prepared = (
    pd.DataFrame(test_extended)
    .drop(columns=list(range(1, 11)))
    .rename(columns={0: 'user_id'})
    .groupby('user_id')
    .sum()
    .reset_index()
)

In [None]:
# Inspect the prepared test matrix as a NumPy array.
test_prepared.values

In [None]:
# Skip the first column (user_id) and standardise the remaining features with
# the scaler fitted on the training data: the model was trained on scaled inputs.
prediction = logistic_regression.predict(scaler.transform(test_prepared.values[:, 1:])).astype(int)

In [None]:
# Map numeric predictions back to labels: 1 -> 'male', anything else -> 'female'.
prediction = np.where(prediction == 1, 'male', 'female')

In [None]:
# Attach the predictions by index; the new frame is built without column names,
# and the next cell selects the joined column by the label 0.
test_prepared = test_prepared.join(pd.DataFrame(prediction))

In [None]:
# Save user ids together with the predicted labels (column 0), without the index.
test_prepared[['user_id', 0]].to_csv('test_predictions.csv', index=False)