In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import SGDClassifier as SGDC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, roc_auc_score
from collections import defaultdict
from datetime import datetime
%matplotlib inline

Читаем входные файлы с данными

In [2]:
# Raw transaction log; its columns (shown further down) are customer_id, tr_datetime,
# mcc_code, tr_type, amount and term_id.
transactions = pd.read_csv('./datasets/transactions/transactions.csv')
# Training labels: one gender value per customer_id.
customers_gender = pd.read_csv('./datasets/transactions/customers_gender_train.csv')

Для каждого клиента считаем, сколько раз встречается каждый mcc_code

In [3]:
# Order the label table by ascending customer id.
customers_gender = customers_gender.sort_values("customer_id")

In [4]:
# Plain numpy array of the labelled customer ids.
customers_ids = customers_gender.loc[:, "customer_id"].values

In [5]:
# Display the customer id array (numpy abbreviates long arrays in the repr).
customers_ids

array([    6815,    22899,    27914, ..., 99988578, 99991245, 99999680])

In [7]:
# List the columns available in the transaction log.
transactions.columns

Index(['customer_id', 'tr_datetime', 'mcc_code', 'tr_type', 'amount',
       'term_id'],
      dtype='object')

In [28]:
# tr_datetime is a two-token, whitespace-separated string; show the second
# token (the clock time) of the first transaction.
transactions["tr_datetime"].iloc[0].split()[1]

'10:23:26'

In [41]:
def transactions_to_numpy(transactions):
    """Group transactions by customer and encode each one as a row of integers.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must provide the columns ``customer_id``, ``tr_datetime``, ``mcc_code``,
        ``tr_type``, ``amount`` and ``term_id``. ``tr_datetime`` is expected to be
        a string of the form ``"<day> HH:MM:SS"`` (assumes the day token is an
        integer -- TODO confirm against the data description).

    Returns
    -------
    dict
        Maps each ``customer_id`` to an ``int64`` array of shape
        ``(n_transactions, 6)`` whose columns are
        ``[day, minute_of_day, mcc_code, tr_type, amount, term_code]``.
        Rows keep the order in which they appear in ``transactions``.
        ``term_code`` is the position of the terminal id among the sorted
        distinct non-null terminal ids, or ``-1`` when the id is missing.
        Because the whole row is cast to ``int64``, any fractional part of
        ``amount`` is dropped.
    """
    def row_transform(row, term_ids):
        # "<day> HH:MM:SS" -> day token and clock token.
        day, clock = row.tr_datetime.split()
        # Parse the clock by hand: a 24-hour value is needed, and seconds are
        # not used for the minute-of-day feature.
        hour, minute = clock.split(':')[:2]
        return [
            int(day),
            int(hour) * 60 + int(minute),
            row.mcc_code,
            row.tr_type,
            row.amount,
            # Missing (NaN/None) terminal ids are never keys of term_ids.
            term_ids.get(row.term_id, -1),
        ]

    # Terminal id -> dense integer code (terminal ids need not be numeric).
    known_terms = sorted(transactions["term_id"].dropna().unique())
    term_ids = {term: code for code, term in enumerate(known_terms)}

    # Collect plain lists per customer and build each array once at the end;
    # concatenating arrays row by row would be quadratic in the row count.
    rows_by_customer = defaultdict(list)
    for row in transactions.itertuples(index=False):
        rows_by_customer[row.customer_id].append(row_transform(row, term_ids))

    return {
        customer_id: np.array(rows, dtype=np.int64)
        for customer_id, rows in rows_by_customer.items()
    }

In [None]:
# Run transactions_to_numpy over the full transaction log and keep its result.
transformed_transactions = transactions_to_numpy(transactions)

In [None]:
# NOTE(review): `y` is not defined in any cell of this notebook, so this cell raises
# NameError as written. It also splits the raw per-transaction table, and the
# `train` name it binds is reassigned by the groupby cell below -- confirm whether
# this cell is still needed.
train, test, y_train, y_test = train_test_split(transactions, y, train_size=0.7, random_state=13)

In [3]:
# Per-customer feature matrix: one row per customer_id, one column per mcc_code,
# each cell holding how many of that customer's transactions carry that code
# (0 where the customer has no transaction with the code).
train = transactions.groupby('customer_id') \
                    .apply(lambda x: x[['mcc_code']].unstack().value_counts()) \
                    .unstack() \
                    .fillna(0)
# The label/feature/prediction cells below read this matrix under the name `X`,
# which no other cell defines; bind it here so the notebook runs top to bottom.
X = train

In [5]:
# Display the per-customer mcc_code count matrix.
train

Unnamed: 0_level_0,742,1711,1731,1799,2741,3000,3351,3501,4111,4112,...,8299,8398,8641,8699,8999,9211,9222,9311,9399,9402
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
31385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0
49101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Index the labels by customer_id so they can be looked up by customer.
# The guard makes the cell safe to re-run: once customer_id is already the index
# there is no such column left, and an unconditional set_index would raise KeyError.
if 'customer_id' in customers_gender.columns:
    customers_gender = customers_gender.set_index('customer_id')

In [6]:
# Display the label table, now indexed by customer_id.
customers_gender

Unnamed: 0_level_0,gender
customer_id,Unnamed: 1_level_1
75562265,0
10928546,1
69348468,1
84816985,1
61009479,0
74045822,0
27979606,1
54129921,0
23160845,0
44160317,1


In [5]:
# Align the labels with the rows of X. X also contains customers without a label,
# so use reindex (missing customers become NaN) rather than .loc, which raises
# KeyError for labels that are not present in customers_gender.
Y_train = customers_gender.reindex(X.index).gender
# Replace the customer_id index with row positions, so that the surviving index
# values identify rows of X by position.
Y_train = Y_train.reset_index()
del Y_train['customer_id']
# Keep only the labelled customers (axis passed by keyword; the positional form
# is not accepted by current pandas).
Y_train = Y_train.dropna(axis=0)

In [6]:
# Select the feature rows whose positions survive in Y_train's index (the
# labelled customers), then restore customer_id as the index.
X_train = (
    X.reset_index()
     .loc[Y_train.index]
     .set_index('customer_id')
)

Обучаемся на всех данных

In [7]:
# The imports cell binds the gradient boosting classifier to the alias GBC; the
# unaliased class name is not defined in this notebook.
clf = GBC(random_state=13)
# Y_train is a single-column frame; pass that column as a 1-D label vector.
clf.fit(X_train, Y_train.values[:, 0])

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=13, subsample=1.0, verbose=0,
              warm_start=False)

Предсказываем пол для тестовых данных и создаём файл с ответом

In [10]:
# Everything in X that has no training label is treated as the test set.
X_test = X.drop(index=customers_gender.index)
# One output row per test customer; the score is the second column of
# predict_proba (presumably the probability of gender == 1 -- verify via clf.classes_).
result = pd.DataFrame({'customer_id': X_test.index}).assign(
    gender=clf.predict_proba(X_test)[:, 1]
)

In [11]:
# Write the predictions (customer_id, gender) to disk without the positional index.
result.to_csv('baseline_a.csv', index=False)