In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import random
import json

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from functools import reduce
from html.parser import HTMLParser

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_score

In [4]:
def flatten(x):
    result = []
    for elem in x:
        if hasattr(elem, "__iter__") and not isinstance(elem, str):
            result.extend(flatten(elem))
        else:
            result.append(elem)
    return result

In [5]:
def list_concat(list1, list2):
    
    len1 = len(list1)
    len2 = len(list2)
    
    return [[list1[i], list2[j]] for i in range(len1) for j in range(len2)]

In [6]:
def combinations(params):
    
    list_ = []
    for value in params.values():
        list_.append(value)

    tmp = reduce(lambda x, y: list_concat(x, y), list_)

    res = []
    for elem in tmp:
        param_list = flatten(elem)
        param_dict = dict(zip(params.keys(), param_list))
        res.append(param_dict)
        
    return res

In [7]:
def group_fold_generator(X_train, train_target, groups_train, folds_num=10):
    
    groups_num = np.unique(groups_train).shape[0]
    
    for i in range(folds_num):
        
        a = [random.randint(1, groups_num) for i in range(groups_num // folds_num)]

        test_index = [i for i, x in enumerate(groups_train) if x in a]
        train_index = [i for i, x in enumerate(groups_train) if x not in a]

        yield train_index, test_index

In [8]:
def default_fold_generator(X_train, train_target, folds_num=10, shuffle=False):
    
    folds = KFold(folds_num, shuffle = shuffle)
    
    return folds.split(X_train, train_target)

In [9]:
def validation(X_train, train_target, model, params, folds_gen_func=default_fold_generator, **kwargs):
    
    scaler = StandardScaler()
    scaler.fit(X_train) 
    
    main_res = []
    for param_set in combinations(params):
        
#         print(param_set)        
        exact_model = model(**param_set) 
        
        fold_generator = folds_gen_func(X_train, train_target, **kwargs)
        
        res = []
        for train_index, test_index in fold_generator:

            exact_model.fit(scaler.transform(X_train[train_index]), train_target[train_index])
            res.append(metrics.roc_auc_score(train_target[test_index],\
                                      exact_model.predict(scaler.transform(X_train[test_index]))))
            
        mean = sum(res)/len(res)
#         print(mean)
        main_res.append((mean, param_set))
    
    best = main_res[np.argmax([res[0] for res in main_res])]
    print('--------max-------')
    print(best)
    
    return best

In [46]:
# Сохраняет решение

def save_submission(y_pred):

    data = pd.read_csv('data/test_groups.csv')
    data['target'] = y_pred
    
    data = data.drop(['group_id', 'doc_id'], axis=1)

    data.to_csv("submission.csv", index=False)
    
    info = np.unique(data['target'], return_counts=True)
    
    if info[0].shape[0] > 1:
        
        print('0: {}, 1: {}'.format(info[1][0], info[1][1]))
        if info[1][1] > 3600:
            print('Your submisson is shit')
    
    else:
        print('There are only {} in submission'.format(info[0][0]))
        
    return data

In [11]:
# Переводит numpy ndarray в список

def ndarray_to_list(array):
    
    return list(map(lambda x: list(x), array))

In [12]:
# Загружает файлы с признаками документов по группам

def all_group_feature_list(start_group, finish_group):
    
    res = np.load('group_features/{}.npy'.format(start_group))
  
    for group_num in range(start_group + 1, finish_group + 1):
#         res += ndarray_to_list(np.load('group_features/{}.npy'.format(group_num)))
        res = np.vstack((res, np.load('group_features/{}.npy'.format(group_num))))
        
    return res

In [13]:
# Создает X_train, X_test, train_target

def prepare_data():
    
    X_train = all_group_feature_list(1, 129)
    X_test = all_group_feature_list(130, 309)
    
    d = pd.read_csv('./data/train_groups.csv')
    train_target = d['target']
    
    return X_train, train_target, X_test

In [14]:
def predict(X_train, X_test, train_target, model, scaler=None, **kwargs):
    
    curr_model = model(**kwargs)
    
    if scaler is not None:
        
        your_scaler = scaler()
        your_scaler.fit(X_train)
        X_train = your_scaler.transform(X_train)
        X_test = your_scaler.transform(X_test)
        
    curr_model.fit(X_train, train_target)
    y_pred = curr_model.predict(X_test)
    
    return y_pred

In [40]:
# Протестируем разные модели на валидации

In [30]:
X_train, train_target, X_test = prepare_data()
groups_train = pd.read_csv('data/train_groups.csv')['group_id']

In [31]:
params = {'loss': ['log', 'huber'],
          'alpha': [0.01, 0.001, 0.0001]}

In [32]:
best = validation(X_train, train_target, SGDClassifier, params, group_fold_generator, groups_train=groups_train, folds_num=10)

--------max-------
(0.5030492098127316, {'loss': 'log', 'alpha': 0.0001})


In [33]:
X_train, train_target, X_test = prepare_data()
y_pred = predict(X_train, X_test, train_target, SGDClassifier, **best[1])
data = save_submission(y_pred)

0: 9973, 1: 6654
Your submisson is shit


In [34]:
params = {'algorithm': ['ball_tree', 'kd_tree', 'brute'],
          'n_neighbors': [5, 6, 7, 8, 9, 20],
          'weights': ['uniform', 'distance']}

In [35]:
best = validation(X_train, train_target, KNeighborsClassifier, params, group_fold_generator, groups_train=groups_train, folds_num=43)

--------max-------
(0.6286854505742006, {'algorithm': 'kd_tree', 'n_neighbors': 5, 'weights': 'distance'})


In [36]:
X_train, train_target, X_test = prepare_data()
y_pred = predict(X_train, X_test, train_target, KNeighborsClassifier, **best[1])
data = save_submission(y_pred)
data

0: 12999, 1: 3628
Your submisson is shit


Unnamed: 0,pair_id,target
0,11691,1
1,11692,0
2,11693,0
3,11694,1
4,11695,0
...,...,...
16622,28313,1
16623,28314,0
16624,28315,1
16625,28316,1


In [37]:
params = {'n_estimators': [100, 200, 500],
          'criterion': ['gini', 'entropy'],
          'max_depth': [10, 15, None],
          'n_jobs': [-1]}

In [39]:
best = validation(X_train, train_target, RandomForestClassifier, params, group_fold_generator, groups_train=groups_train, folds_num=10)

--------max-------
(0.639224222609558, {'n_estimators': 200, 'criterion': 'entropy', 'max_depth': None, 'n_jobs': -1})


In [41]:
X_train, train_target, X_test = prepare_data()
y_pred = predict(X_train, X_test, train_target, RandomForestClassifier, **best[1])
data = save_submission(y_pred)
data

0: 13215, 1: 3412


Unnamed: 0,pair_id,target
0,11691,1
1,11692,0
2,11693,0
3,11694,1
4,11695,0
...,...,...
16622,28313,1
16623,28314,0
16624,28315,1
16625,28316,1


In [42]:
params = {'loss': ['log'],
          'alpha': [0.0001, 0.001]}

In [43]:
best = validation(X_train, train_target, SGDClassifier, params, group_fold_generator, groups_train=groups_train, folds_num=10)

--------max-------
(0.5009926131684524, {'loss': 'log', 'alpha': 0.0001})


In [47]:
X_train, train_target, X_test = prepare_data()
y_pred = predict(X_train, X_test, train_target, SGDClassifier, StandardScaler, loss='log')
data = save_submission(y_pred)

There are only 0 in submission
