In [29]:
import pandas as pd
import numpy as np

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test feature tables from the working directory.
train_csv, test_csv = 'train_huge_wv.csv', 'test_huge_wv.csv'
X_train = pd.read_csv(train_csv)
X_test = pd.read_csv(test_csv)

In [11]:
# Columns excluded from the model inputs on both frames
# (presumably the raw text fields the sent_vec_*/tfidf_* features derive from — TODO confirm).
excluded_text_columns = {'Диагноз', 'Общее состояние', 'аллергия', 'Анамнез заболевания', 'Внешний осмотр'}
# The training frame additionally loses the 'Revisit' column, which is passed separately as `y` later.
excluded_train_columns = excluded_text_columns | {'Revisit'}

train_feature_columns = [col for col in X_train.columns if col not in excluded_train_columns]
test_feature_columns = [col for col in X_test.columns if col not in excluded_text_columns]

X_train_new = X_train[train_feature_columns]
X_test_new = X_test[test_feature_columns]

In [13]:
# Replace every missing value with 0.0 in both feature frames.
missing_fill_value = 0.
X_train_new = X_train_new.fillna(value=missing_fill_value)
X_test_new = X_test_new.fillna(value=missing_fill_value)

In [14]:
# Remove the 'Unnamed: *' columns (presumably saved-index artefacts of earlier CSV round trips).
# Rebinding instead of dropping in place, together with errors='ignore', keeps this cell
# idempotent: re-running it after the columns are already gone no longer raises KeyError.
leftover_index_columns = ['Unnamed: 0', 'Unnamed: 0.1']
X_test_new = X_test_new.drop(leftover_index_columns, axis=1, errors='ignore')
X_train_new = X_train_new.drop(leftover_index_columns, axis=1, errors='ignore')

In [15]:
# Move the 'ID' column into the index so it is not treated as a feature.
id_column = 'ID'
X_train_new = X_train_new.set_index(keys=id_column)
X_test_new = X_test_new.set_index(keys=id_column)

In [30]:
def easy_pipe(X, y, with_dummies=True, top_n=None):
    """Rank features with a decision tree, then fit CatBoost and print hold-out ROC AUC.

    Steps:
      1. Standardise all columns of ``X``.
      2. Hold out 20% of the rows (``random_state=13``), fit a depth-limited
         decision tree on the rest and print its feature importances in
         descending order as ``"<rank>) <column name> <importance>"``.
      3. Pass the scaled matrix through ``SelectKBest(f_classif, k='all')`` and
         ``VarianceThreshold(.95)``, split again with the same seed, fit a
         CatBoost classifier and print the ROC AUC on the held-out part.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table; ``X.columns`` supplies the names in the ranking.
    y : array-like
        Binary target, one entry per row of ``X`` (matched by position).
    with_dummies : bool, default True
        Currently unused; kept so existing calls keep working.
    top_n : int or None, default None
        If given (non-negative), print only the ``top_n`` most important
        features. ``None`` prints every feature, as before.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # NOTE(review): the scaler and both selectors below are fitted on all rows,
    # including the ones later held out, so the hold-out score is not strictly
    # out-of-sample with respect to these unsupervised statistics.
    X_scaled = StandardScaler().fit_transform(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=13)

    tree = DecisionTreeClassifier(max_depth=10, min_samples_leaf=14, random_state=42)
    tree.fit(X_train, y_train)

    feature_names = X.columns
    importances = tree.feature_importances_
    # Column positions ordered from most to least important.
    ranking = np.argsort(importances)[::-1]
    if top_n is not None:
        ranking = ranking[:top_n]
    for rank, col_idx in enumerate(ranking, start=1):
        print("%d) %-s %f" % (rank, feature_names[col_idx], importances[col_idx]))

    X_kbest = SelectKBest(f_classif, k='all').fit_transform(X_scaled, y)
    X_varth = VarianceThreshold(.95).fit_transform(X_kbest)

    # Re-split the filtered matrix. The labels are unpacked into y_train/y_test
    # explicitly so the model never silently relies on the first split's labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X_varth, y, test_size=0.2, random_state=13)

    cb_clf = CatBoostClassifier(iterations=100, learning_rate=0.01, eval_metric='AUC', depth=5)
    cb_clf.fit(X_train, y_train)
    # `roc_auc_score` is not imported by name in this notebook; go through the
    # imported `metrics` module so the score is actually printed after training.
    print(metrics.roc_auc_score(y_test, cb_clf.predict_proba(X_test)[:, 1]))

In [31]:
# Features come from the cleaned frame; the 'Revisit' target is taken from the original
# training frame, since that column was excluded when X_train_new was built.
revisit_target = X_train['Revisit']
easy_pipe(X=X_train_new, y=revisit_target)

1) tfidf_nb_eap_tfidfDiagnoz 0.402970
2) tfidf_nb_eap_tfidfAnamnes 0.112813
3) Возраст 0.062502
4) sent_vec_Анамнез заболевания_272 0.053839
5) sent_vec_Общее состояние_135 0.023096
6) count_chars_nb_eap_аллергия 0.016012
7) Пол 0.013380
8) sent_vec_Диагноз_289 0.011986
9) sent_vec_Диагноз_175 0.009741
10) sent_vec_Диагноз_141 0.008557
11) sent_vec_Диагноз_96 0.007547
12) sent_vec_Общее состояние_11 0.005886
13) tfidf_char_nb_eap_Диагноз 0.005739
14) sent_vec_Диагноз_47 0.004980
15) sent_vec_Анамнез заболевания_112 0.004949
16) sent_vec_Общее состояние_205 0.004552
17) sent_vec_Общее состояние_134 0.004030
18) sent_vec_Диагноз_110 0.003575
19) sent_vec_Диагноз_2 0.003356
20) sent_vec_Диагноз_277 0.003035
21) tfidf_nb_eap_tfidfGeneral 0.002816
22) sent_vec_Диагноз_133 0.002666
23) tfidf_char_nb_eap_Общее состояние 0.002621
24) sent_vec_Диагноз_165 0.002577
25) sent_vec_Анамнез заболевания_122 0.002528
26) sent_vec_Диагноз_10 0.002429
27) sent_vec_Диагноз_261 0.002320
28) sent_vec_Диагно

1167) sent_vec_Внешний осмотр_175 0.000000
1168) sent_vec_Внешний осмотр_176 0.000000
1169) sent_vec_Внешний осмотр_177 0.000000
1170) sent_vec_Внешний осмотр_178 0.000000
1171) sent_vec_Внешний осмотр_179 0.000000
1172) sent_vec_Внешний осмотр_180 0.000000
1173) sent_vec_Внешний осмотр_181 0.000000
1174) sent_vec_Внешний осмотр_182 0.000000
1175) sent_vec_Внешний осмотр_183 0.000000
1176) sent_vec_Внешний осмотр_184 0.000000
1177) sent_vec_Внешний осмотр_185 0.000000
1178) sent_vec_Внешний осмотр_186 0.000000
1179) sent_vec_Внешний осмотр_187 0.000000
1180) sent_vec_Внешний осмотр_188 0.000000
1181) sent_vec_Внешний осмотр_189 0.000000
1182) sent_vec_Внешний осмотр_190 0.000000
1183) sent_vec_Анамнез заболевания_211 0.000000
1184) sent_vec_Анамнез заболевания_208 0.000000
1185) sent_vec_аллергия_59 0.000000
1186) sent_vec_аллергия_184 0.000000
1187) sent_vec_аллергия_176 0.000000
1188) sent_vec_аллергия_177 0.000000
1189) sent_vec_аллергия_178 0.000000
1190) sent_vec_аллергия_179 0.00

0:	learn: 0.7376891	total: 18s	remaining: 29m 43s
1:	learn: 0.7510634	total: 35.3s	remaining: 28m 50s
2:	learn: 0.7517693	total: 51.3s	remaining: 27m 37s
3:	learn: 0.7518301	total: 1m 8s	remaining: 27m 21s
4:	learn: 0.7530295	total: 1m 25s	remaining: 27m 12s
5:	learn: 0.7541507	total: 1m 42s	remaining: 26m 41s
6:	learn: 0.7561022	total: 1m 59s	remaining: 26m 26s
7:	learn: 0.7554415	total: 2m 16s	remaining: 26m 5s
8:	learn: 0.7552412	total: 2m 32s	remaining: 25m 43s
9:	learn: 0.7571613	total: 2m 49s	remaining: 25m 25s
10:	learn: 0.7583233	total: 3m 7s	remaining: 25m 15s
11:	learn: 0.7591131	total: 3m 23s	remaining: 24m 53s
12:	learn: 0.7587608	total: 3m 41s	remaining: 24m 44s
13:	learn: 0.7593191	total: 3m 58s	remaining: 24m 22s
14:	learn: 0.7607090	total: 4m 15s	remaining: 24m 5s
15:	learn: 0.7604061	total: 4m 31s	remaining: 23m 47s
16:	learn: 0.7619725	total: 4m 49s	remaining: 23m 31s
17:	learn: 0.7628011	total: 5m 5s	remaining: 23m 12s
18:	learn: 0.7639587	total: 5m 22s	remaining: 22

KeyboardInterrupt: 

In [None]:
# Fit the scaler on the training matrix only, then apply the same
# scaling to both the training and the validation matrices.
scl = preprocessing.StandardScaler().fit(xtrain_glove)
xtrain_glove_scl = scl.transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)