In [24]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer

In [None]:
# Load the training table; the first CSV column becomes the row index.
data = pd.read_csv('train_data.csv', index_col=0)
# Preview the first rows.
data.head()

In [None]:
def calc_bags(data, col_num, min_df=0.1):
    """Build a stemmed bag-of-words count table for one text column.

    Rows whose value in the selected column is null are skipped. Every
    remaining document is split into words by CountVectorizer's default
    analyzer, each word is stemmed with NLTK's RussianStemmer, and the stems
    are counted by a CountVectorizer that is fitted on this very column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the text column.
    col_num : int
        Positional index of the text column inside ``data``.
    min_df : float or int, default 0.1
        Forwarded unchanged to ``CountVectorizer(min_df=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per non-null document and one integer-named column per kept
        stem. Rows carry the same index labels as the corresponding rows of
        ``data`` so that a later ``pd.concat(..., axis=1)`` with ``data``
        lines the counts up with the rows they were computed from.
    """
    text_column = data.iloc[:, col_num]
    str_anamnez = text_column[text_column.notna()]

    stemmer = RussianStemmer(ignore_stopwords=True)
    analyzer = CountVectorizer().build_analyzer()

    def stemmed_words(doc):
        # Lazily stem every token produced by the default analyzer.
        return (stemmer.stem(w) for w in analyzer(doc))

    vec = CountVectorizer(tokenizer=stemmed_words, min_df=min_df)
    bag_of_words = vec.fit_transform(str_anamnez)

    # Keep the original row labels: a default RangeIndex would be aligned by
    # label against ``data`` and could attach counts to the wrong rows whenever
    # ``data`` has nulls in this column or a non 0..n-1 index.
    return pd.DataFrame(bag_of_words.toarray(), index=str_anamnez.index)

In [None]:
# Put the bag-of-words counts for positional column 6 in front of the original columns.
# pd.concat(axis=1) matches rows by index label, not by position.
# NOTE(review): column 6 is presumably the free-text field — confirm against the CSV schema.
new_array = pd.concat([calc_bags(data, 6), data], axis=1)

In [None]:
# Preview the combined table (count columns first, original columns after).
new_array.head()

In [None]:
# Replace every remaining NaN with 0.
# This rebinds `data` to the combined table, so re-running the concat cell
# after this one would operate on the already-extended frame.
data = new_array.fillna(0)

In [None]:
def get_Xy(data, random_state=None):
    """Draw a class-balanced subsample and split it into train and test parts.

    The last column of ``data`` is the label (values 0 and 1 are used); all
    other columns are features. The larger class is randomly downsampled
    without replacement to the size of the smaller one, then the balanced
    sample is split 80/20 with ``train_test_split``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by the label as the last column.
    random_state : int or None, default None
        Seed for both the downsampling and the train/test split. With ``None``
        the global NumPy random state is used, as before.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where ``X``/``y`` are the
        balanced sample (class-1 rows first, then class-0 rows).
    """
    X_init = data.iloc[:, :-1]
    y_init = data.iloc[:, -1]

    pos_labels = list(y_init[y_init == 1].index)
    neg_labels = list(y_init[y_init == 0].index)

    # Use the global generator unless a seed was requested.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Downsample whichever class is larger; sampling without replacement from
    # the smaller class would raise ValueError.
    if len(pos_labels) >= len(neg_labels):
        pos_labels = list(rng.choice(pos_labels, replace=False, size=len(neg_labels)))
    else:
        neg_labels = list(rng.choice(neg_labels, replace=False, size=len(pos_labels)))

    new_index = pos_labels + neg_labels
    X = X_init.loc[new_index]
    y = y_init.loc[new_index]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    return X, y, X_train, X_test, y_train, y_test

In [None]:
# Class-balanced sample (X, y) and its 80/20 train/test split.
X, y, X_train, X_test, y_train, y_test = get_Xy(data)

In [None]:
# CatBoostClassifier is already imported in the first cell; repeating the import is harmless.
from catboost import CatBoostClassifier
# Model used for the hold-out evaluation below.
catboost = CatBoostClassifier(learning_rate=0.03, iterations=1000)

In [None]:
# Positional indices of the columns handed to CatBoost as categorical: 0-25 and 27-31 (26 is left out).
# NOTE(review): the bag-of-words columns are prepended to the table and their number depends on the
# fitted vocabulary, so confirm these positions still point at the intended columns.
cat_features=list(range(26)) + list(range(27, 32))

In [None]:
# Train on the training split and track metrics on the held-out split.
# A single evaluation set is passed as an (X, y) tuple; a list is read by
# CatBoost as a list of evaluation sets, not as one (X, y) pair.
catboost.fit(X=X_train, y=y_train, eval_set=(X_test, y_test), cat_features=cat_features)

In [None]:
# Feature importances evaluated on the full table: every column but the last as features, the last as label.
# NOTE(review): the X=/y=/cat_features= keywords are presumably those of the installed CatBoost
# release — confirm against its get_feature_importance signature.
f_imp = catboost.get_feature_importance(X=data.iloc[:, :-1], y=data.iloc[:, -1], cat_features=cat_features)

In [None]:
# Show the computed importances.
f_imp

In [None]:
from sklearn import metrics
# ROC AUC on the held-out split produced by get_Xy. The notebook never defines
# a `test` frame, so the features/labels come from X_test / y_test directly.
fpr, tpr, thresholds = metrics.roc_curve(y_test, catboost.predict_proba(X_test)[:, 1], pos_label=1)
metrics.auc(fpr, tpr)

In [None]:
# CatBoostClassifier is already imported in the first cell; repeating the import is harmless.
from catboost import CatBoostClassifier
# Fresh model with the same hyperparameters; this rebinds `catboost`, discarding the evaluated model.
catboost = CatBoostClassifier(learning_rate=0.03, iterations=1000)

In [None]:
# Refit on the whole balanced sample (train and test parts together), without an evaluation set.
catboost.fit(X=X, y=y, cat_features=cat_features)

In [None]:
# Load the table to predict on (first CSV column as index) and replace NaN with 0.
# NOTE(review): here NaNs are filled *before* calc_bags, whereas the training cells fill them
# afterwards; calc_bags only skips null entries, so a filled 0 in the text column would be
# passed to the vectorizer — confirm the text column has no missing values.
real_test = pd.read_csv('test_data.csv', index_col=0).fillna(0)
real_test.head()

In [None]:
# Bag-of-words counts for positional column 6 of the test table, prepended to its original columns.
# NOTE(review): calc_bags fits a new CountVectorizer on whatever frame it receives, so these
# count columns come from a vocabulary fitted on the test texts, not the one used in training —
# confirm the resulting columns match what the model was trained on.
bags = calc_bags(real_test, 6)
new_real_test = pd.concat([bags, real_test], axis=1)

In [None]:
# Second predict_proba column: predicted probability of class 1 for every row.
ans = catboost.predict_proba(new_real_test)[:, 1]
# Submission table keyed by the row labels of the feature frame.
ans_pandas = pd.DataFrame(data={'ID': new_real_test.index, 'proba': ans})
ans_pandas = ans_pandas.reset_index(drop=True)
# Write the submission without the positional index column.
ans_pandas.to_csv('fool_more_c.csv', index=False)