In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

%matplotlib inline


In [2]:
# Load the training set; the two columns are named explicitly:
# 'text' (the raw word) and 'class' (used below as the target).
data_train = pd.read_csv(
    'linear_train.txt',
    names=['text', 'class'],
)

def count_capitals(string):
    """Return the number of upper-case characters in ``string``."""
    total = 0
    for ch in string:
        if ch.isupper():
            total += 1
    return total

def encode(c):
    """Return the code-point offset of character ``c`` from Cyrillic 'а'.

    Lower-case Cyrillic 'а' maps to 0; any character with a smaller code
    point (for example the ASCII digit '0') maps to a negative number.
    """
    base = ord(u'а')
    return ord(c) - base

# Number of trailing characters of each word that preprocess_data turns into
# features; it is also used as the label of the capitalisation column.
SUF_SIZE = 4

def preprocess_data(X_raw):
    """Convert a Series of words into a numeric feature DataFrame.

    Parameters
    ----------
    X_raw : pandas.Series of str
        The raw words. Its index may be arbitrary (e.g. a filtered subset).

    Returns
    -------
    pandas.DataFrame
        One row per word, in the same order as ``X_raw``, with a fresh
        0..n-1 index and exactly ``SUF_SIZE + 1`` columns:

        * columns ``0 .. SUF_SIZE - 1``: the lower-cased last ``SUF_SIZE``
          characters of the word, each passed through ``encode``. Words
          shorter than ``SUF_SIZE`` are padded on the right with the
          character ``'0'`` (so their letters stay left-aligned).
        * column ``SUF_SIZE``: 1 if the word contains exactly one upper-case
          character and that character is the first one, otherwise 0.
    """
    # The count check runs first, so s[0] is never evaluated for an empty string.
    cap = X_raw.map(lambda s: 1 if count_capitals(s) == 1 and s[0].isupper() else 0)
    end = X_raw.map(lambda s: list(s[-SUF_SIZE:].lower()))

    # Ragged suffix lists leave missing cells on the right. Reindexing pins the
    # frame to exactly SUF_SIZE suffix columns even when every word in this
    # batch is shorter than SUF_SIZE, so train and test always share one shape.
    X = (
        pd.DataFrame(end.values.tolist())
        .reindex(columns=range(SUF_SIZE))
        .fillna('0')
    )

    for i in X.columns:
        X[i] = X[i].apply(encode)

    # Assign positionally: X has a fresh RangeIndex while cap still carries
    # X_raw's index, so a label-aligned assignment would produce NaN (or
    # shuffled flags) whenever X_raw does not have the default index.
    X[SUF_SIZE] = cap.values
    return X

# Target labels are taken straight from the loaded frame.
y_train = data_train['class']
# Feature matrix built from the raw words.
X_train = preprocess_data(data_train['text'])

# Preview the first eight encoded rows.
X_train.head(8)

Unnamed: 0,0,1,2,3,4
0,14,13,5,13,1
1,0,0,16,-1024,1
2,0,16,14,13,1
3,0,16,14,13,0
4,16,14,13,0,1
5,16,14,13,0,1
6,16,14,13,5,1
7,14,13,14,2,1


In [3]:
# Grid of forest sizes and tree depths to search over.
params = {'n_estimators': range(60, 65), 'max_depth': range(10, 15)}

# A fixed random_state makes the forests, and therefore the selected
# parameters and the reported score, reproducible across kernel restarts.
# A regressor is used on purpose: its continuous predictions are ranked by
# ROC AUC here and thresholded as scores later in the notebook.
clf = GridSearchCV(RandomForestRegressor(random_state=0), params, scoring='roc_auc')
clf.fit(X_train, y_train)

print(clf.best_score_, clf.best_params_)

0.88234366226 {'max_depth': 13, 'n_estimators': 62}


In [5]:
# Load the test words and encode them with the same pipeline as the
# training data.
data_test = pd.read_csv(
    'linear_test.txt',
    names=['text'],
)
X_test = preprocess_data(data_test['text'])

# Keep the model's score next to each word.
data_test['class'] = clf.predict(X_test)

# Show up to 20 of the words scored above 0.9.
data_test.loc[data_test['class'] > 0.9].head(20)

Unnamed: 0,text,class
19,Абаева,0.927241
20,Абаевым,1.0
41,Абакумова,0.953052
44,Абакумовым,0.991528
50,Аббасова,0.942386
51,Аббасову,0.903753
103,Абдрахманова,0.952595
112,Абдуллаева,0.927241
115,Абдуллаевым,1.0
132,Абелевым,1.0


In [15]:
# Round the scores to two decimals with the vectorised Series.round instead
# of calling Python's round() once per element.
data_test['class'] = data_test['class'].round(2)

# Only the 'class' column is exported; the row index is written alongside it
# because to_csv keeps the index by default.
data_test.to_csv('linear_ans.txt', columns=['class'])