In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

%matplotlib inline


In [16]:
# Load the training file; column names are passed explicitly, so pandas
# treats every line of the file (including the first) as data.
data_train = pd.read_csv(
    'linear_train.txt',
    names=['text', 'class'],
)

def count_capitals(string):
    """Return the number of uppercase characters in ``string``."""
    total = 0
    for ch in string:
        if ch.isupper():
            total += 1
    return total

def encode(c):
    """Return the code-point offset of character ``c`` from the letter 'а'.

    The base letter itself maps to 0; characters with a smaller code point
    (digits, for example) map to negative numbers.
    """
    base_code = ord(u'а')
    return ord(c) - base_code

def preprocess_data(X_raw, suffix_len=4):
    """Build the numeric feature table for a Series of words.

    Each row gets ``suffix_len`` columns (labelled ``0 .. suffix_len - 1``)
    holding ``encode`` of the lower-cased trailing characters of the word,
    followed by one column (labelled ``suffix_len``) that is 1 when the
    word's only uppercase character is its first one and 0 otherwise.

    Words shorter than ``suffix_len`` keep their characters in the leftmost
    suffix columns and are padded on the right with ``encode('0')``.

    Parameters
    ----------
    X_raw : pandas.Series of str
        The words to convert. Its index labels are not used.
    suffix_len : int, default 4
        Number of trailing characters turned into features.

    Returns
    -------
    pandas.DataFrame
        One row per word, in input order, with a fresh ``0 .. n-1`` index and
        always exactly ``suffix_len + 1`` columns.
    """
    def capital_flag(word):
        # The count is checked first, so word[0] is never read on an empty string.
        return 1 if count_capitals(word) == 1 and word[0].isupper() else 0

    def suffix_codes(word):
        # Slice first, then lower-case; trim afterwards because lower-casing
        # can, for a few characters, produce more characters than it was given.
        chars = list(word[-suffix_len:].lower())[:suffix_len]
        # Pad explicitly so the width never depends on the other words in the batch.
        chars.extend('0' * (suffix_len - len(chars)))
        return [encode(ch) for ch in chars]

    # Rows are assembled positionally, so the capital flag always sits next to
    # its own word regardless of the index labels X_raw carries.
    rows = [suffix_codes(word) + [capital_flag(word)] for word in X_raw]
    return pd.DataFrame(rows, columns=range(suffix_len + 1))

# Target vector: the 'class' column; feature matrix: derived from the 'text' column.
y_train = data_train['class']
X_train = preprocess_data(data_train['text'])

In [17]:
# Choose max_depth by mean 5-fold cross-validated ROC AUC.
# roc_auc_score[d] holds the score for max_depth == d; the leading 0 is a
# placeholder so that list positions line up with depth values.
roc_auc_score = [0]

for depth in range(1, 20):
    # A fixed random_state makes each forest deterministic, so the selected
    # depth does not change from one run of the notebook to the next.
    classifier = RandomForestClassifier(max_depth=depth, random_state=42)
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='roc_auc')
    roc_auc_score.append(np.mean(fold_scores))

# Thanks to the placeholder, the argmax position is the best depth itself;
# int() turns the NumPy integer into a plain Python int.
depth = int(np.argmax(roc_auc_score))
print(depth)

13


In [18]:
# Read the test words; the single column is named 'text'.
data_test = pd.read_csv('linear_test.txt', names=['text'])

# Same feature extraction as for the training words.
X_test = preprocess_data(data_test['text'])

# Fit on the whole training set using the previously selected `depth`.
# random_state is fixed so the predictions written out are reproducible.
classifier = RandomForestClassifier(max_depth=depth, random_state=42)
classifier.fit(X_train, y_train)
data_test['class'] = classifier.predict(X_test)


In [19]:
# Store the predicted labels as floats, then write the row index and the
# 'class' column to the answer file.
data_test['class'] = data_test['class'].astype(float)
data_test.to_csv(
    'linear_ans.txt',
    columns=['class'],
)