In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from random import randint
from sklearn.metrics import accuracy_score
from sklearn import model_selection, metrics
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation



In [3]:
# Load the raw dataset and keep only the two conlang types of interest.
# The filter is an exact string match, so only rows whose conlang_type is
# exactly one of the two strings below are kept.
data = pd.read_csv('dataset_conlang.csv')
artlangs = data[data.conlang_type == "['Artistic Language (Artlang)']"]
a_priori = data[data.conlang_type == "['A priori']"]

# pd.concat replaces DataFrame.append (removed in pandas 2.0). The order
# (artlang rows first, then a priori rows) is kept on purpose: later steps
# truncate classes by position, so row order influences which rows survive.
conlangs = pd.concat([artlangs, a_priori]).reset_index(drop=True)

# Number of selected conlangs per creator language, most frequent first.
conlangs.language.value_counts()

['English']                                                                                                                    1536
['German (Deutsch)']                                                                                                             66
['French (Fran\xc3\xa7ais)']                                                                                                     63
['Swedish (Svenska)']                                                                                                            59
['Portuguese (Portugu\xc3\xaas)']                                                                                                46
['Dutch (Nederlands)']                                                                                                           39
['Spanish (Espa\xc3\xb1ol)']                                                                                                     39
['Italian (Italiano)']                                                      

In [18]:
def scoring_two_language(first_language, second_language):
    """Train and cross-validate a random forest that tells two languages apart.

    The two languages are selected by their position in
    ``conlangs.language.value_counts()`` (position 0 is the most frequent
    value). Every row of the notebook-level ``conlangs`` frame that has one
    of those two ``language`` values contributes its ``conlang_phonems``
    vector as features; rows whose vector sums to zero are skipped. The
    larger class is truncated to the size of the smaller one,
    hyper-parameters are chosen by a grid search on accuracy, and the mean
    and standard deviation of accuracy, precision and recall over five
    stratified shuffle splits are printed.

    Parameters
    ----------
    first_language : int
        Position in the value-count ranking of the language used as class 0.
    second_language : int
        Position in the value-count ranking of the language used as class 1.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If one of the two languages has no row with a non-zero phoneme
        vector, so that no balanced two-class sample can be built.
    """
    # Compute the ranking once instead of once per language.
    ranking = conlangs.language.value_counts().index
    interesting_language = [ranking[first_language], ranking[second_language]]
    print(interesting_language)

    phonems, indexes, classes = [], [], []
    for label, name in enumerate(interesting_language):
        rows = conlangs[conlangs.language == name]
        for row_index, raw in zip(rows.index, rows.conlang_phonems):
            # The parsing implies conlang_phonems is a string holding a
            # bracketed, ', '-separated list of numbers. A real list (not
            # map()) is required: under Python 3 sum() would exhaust a map
            # object before it is stored.
            vector = [float(value) for value in raw[1:-1].split(', ')]
            if sum(vector) != 0:  # skip rows that have no phonemes at all
                phonems.append(vector)
                indexes.append(row_index)
                classes.append(label)

    df = pd.DataFrame(data=phonems)
    df['y'] = classes
    df.index = indexes

    # Drop feature columns that are zero for every row. Comparing against 0
    # directly avoids the KeyError that value_counts()[0] raises for a column
    # containing no zeros, and the label column 'y' is never a candidate.
    empty_columns = [col for col in df.columns
                     if col != 'y' and (df[col] == 0).all()]
    df = df.drop(empty_columns, axis=1)

    # Balance the classes by truncating the larger one. The smaller class
    # goes first, exactly as before, so the row order (and therefore the
    # seeded shuffle splits below) is unchanged.
    class_zero = df[df.y == 0]
    class_one = df[df.y == 1]
    size = min(len(class_zero), len(class_one))
    if size == 0:
        raise ValueError(
            'no rows with phonemes for at least one of %r' % (interesting_language,))
    if len(class_zero) > len(class_one):
        df = pd.concat([class_one, class_zero.iloc[:size]])
    else:
        df = pd.concat([class_zero, class_one.iloc[:size]])
    print('%d %d' % ((df.y == 0).sum(), (df.y == 1).sum()))

    X = df.drop('y', axis=1)
    Y = df.y

    def make_cv():
        # A fresh splitter per use; the fixed seed makes every one of them
        # produce the same five 70/30 stratified splits.
        return cross_validation.StratifiedShuffleSplit(
            Y, n_iter=5, test_size=0.3, random_state=0)

    # Hyper-parameter search.
    forest = RandomForestClassifier(n_jobs=-1, random_state=44)
    parameter_grid = {
        'n_estimators': [100, 150, 300, 500],
        'max_depth': [5, 7, 9],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 2],
    }
    grid_search = GridSearchCV(forest, parameter_grid, scoring='accuracy',
                               cv=make_cv())
    grid_search.fit(X, Y)
    best_params = grid_search.best_params_
    print(best_params)

    # NOTE(review): the scores below are measured on the same splits that were
    # used to pick best_params, so they are likely optimistic.
    model = RandomForestClassifier(n_jobs=-1, random_state=44, **best_params)
    for metric in ('accuracy', 'precision', 'recall'):
        cvs = cross_val_score(model, X, Y, scoring=metric, cv=make_cv())
        print(metric)
        print('%s %s' % (round(np.mean(cvs), 2), round(np.std(cvs), 2)))
    print('---------------')

In [None]:
%%time
for i in range(0, 25):
    for j in range(i, 25):
        if i != j:
            scoring_two_language(i, j)

["['English']", "['German (Deutsch)']"]
62 62
{'min_samples_split': 2, 'n_estimators': 100, 'max_depth': 7, 'min_samples_leaf': 2}
accuracy
0.6 0.08
precision
0.6 0.09
recall
0.63 0.11
---------------
["['English']", "['French (Fran\\xc3\\xa7ais)']"]
63 63
{'min_samples_split': 2, 'n_estimators': 300, 'max_depth': 7, 'min_samples_leaf': 2}
accuracy
0.66 0.06
precision
0.65 0.06
recall
0.72 0.15
---------------
["['English']", "['Swedish (Svenska)']"]
57 57
{'min_samples_split': 2, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 2}
accuracy
0.61 0.07
precision
0.59 0.07
recall
0.69 0.04
---------------
["['English']", "['Portuguese (Portugu\\xc3\\xaas)']"]
39 39
{'min_samples_split': 4, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.66 0.09
precision
0.65 0.1
recall
0.72 0.04
---------------
["['English']", "['Dutch (Nederlands)']"]
38 38
{'min_samples_split': 2, 'n_estimators': 150, 'max_depth': 9, 'min_samples_leaf': 1}
accuracy
0.71 0.02
precision
0.7

  'precision', 'predicted', average, warn_for)


precision
0.13 0.16
recall
0.4 0.49
---------------
["['English']", "['Turkish (T\\xc3\\xbcrk\\xc3\\xa7e)']"]
6 6
{'min_samples_split': 4, 'n_estimators': 150, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.95 0.1
precision
0.93 0.13
recall
1.0 0.0
---------------
["['English']", "['English', 'Finnish (Suomi)']"]
7 7
{'min_samples_split': 2, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.8 0.13
precision
0.7 0.16
recall
0.9 0.2
---------------
["['German (Deutsch)']", "['French (Fran\\xc3\\xa7ais)']"]
62 62
{'min_samples_split': 4, 'n_estimators': 300, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.7 0.06
precision
0.68 0.06
recall
0.77 0.05
---------------
["['German (Deutsch)']", "['Swedish (Svenska)']"]
57 57
{'min_samples_split': 4, 'n_estimators': 300, 'max_depth': 9, 'min_samples_leaf': 1}
accuracy
0.61 0.07
precision
0.6 0.06
recall
0.64 0.15
---------------
["['German (Deutsch)']", "['Portuguese (Portugu\\xc3\\xaas)']"]
39 39
{'min_samples_split': 