In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from random import randint
from sklearn.metrics import accuracy_score
from sklearn import model_selection, metrics
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation



In [2]:
# Load the raw dataset and keep only two conlang types: artistic languages
# first, then a-priori languages (the same row order the original produced).
data = pd.read_csv('dataset_conlang.csv')
conlangs = pd.concat(
    [
        data[data.conlang_type == "['Artistic Language (Artlang)']"],
        data[data.conlang_type == "['A priori']"],
    ],
    # pd.concat replaces the deprecated DataFrame.append; ignore_index
    # renumbers the rows 0..n-1, as the explicit range() assignment did.
    ignore_index=True,
)
# How many selected conlangs there are per author language.
conlangs.language.value_counts()

['English']                                                                                                                    1536
['German (Deutsch)']                                                                                                             66
['French (Fran\xc3\xa7ais)']                                                                                                     63
['Swedish (Svenska)']                                                                                                            59
['Portuguese (Portugu\xc3\xaas)']                                                                                                46
['Dutch (Nederlands)']                                                                                                           39
['Spanish (Espa\xc3\xb1ol)']                                                                                                     39
['Italian (Italiano)']                                                      

In [3]:
# Languages at positions 1 and 2 of the frequency ranking, i.e. the most
# frequent language (position 0) is skipped.
ranked_languages = conlangs.language.value_counts().index
interesting_language = ranked_languages[1:3]

In [4]:
# Build the feature table: one row per conlang, one column per phoneme slot,
# plus the class label `y` (the position of the language in
# `interesting_language`).
language = {'phonems': [], 'indexes': [], 'classes': []}
for class_id, native_language in enumerate(interesting_language):
    # Filter once and reuse the result for both the values and their row labels.
    rows = conlangs[conlangs.language == native_language]
    for row_index, raw_phonems in zip(rows.index, rows.conlang_phonems):
        # The cell is parsed as text of the form "[v1, v2, ...]".
        # A real list (not a lazy `map` object) is required so the vector is
        # still intact after sum() on Python 3 as well as Python 2.
        vector = [float(value) for value in raw_phonems[1:-1].split(', ')]
        if sum(vector) != 0:  # keep only conlangs that have at least one phoneme
            language['phonems'].append(vector)
            language['indexes'].append(row_index)
            language['classes'].append(class_id)
df = pd.DataFrame(data=language['phonems'])
df = df.join(pd.DataFrame(data=language['classes'], columns=['y']))
# Restore the original `conlangs` row labels so rows can be traced back.
df.index = language['indexes']
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,255,256,257,258,259,260,261,262,263,y
81,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
96,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
120,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
155,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
213,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [28]:
# Phoneme columns that are 0 in every row carry no information.
# Comparing against 0 directly avoids `value_counts()[0]`, which raises
# KeyError for a column that contains no zeros at all. The label column `y`
# is excluded so it can never be selected for removal.
empty_columns = [
    column for column in df.columns
    if column != 'y' and (df[column] == 0).all()
]

In [29]:
# Remove the all-zero phoneme columns. `axis` is passed by keyword because
# newer pandas releases no longer accept it positionally.
df = df.drop(empty_columns, axis=1)

In [30]:
# Class balance: number of rows per label.
df['y'].value_counts()

1    63
0    62
Name: y, dtype: int64

In [31]:
# Split the table into the feature matrix and the label vector.
# `axis` is passed by keyword because newer pandas releases no longer accept
# it positionally. Both results already carry df's index, so no index
# reassignment is needed.
X = df.drop(['y'], axis=1)
Y = df['y']

In [32]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=44,
)

In [48]:
# Baseline forest with hand-picked settings, using all cores and a fixed seed.
forest_params = dict(n_jobs=-1, random_state=0, n_estimators=1000, max_depth=13)
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [49]:
# Score on the rows the forest was fitted on.
train_accuracy = model.score(X_train, y_train)
train_accuracy

1.0

In [50]:
# Score on the held-out rows.
test_accuracy = model.score(X_test, y_test)
test_accuracy

0.68421052631578949

In [53]:
# Cross-validated accuracy over the whole table (cv=10): mean, then standard
# deviation. The single-argument print(...) form prints the same text on
# Python 2 and Python 3.
cvs = cross_val_score(model, X, Y, scoring="accuracy", cv=10)
print(round(np.mean(cvs), 2))
print(round(np.std(cvs), 2))

0.67
0.17


In [54]:
# Cross-validated precision over the whole table (cv=10): mean, then standard
# deviation. The single-argument print(...) form prints the same text on
# Python 2 and Python 3.
cvs = cross_val_score(model, X, Y, scoring="precision", cv=10)
print(round(np.mean(cvs), 2))
print(round(np.std(cvs), 2))

0.69
0.17


In [55]:
# Cross-validated recall over the whole table (cv=10): mean, then standard
# deviation. The single-argument print(...) form prints the same text on
# Python 2 and Python 3.
cvs = cross_val_score(model, X, Y, scoring="recall", cv=10)
print(round(np.mean(cvs), 2))
print(round(np.std(cvs), 2))

0.65
0.18


In [64]:
# Pick the languages at positions 2 and 4 of the frequency ranking.
ranked_languages = conlangs.language.value_counts().index
interesting_language = [ranked_languages[position] for position in (2, 4)]

In [83]:
def _build_phoneme_frame(interesting_language):
    """Return one row per conlang (phoneme vector plus label `y`) for the given languages."""
    phonems, indexes, classes = [], [], []
    for class_id, native_language in enumerate(interesting_language):
        rows = conlangs[conlangs.language == native_language]
        for row_index, raw_phonems in zip(rows.index, rows.conlang_phonems):
            # The cell is parsed as text of the form "[v1, v2, ...]".
            # A real list (not a lazy `map` object) is needed so the vector
            # is still intact after sum() on Python 3 as well as Python 2.
            vector = [float(value) for value in raw_phonems[1:-1].split(', ')]
            if sum(vector) != 0:  # skip conlangs that have no phonemes at all
                phonems.append(vector)
                indexes.append(row_index)
                classes.append(class_id)
    df = pd.DataFrame(data=phonems)
    df['y'] = classes
    df.index = indexes  # keep the original `conlangs` row labels
    return df


def _drop_all_zero_columns(df):
    """Return `df` without the phoneme columns that are 0 in every row."""
    # Comparing against 0 directly avoids `value_counts()[0]`, which raises
    # KeyError for a column without any zeros; `y` is never a candidate.
    empty_columns = [
        column for column in df.columns
        if column != 'y' and (df[column] == 0).all()
    ]
    return df.drop(empty_columns, axis=1)


def _balance_classes(df):
    """Truncate the larger class to the size of the smaller one (first rows are kept)."""
    zeros = df[df.y == 0]
    ones = df[df.y == 1]
    # The smaller class goes first, exactly as in the original row order.
    if len(zeros) > len(ones):
        return pd.concat([ones, zeros[0:len(ones)]])
    return pd.concat([zeros, ones[0:len(zeros)]])


def scoring_two_language(first_language, second_language):
    """Tune and cross-validate a random forest that tells two author languages apart.

    The two languages are taken from the frequency ranking of
    `conlangs.language` (module-level DataFrame): position `first_language`
    becomes class 0 and position `second_language` becomes class 1. The
    function prints the chosen languages, the balanced class sizes, the best
    grid-search parameters and the mean/std of 5-fold accuracy, precision and
    recall. It returns nothing.

    Parameters
    ----------
    first_language, second_language : int
        Positions in `conlangs.language.value_counts().index`.

    Raises
    ------
    ValueError
        If, after balancing, a class has fewer rows than the number of
        cross-validation folds (5), since 5-fold scoring cannot run then.
    """
    n_folds = 5

    language_counts = conlangs.language.value_counts()
    interesting_language = [
        language_counts.index[first_language],
        language_counts.index[second_language],
    ]
    print(interesting_language)

    df = _build_phoneme_frame(interesting_language)
    df = _drop_all_zero_columns(df)
    df = _balance_classes(df)

    n_zeros = len(df[df.y == 0])
    n_ones = len(df[df.y == 1])
    print("%d %d" % (n_zeros, n_ones))
    if min(n_zeros, n_ones) < n_folds:
        # Fail before the expensive grid search rather than after it.
        raise ValueError(
            "need at least %d rows per class for %d-fold cross-validation, got %d and %d"
            % (n_folds, n_folds, n_zeros, n_ones)
        )

    X = df.drop(['y'], axis=1)
    Y = df['y']

    # Hyper-parameter search.
    forest = RandomForestClassifier(n_jobs=-1, random_state=44)
    parameter_grid = {
        'n_estimators': [100, 150, 300, 500],
        'max_depth': [5, 7, 9],
        'min_samples_split': [2, 4],
        'min_samples_leaf': [1, 2],
    }
    # `model_selection` replaces the deprecated `grid_search` and
    # `cross_validation` modules; the new splitter takes `n_splits` and gets
    # the labels at fit time instead of at construction time.
    splitter = model_selection.StratifiedShuffleSplit(
        n_splits=5, test_size=0.3, random_state=0)
    grid_search = model_selection.GridSearchCV(
        forest, parameter_grid, scoring='accuracy', cv=splitter)
    grid_search.fit(X, Y)
    best_params = grid_search.best_params_
    print(best_params)

    # NOTE(review): the parameters were tuned on the same rows that are
    # scored below, so these scores are optimistic; nested cross-validation
    # or a held-out set would give an unbiased estimate.
    model = RandomForestClassifier(n_jobs=-1, random_state=44, **best_params)
    for metric in ("accuracy", "precision", "recall"):
        cvs = cross_val_score(model, X, Y, scoring=metric, cv=n_folds)
        print(metric)
        print("%s %s" % (round(np.mean(cvs), 2), round(np.std(cvs), 2)))
    print('---------------')

In [None]:
# TODO: balance the classes; figure out cross_val_score

In [84]:
%%time
for i in range(0, 25):
    for j in range(i, 25):
        if i != j:
            scoring_two_language(i, j)

["['English']", "['German (Deutsch)']"]
62 62
{'min_samples_split': 2, 'n_estimators': 100, 'max_depth': 7, 'min_samples_leaf': 2}
accuracy
0.59 0.09
precision
0.57 0.07
recall
0.68 0.2
---------------
["['English']", "['French (Fran\\xc3\\xa7ais)']"]
63 63
{'min_samples_split': 2, 'n_estimators': 300, 'max_depth': 7, 'min_samples_leaf': 2}
accuracy
0.54 0.1
precision
0.54 0.1
recall
0.53 0.23
---------------
["['English']", "['Swedish (Svenska)']"]
57 57
{'min_samples_split': 2, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 2}
accuracy
0.6 0.07
precision
0.59 0.06
recall
0.68 0.15
---------------
["['English']", "['Portuguese (Portugu\\xc3\\xaas)']"]
39 39
{'min_samples_split': 4, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.63 0.09
precision
0.62 0.08
recall
0.62 0.21
---------------
["['English']", "['Dutch (Nederlands)']"]
38 38
{'min_samples_split': 2, 'n_estimators': 150, 'max_depth': 9, 'min_samples_leaf': 1}
accuracy
0.67 0.12
precision
0.71

  'precision', 'predicted', average, warn_for)


precision
0.57 0.33
recall
0.63 0.37
---------------
["['English']", "['Slovenian (Sloven\\xc5\\xa1\\xc4\\x8dina)']"]
9 9
{'min_samples_split': 2, 'n_estimators': 500, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.75 0.22
precision
0.63 0.37
recall
0.8 0.4
---------------
["['English']", "['Chechen (\\xd0\\x9d\\xd0\\xbe\\xd1\\x85\\xd1\\x87\\xd0\\xb8\\xd0\\xb9\\xd0\\xbd \\xd0\\xbc\\xd0\\xbe\\xd1\\x82\\xd1\\x82)']"]
9 9
{'min_samples_split': 2, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.85 0.2
precision
0.83 0.21
recall
1.0 0.0
---------------
["['English']", "['Norwegian (N) (Norsk (Nynorsk))', 'Norwegian (B) (Norsk (Bokm\\xc3\\xa5l))']"]
9 9
{'min_samples_split': 2, 'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 1}
accuracy
0.5 0.16
precision
0.5 0.11
recall
0.9 0.2
---------------
["['English']", "['Catalan (Catal\\xc3\\xa0)', 'Spanish (Espa\\xc3\\xb1ol)']"]
8 8
{'min_samples_split': 2, 'n_estimators': 150, 'max_depth': 5, 'min_samples_leaf': 

KeyboardInterrupt: 