In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline




In [2]:
from sklearn.calibration import calibration_curve
from sklearn.cross_validation import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.grid_search import GridSearchCV

from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
import operator

In [4]:
def fillMissingData(df):
    """Fill every missing value of ``df`` column by column and return ``df``.

    Columns whose name contains ``'time'`` get their gaps filled with
    ``1.1 * max`` of the column; every other column with gaps is filled with
    the column mean, so such columns must be numeric.

    The frame is modified in place (columns are reassigned on ``df`` itself)
    and the same object is returned, so existing callers that rely on either
    the mutation or the return value keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, with the gaps filled.
    """
    for column in df.columns:
        values = df[column]
        if not values.isnull().any():
            continue
        if 'time' in column:
            fill_value = values.max() * 1.1
        else:
            fill_value = values.mean()
        # Assign the filled column back instead of calling
        # ``df[column].fillna(..., inplace=True)``: the chained in-place form
        # operates on a temporary and may leave ``df`` untouched.
        df[column] = values.fillna(fill_value)
    return df

In [5]:
def prepareData(q, path='./data/features_collection.csv'):
    """Load the match feature table and reduce per-player stats to team means.

    Steps, in order:

    1. read ``path`` with ``match_id`` as the index;
    2. take the ``radiant_win`` column as the label vector;
    3. fill missing values with ``fillMissingData``;
    4. drop the label and the other match-outcome / metadata columns;
    5. for every statistic named in ``q`` add ``r_<stat>`` and ``d_<stat>``,
       the mean of that statistic over players 1..5 of each side;
    6. drop all per-player statistic columns (whether or not they were
       aggregated) and the two ``first_blood_player*`` columns.

    Parameters
    ----------
    q : container of str
        Statistics to aggregate; membership is tested with ``in`` for each of
        ``'gold', 'level', 'xp', 'lh', 'kills', 'deaths', 'items'``.
    path : str, optional
        CSV file to read. Defaults to the location used by this notebook.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The reduced feature frame and the ``radiant_win`` values.
    """
    stats = ('gold', 'level', 'xp', 'lh', 'kills', 'deaths', 'items')
    sides = ('r', 'd')
    players = range(1, 6)

    df = pd.read_csv(path, index_col='match_id')
    y = df.radiant_win.values[:]
    df = fillMissingData(df)
    df = df.drop(['start_time', 'lobby_type', 'duration', 'radiant_win',
                  'tower_status_radiant', 'tower_status_dire',
                  'barracks_status_radiant', 'barracks_status_dire'], axis=1)

    # Team-level aggregates, added in the fixed order of ``stats``.
    for stat in stats:
        if stat in q:
            for side in sides:
                member_columns = ['%s%d_%s' % (side, p, stat) for p in players]
                df['%s_%s' % (side, stat)] = df[member_columns].mean(axis=1)

    # The raw per-player columns are always removed, aggregated or not.
    per_player_columns = ['%s%d_%s' % (side, p, stat)
                          for p in players for side in sides for stat in stats]
    df = df.drop(per_player_columns + ['first_blood_player1', 'first_blood_player2'],
                 axis=1)
    return (df, y)

In [6]:
def binaryHeroVectorize(data, heroes_path='./data/dictionaries/heroes.csv'):
    """Turn the ten ``*_hero`` id columns into a signed hero-pick matrix.

    The hero dictionary CSV must have ``id`` and ``name`` columns. For each
    row of ``data`` the result holds ``+1`` in the column of every hero found
    in ``r1_hero``..``r5_hero`` and ``-1`` for every hero found in
    ``d1_hero``..``d5_hero`` (contributions are summed, all other cells are 0).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``r1_hero``..``r5_hero`` and
        ``d1_hero``..``d5_hero`` with hero ids.
    heroes_path : str, optional
        Location of the hero dictionary CSV.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``data`` without the ten hero columns, and the float pick matrix
        indexed like ``data`` with one column per hero name (dictionary order).

    Raises
    ------
    KeyError
        If a hero id in ``data`` is not present in the dictionary.
    """
    heroes = pd.read_csv(heroes_path)

    # One output column per distinct name; an id maps to its first listed name.
    names = []
    column_of_name = {}
    column_of_id = {}
    for hero_id, name in zip(heroes['id'].tolist(), heroes['name'].tolist()):
        if name not in column_of_name:
            column_of_name[name] = len(names)
            names.append(name)
        column_of_id.setdefault(hero_id, column_of_name[name])

    picks = np.zeros((data.shape[0], len(names)))
    rows = np.arange(data.shape[0])
    hero_columns = []
    for side, sign in (('r', 1.0), ('d', -1.0)):
        for p in range(1, 6):
            column = '%s%d_hero' % (side, p)
            hero_columns.append(column)
            hero_ids = data[column].tolist()
            unknown = [h for h in hero_ids if h not in column_of_id]
            if unknown:
                raise KeyError('hero id %r (column %r) is not listed in %r'
                               % (unknown[0], column, heroes_path))
            targets = np.array([column_of_id[h] for h in hero_ids], dtype=np.intp)
            # Every row occurs exactly once per source column, so plain fancy
            # indexing accumulates correctly here.
            picks[rows, targets] += sign

    dataPick = pd.DataFrame(picks, index=data.index, columns=names)
    res = data.drop(hero_columns, axis=1)

    return res, dataPick

In [7]:
def binaryItemVectorize(df, items_path='./data/dictionaries/items.csv'):
    """Turn the ten ``*_items_collection`` columns into a signed item-count matrix.

    Each ``*_items_collection`` cell is a bracketed, comma-separated list of
    item ids such as ``"[1, 2, 2]"`` (``"[]"`` when empty). For every row the
    result holds, per item name, the number of occurrences in
    ``r1..r5_items_collection`` minus the number of occurrences in
    ``d1..d5_items_collection``.

    The previous implementation tested ``isinstance(item_id, int)`` on the
    string tokens produced by ``split``; that test is never true, so no item
    was ever counted. Tokens are now parsed with ``int`` and only empty
    tokens are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ten ``r*/d*_items_collection`` columns.
    items_path : str, optional
        Location of the item dictionary CSV (``id`` and ``name`` columns).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` without the ten collection columns, and the float count matrix
        indexed like ``df`` with one column per item name (dictionary order).

    Raises
    ------
    KeyError
        If an item id in ``df`` is not present in the dictionary.
    ValueError
        If a non-empty token cannot be parsed as an integer.
    """
    items = pd.read_csv(items_path)

    # One output column per distinct name; an id maps to its first listed name.
    names = []
    column_of_name = {}
    column_of_id = {}
    for item_id, name in zip(items['id'].tolist(), items['name'].tolist()):
        if name not in column_of_name:
            column_of_name[name] = len(names)
            names.append(name)
        column_of_id.setdefault(item_id, column_of_name[name])

    counts = np.zeros((df.shape[0], len(names)))
    collection_columns = []
    for side, sign in (('r', 1.0), ('d', -1.0)):
        for p in range(1, 6):
            column = '%s%d_items_collection' % (side, p)
            collection_columns.append(column)
            for row, cell in enumerate(df[column].tolist()):
                tokens = str(cell).replace('[', '').replace(']', '').split(',')
                for token in tokens:
                    token = token.strip()
                    if not token:
                        # "[]" splits into a single empty token.
                        continue
                    item_id = int(token)
                    if item_id not in column_of_id:
                        raise KeyError('item id %r (column %r) is not listed in %r'
                                       % (item_id, column, items_path))
                    counts[row, column_of_id[item_id]] += sign

    dataPick = pd.DataFrame(counts, index=df.index, columns=names)
    res = df.drop(collection_columns, axis=1)

    return res, dataPick

In [8]:
def binaryAbilityVectorize(df, abilities_path='./data/dictionaries/abilities.csv'):
    """Turn the ten ``*_ability_upgrades_collection`` columns into a signed count matrix.

    Each ``*_ability_upgrades_collection`` cell is a bracketed, comma-separated
    list of ability ids such as ``"[5, 5, 7]"`` (``"[]"`` when empty). For
    every row the result holds, per ability name, the number of occurrences in
    the ``r1..r5`` columns minus the number of occurrences in the ``d1..d5``
    columns.

    The previous implementation tested ``isinstance(item_id, int)`` on the
    string tokens produced by ``split``; that test is never true, so no
    ability was ever counted. Tokens are now parsed with ``int`` and only
    empty tokens are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ten ``r*/d*_ability_upgrades_collection`` columns.
    abilities_path : str, optional
        Location of the ability dictionary CSV (``id`` and ``name`` columns).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` without the ten collection columns, and the float count matrix
        indexed like ``df`` with one column per ability name (dictionary order).

    Raises
    ------
    KeyError
        If an ability id in ``df`` is not present in the dictionary.
    ValueError
        If a non-empty token cannot be parsed as an integer.
    """
    abilities = pd.read_csv(abilities_path)

    # One output column per distinct name; an id maps to its first listed name.
    names = []
    column_of_name = {}
    column_of_id = {}
    for ability_id, name in zip(abilities['id'].tolist(), abilities['name'].tolist()):
        if name not in column_of_name:
            column_of_name[name] = len(names)
            names.append(name)
        column_of_id.setdefault(ability_id, column_of_name[name])

    counts = np.zeros((df.shape[0], len(names)))
    collection_columns = []
    for side, sign in (('r', 1.0), ('d', -1.0)):
        for p in range(1, 6):
            column = '%s%d_ability_upgrades_collection' % (side, p)
            collection_columns.append(column)
            for row, cell in enumerate(df[column].tolist()):
                tokens = str(cell).replace('[', '').replace(']', '').split(',')
                for token in tokens:
                    token = token.strip()
                    if not token:
                        # "[]" splits into a single empty token.
                        continue
                    ability_id = int(token)
                    if ability_id not in column_of_id:
                        raise KeyError('ability id %r (column %r) is not listed in %r'
                                       % (ability_id, column, abilities_path))
                    counts[row, column_of_id[ability_id]] += sign

    dataPick = pd.DataFrame(counts, index=df.index, columns=names)
    res = df.drop(collection_columns, axis=1)

    return res, dataPick

In [9]:
# def prepareScaleData(q, scale=True):
#     (data, train_labels) = prepareData(q)
#     data2, data_heroes = binaryHeroVectorize(data)
#     data3, data_items = binaryItemVectorize(data2)
#     train_data, data_abilities = binaryAbilityVectorize(data3)
#     y = train_labels
# #     X = np.hstack((train_data.values[:,:], data_heroes))
# #     X = np.hstack((X, data_items))
# #     X = np.hstack((X, data_abilities))
#     if scale:
#         scaler = StandardScaler()
#         X_scale = scaler.fit_transform(X)
#         return (X_scale, y, scaler)
#     else:
#         return (X, y)

In [10]:
def prepareDataframe(q):
    """Build the full modelling table for the statistics selected by ``q``.

    Runs ``prepareData(q)`` and then peels the hero, item and ability columns
    off the feature frame one vectorizer at a time. The remaining features and
    the three vectorized matrices are concatenated column-wise, and the labels
    returned by ``prepareData`` are stored in a ``'win'`` column.
    """
    features, labels = prepareData(q)

    # Each vectorizer returns (frame without its source columns, encoded matrix).
    features, hero_picks = binaryHeroVectorize(features)
    features, item_counts = binaryItemVectorize(features)
    features, ability_counts = binaryAbilityVectorize(features)

    combined = pd.concat(
        [features, hero_picks, item_counts, ability_counts],
        axis=1,
    )
    combined['win'] = labels
    return combined

In [11]:
# Names of the per-player statistics that can be passed as ``q`` to
# prepareData / prepareDataframe to request team-level averages.
qFeatures = [
    'gold',
    'level',
    'xp',
    'lh',
    'kills',
    'deaths',
    'items',
]

In [12]:
# qqFeatures = []
# from itertools import combinations
# for i in range(7):
#     for x in combinations(qFeatures, i+1):
#         qqFeatures.append(x)

In [13]:
# qScore = {}

In [14]:
# currentQ = []
# for q in qqFeatures:
#     (X, y, scaler) = prepareScaleData(q)
#     lr = LogisticRegression()
#     gridLogistic = {'C': np.power(10.0, np.arange(-5, 6)), 'penalty':('l1', 'l2')}
#     kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
#     gs = GridSearchCV(lr, gridLogistic, scoring='roc_auc', cv=kf, verbose=0, n_jobs=2)
#     gs.fit(X, y)
#     for a in gs.grid_scores_:
#         qScore["(" + str(q) + "; " + str(a.parameters['C']) + "; " + str(a.parameters['penalty']) + ")"] = a.mean_validation_score

In [15]:
# qScoreSorted = sorted(qScore.items(), key=operator.itemgetter(1), reverse=True)

In [16]:
# for x in qScoreSorted:
#     print "par:", x[0], ', roc: ', x[1]

In [17]:
#new_dataframe = prepareDataframe(qFeatures)

In [29]:
#for i in range(9):
#    new_dataframe[10000*i:10000*(i + 1)].to_csv('./data/new_df{}_Index.csv'.format(i), 
#                                                sep=',', 
#                                                encoding='utf-8', 
#                                                header=True, 
#                                                index=True)

In [12]:
# Load one saved part of the prepared feature table, indexed by match_id.
# NOTE(review): presumably the first 10000-row slice written by the
# commented-out export cell (new_df{i}_Index.csv) — confirm.
df = pd.read_csv('./data/new_df0_Index.csv', index_col='match_id')

In [13]:
# Feature matrix: every column except the 'win' label, as a NumPy array.
X = df.drop('win', axis=1).values
# Target vector: the 'win' column.
y = df['win'].values

In [14]:
# Degree-2 polynomial expansion (no bias column) of the first 31 feature columns.
# NOTE(review): 31 is a magic number — presumably the count of the leading
# non-indicator columns of df; confirm against the actual column layout.
poly = PolynomialFeatures(2,include_bias=False)
X_poly = poly.fit_transform(X[:,:31])

In [15]:
# new_dataframe.to_csv('./data/new_dataframe_noIndex.csv', sep=',', encoding='utf-8', header=True, index=False)

In [16]:
# df_to_write.to_csv('./data/featureCsv.csv', sep=',', encoding='utf-8', header=False, index=False)

In [17]:
# Grid search for logistic regression, scored by ROC AUC on 5 shuffled,
# stratified folds with a fixed seed.
score = {}

gridLogistic = {
    'C': np.power(10.0, np.arange(-5, 6)),
    'penalty': ('l1', 'l2'),
    'fit_intercept': (True, False),
}
kf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=42)

lr = LogisticRegression()
gs = GridSearchCV(
    lr,
    gridLogistic,
    scoring='roc_auc',
    cv=kf,
    verbose=0,
    n_jobs=4,
)

In [None]:
# Fit the grid search on the original features stacked with the polynomial ones.
# NOTE(review): a degree-2 PolynomialFeatures output already contains the
# degree-1 terms, so the columns X[:, :31] appear twice in this matrix —
# confirm the duplication is intended.
gs.fit(np.hstack((X,X_poly)), y)

In [None]:
# Only a part of the sample is needed here; otherwise this is brutally heavy.