PART 1

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
import datetime

Preparation data

In [2]:
data = pd.read_csv("C:/ML/project/features.csv", index_col='match_id')
y_train = data['radiant_win']
x = data.drop(['duration', 'radiant_win', 'tower_status_radiant', 
               'tower_status_dire', 'barracks_status_radiant', 
               'barracks_status_dire'], axis = 1)
print('columns with NaN:')
for i in x.columns:
    if x[i].hasnans:
        #printing names of column with NaN
        print(i)
        x[i] = x[i].fillna(0)

columns with NaN:
first_blood_time
first_blood_team
first_blood_player1
first_blood_player2
radiant_bottle_time
radiant_courier_time
radiant_flying_courier_time
radiant_first_ward_time
dire_bottle_time
dire_courier_time
dire_flying_courier_time
dire_first_ward_time


Validation

In [64]:
for trees in [10, 20, 30, 40, 50]:
    start_time = datetime.datetime.now()
    model = GradientBoostingClassifier(n_estimators=trees, random_state=17, learning_rate=0.2)
    cv = KFold(len(y_train), n_folds=5, random_state=17, shuffle=True)
    scores = cross_val_score(model, x, y_train, cv=cv, scoring='roc_auc')
    #printing results of validation
    print('n:', trees, 'score', scores.mean(), 'Time:', datetime.datetime.now() - start_time)

n: 10 score 0.678414075423 Time: 0:03:31.908786
n: 20 score 0.692392722997 Time: 0:06:38.853878
n: 30 score 0.69886478311 Time: 0:09:07.724268
n: 40 score 0.702504635693 Time: 0:12:20.934718
n: 50 score 0.705227734735 Time: 0:14:30.780691


PART 2

In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
import datetime
import numpy as np

Preparation data

In [4]:
data = pd.read_csv("C:/ML/project/features.csv", index_col='match_id')
y_train = data['radiant_win']
x = data.drop(['duration', 'radiant_win', 'tower_status_radiant', 
               'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'], axis = 1)
x.fillna(0, inplace = True)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x)

Cross validation

In [67]:
c = 0.0001
while c <=100:
    start_time = datetime.datetime.now()
    model = LogisticRegression(C=c, penalty = 'l2', random_state = 31)
    cv = KFold(len(y_train), n_folds=5, random_state=17, shuffle=True)
    scores = cross_val_score(model, x_train_scaled, y_train, cv=cv, scoring='roc_auc')
    #printing results of validation
    print('C:', c, '  Score:', scores.mean(), 'Time:', datetime.datetime.now() - start_time)
    c *= 10

C: 0.0001   Score: 0.711175170448 Time: 0:00:14.624822
C: 0.001   Score: 0.716138080286 Time: 0:00:24.645288
C: 0.01   Score: 0.716325474946 Time: 0:00:32.295612
C: 0.1   Score: 0.716300460534 Time: 0:00:33.788337
C: 1.0   Score: 0.716296998325 Time: 0:00:32.740246
C: 10.0   Score: 0.716296816332 Time: 0:00:32.956178
C: 100.0   Score: 0.716296764514 Time: 0:00:33.335013


Delete columns and validation

In [5]:
x_new = x.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero',
                'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_new)
c = 0.01
start_time = datetime.datetime.now()
model = LogisticRegression(C=c, penalty = 'l2', random_state = 31)
cv = KFold(len(y_train), n_folds=5, random_state=17, shuffle=True)
scores = cross_val_score(model, x_train_scaled, y_train, cv=cv, scoring='roc_auc')
print('C:', c, '  Score:', scores.mean(), 'Time:', datetime.datetime.now() - start_time)

C: 0.01   Score: 0.716366662587 Time: 0:00:32.523235


Bag of words

In [21]:
data = pd.read_csv("C:/ML/project/features.csv", index_col='match_id')

y_train = data['radiant_win']
x = data.drop(['duration', 'radiant_win', 'tower_status_radiant', 
               'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'], axis = 1)
x.fillna(0, inplace = True)

#searching number of unique heroes
n = []
for i in range(5):
    r = x['r%d_hero' % (i+1)].unique().tolist()
    d = x['d%d_hero' % (i+1)].unique().tolist()
    n = np.concatenate((n, r, d))
n = np.unique(n)
print('unique hero:', len(n))

N_hero = int(n.max()) 
X_pick = np.zeros((x.shape[0], N_hero))
for i, match_id in enumerate(x.index):
    for p in range(5):
        X_pick[i, x.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, x.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

# dataframe with heroes
labels = map(lambda x: 'hero_%d' % (x+1), range(0, X_pick.shape[1]))
x_hero = pd.DataFrame(X_pick, columns=labels)

x_new = x.drop(['lobby_type', 'r1_hero', 'r2_hero', 
                'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 
                'd3_hero', 'd4_hero', 'd5_hero'], axis = 1)
x_new = np.concatenate((x_new, x_hero), axis=1)

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_new)
print('X_new size ', x_new.shape)

unique hero: 108
X_new size  (97230, 203)


Validation with bag of words

In [34]:
c = 0.01
while c <=0.01:
    start_time = datetime.datetime.now()
    model = LogisticRegression(C=c, penalty = 'l2', random_state = 31)
    cv = KFold(len(y_train), n_folds=5, random_state=17, shuffle=True)
    print(len(y_train))
    scores = cross_val_score(model, x_scaled, y_train, cv=cv, scoring='roc_auc')
    print('C:', c, '  Score:', scores.mean(), 'Time:', datetime.datetime.now() - start_time)
    c *= 10

97230
C: 0.01   Score: 0.751885092001 Time: 0:00:53.292816


Fitting model for testing

In [35]:
model.fit(x_scaled, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=31, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Testing with features_test.csv

In [59]:
data_test = pd.read_csv("C:/ML/project/features_test.csv", index_col='match_id')
x = data_test

x.fillna(0, inplace = True)

#searching number of unique heroes

n = []
for i in range(5):
    r = x['r%d_hero' % (i+1)].unique().tolist()
    d = x['d%d_hero' % (i+1)].unique().tolist()
    n = np.concatenate((n, r, d))
n = np.unique(n)

N_hero = int(n.max()) 
X_pick = np.zeros((x.shape[0], N_hero))
for i, match_id in enumerate(x.index):
    for p in range(5):
        X_pick[i, x.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, x.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

# dataframe with heroes
labels = map(lambda x: 'hero_%d' % (x+1), range(0, X_pick.shape[1]))
x_hero = pd.DataFrame(X_pick, columns=labels)

x_new = x.drop(['lobby_type', 'r1_hero', 'r2_hero', 
                'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 
                'd3_hero', 'd4_hero', 'd5_hero'], axis = 1)
x_new = np.concatenate((x_new, x_hero), axis=1)


x_new_scaled = scaler.transform(x_new)

#print('Matrix size ', x)
print('min:', model.predict_proba(x_new_scaled).min())
print('max:', model.predict_proba(x_new_scaled).max())
pred = model.predict_proba(x_new_scaled)
submission = pd.DataFrame({'match_id' : x.index, 'radiant_win' : pred[:,1]})
submission.to_csv('C:/ML/Project/results.csv', index=False)


min: 0.00372237596433
max: 0.996277624036
