In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

df_train_features = pd.read_csv('train_features.csv', index_col='match_id_hash')
df_train_targets = pd.read_csv('train_targets.csv', index_col='match_id_hash')

In [2]:
df_train_features.shape

(39675, 245)

In [3]:
df_train_features.head()

Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,d5_stuns,d5_creeps_stacked,d5_camps_stacked,d5_rune_pickups,d5_firstblood_claimed,d5_teamfight_participation,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,7,...,0.0,0,0,0,0,0.0,0,0,0,0
6db558535151ea18ca70a6892197db41,21,23,0,0,0,101,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,576,22,7,1,4,14,1,0,3,1,...,8.664527,3,1,3,0,0.0,0,0,2,0
b1b35ff97723d9b7ade1c9c3cf48f770,453,22,7,1,3,42,0,1,1,0,...,0.0,2,1,2,0,0.25,0,0,0,0


In [4]:
df_train_targets.head()

Unnamed: 0_level_0,game_time,radiant_win,duration,time_remaining,next_roshan_team
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,False,992,837,
b9c57c450ce74a2af79c9ce96fac144d,658,True,1154,496,
6db558535151ea18ca70a6892197db41,21,True,1503,1482,Radiant
46a0ddce8f7ed2a8d9bd5edcbb925682,576,True,1952,1376,
b1b35ff97723d9b7ade1c9c3cf48f770,453,False,2001,1548,


In [5]:
df_train_targets['radiant_win'] = df_train_targets['radiant_win'].astype(int)
df_train_targets.head()

Unnamed: 0_level_0,game_time,radiant_win,duration,time_remaining,next_roshan_team
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,0,992,837,
b9c57c450ce74a2af79c9ce96fac144d,658,1,1154,496,
6db558535151ea18ca70a6892197db41,21,1,1503,1482,Radiant
46a0ddce8f7ed2a8d9bd5edcbb925682,576,1,1952,1376,
b1b35ff97723d9b7ade1c9c3cf48f770,453,0,2001,1548,


In [6]:
X = df_train_features.values
y = df_train_targets['radiant_win'].values

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

print("Jumlah data pelatihan: ", len(X_train))
print("Jumlah data pengujian: ", len(X_test))

Jumlah data pelatihan:  31740
Jumlah data pengujian:  7935


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
%%time
model_LR = LogisticRegression(max_iter=200)

model_LR.fit(X_train_scaled, y_train)

print("Koefisien regresi: ", model_LR.coef_)
print("Intersep: ", model_LR.intercept_)

Koefisien regresi:  [[-9.06776163e-02 -4.79707123e-03  1.19808008e-02  2.99241924e-02
  -2.48113108e-02 -6.98972898e-03 -2.11307252e-02 -9.58575344e-02
  -4.58327356e-02  4.08799766e-02  6.30943725e-01 -6.20080347e-02
  -1.54998376e-01  3.44550510e-02  6.33319561e-02  3.88599730e-02
   8.36262388e-01  1.68672192e-02  4.48545323e-02  2.81258804e-02
   1.29552804e-02 -9.50171441e-03  1.70865966e-02 -2.05588487e-02
   5.80202760e-02  7.58711276e-02 -2.58981473e-02  3.36973076e-02
  -2.15854382e-02 -1.67905054e-02  3.43705139e-02 -2.77886783e-02
   7.27066841e-02  6.57852395e-02  7.56824565e-01 -4.74877067e-02
  -1.85586620e-01  5.65465282e-02  4.91711824e-02  1.31028216e-02
   7.54173827e-01  2.11178195e-02  4.33733591e-02 -2.45154846e-02
   4.19636050e-02 -1.00633587e-02  5.63184707e-02 -4.76113008e-03
   5.67942213e-03  3.94601647e-02 -7.23067983e-03  4.93590047e-02
  -2.40594862e-02  3.91221421e-04  7.74356245e-02 -2.71752139e-02
   1.04906435e-02  6.13611880e-02  4.76815043e-01  2.384

In [10]:
LRpredictions = model_LR.predict(X_test_scaled)
y_pred = model_LR.predict_proba(X_test_scaled)[:, 1]
mae_LR = mean_absolute_error(y_test, LRpredictions)
mse_LR = mean_squared_error(y_test, LRpredictions)
r2_LR = r2_score(y_test, LRpredictions)
print("Mean Absolute Error:", mae_LR)
print("Mean Squared Error:", mse_LR)
print("R-squared:", r2_LR)
y_pred

Mean Absolute Error: 0.277882797731569
Mean Squared Error: 0.277882797731569
R-squared: -0.1134220600262541


array([5.26706608e-04, 5.18992343e-01, 3.87977017e-01, ...,
       5.75114672e-02, 2.65130525e-01, 5.70958655e-01])

In [11]:
valid_accuracy = accuracy_score(y_test, y_pred > 0.5)
print('Validation accuracy of P>0.5 classifier:', valid_accuracy)

Validation accuracy of P>0.5 classifier: 0.722117202268431


In [12]:
confusion_matrix(y_test, y_pred > 0.5)

array([[2509, 1295],
       [ 910, 3221]], dtype=int64)

In [13]:
precision_score(y_test, y_pred > 0.5)

0.7132418069087688

In [14]:
recall_score(y_test, y_pred > 0.5)

0.7797143548777535

In [15]:
f1_score(y_test, y_pred > 0.5)

0.7449982652943218

In [16]:
%%time
model_RFC = RandomForestClassifier(n_estimators=100, max_features=20,max_depth=60,min_samples_split=20,min_samples_leaf=20,n_jobs=-1, random_state=17, oob_score=True)
model_RFC.fit(X_train, y_train)

CPU times: total: 23.9 s
Wall time: 4.01 s


In [17]:
RFCpredictions = model_RFC.predict(X_test)
y_pred = model_RFC.predict_proba(X_test)[:, 1]
mae_RFR = mean_absolute_error(y_test, RFCpredictions)
mse_RFR = mean_squared_error(y_test, RFCpredictions)
r2_RFR = r2_score(y_test, RFCpredictions)
oob_score = model_RFC.oob_score_
print("Mean Absolute Error:", mae_RFR)
print("Mean Squared Error:", mse_RFR)
print("R-squared:", r2_RFR)
print(f'Out-of-Bag Score: {oob_score}')
y_pred

Mean Absolute Error: 0.30031505986137363
Mean Squared Error: 0.30031505986137363
R-squared: -0.20330375013268176
Out-of-Bag Score: 0.6953371140516699


array([0.06234798, 0.51205757, 0.44171135, ..., 0.2261908 , 0.32540086,
       0.55318263])

In [18]:
valid_accuracy = accuracy_score(y_test, y_pred > 0.5)
print('Validation accuracy of P>0.5 classifier:', valid_accuracy)

Validation accuracy of P>0.5 classifier: 0.6996849401386264


In [19]:
confusion_matrix(y_test, y_pred > 0.5)

array([[2271, 1533],
       [ 850, 3281]], dtype=int64)

In [20]:
precision_score(y_test, y_pred > 0.5)

0.6815538014125467

In [21]:
recall_score(y_test, y_pred > 0.5)

0.7942386831275721

In [22]:
f1_score(y_test, y_pred > 0.5)

0.7335941866964785

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

df_test_features = pd.read_csv('test_features.csv', index_col='match_id_hash')

X_test = df_test_features.values
X_test_scaled = scaler.fit_transform(X)

y_test_pred = model_LR.predict_proba(X_test_scaled)[:, 1]

df_submissionLR = pd.DataFrame({'radiant_win_prob': y_test_pred[:len(df_test_features)]}, index=df_test_features.index)

X_test = df_test_features.values
y_test_pred = model_RFC.predict_proba(X_test)[:, 1]

df_submissionRFC = pd.DataFrame({'radiant_win_prob': y_test_pred[:len(df_test_features)]}, index=df_test_features.index)

In [24]:
df_submissionLR.head()

Unnamed: 0_level_0,radiant_win_prob
match_id_hash,Unnamed: 1_level_1
30cc2d778dca82f2edb568ce9b585caa,0.265867
70e5ba30f367cea48793b9003fab9d38,0.862875
4d9ef74d3a2025d79e9423105fd73d41,0.580012
2bb79e0c1eaac1608e5a09c8e0c6a555,0.764242
bec17f099b01d67edc82dfb5ce735a43,0.545213


In [25]:
df_submissionRFC.head()

Unnamed: 0_level_0,radiant_win_prob
match_id_hash,Unnamed: 1_level_1
30cc2d778dca82f2edb568ce9b585caa,0.484771
70e5ba30f367cea48793b9003fab9d38,0.878262
4d9ef74d3a2025d79e9423105fd73d41,0.703309
2bb79e0c1eaac1608e5a09c8e0c6a555,0.60156
bec17f099b01d67edc82dfb5ce735a43,0.491609


In [26]:
import pickle

with open('model_LoR.pkl', 'wb') as f:
    pickle.dump(model_LR, f)

with open('model_RFC.pkl', 'wb') as f:
    pickle.dump(model_RFC, f)
