In [1]:
# All project packages imported at the start

# Project packages
import pandas as pd
import numpy as np

# Visualisations
import matplotlib.pyplot as plt 
import seaborn as sns

# Statistics
from scipy import stats
from scipy.stats import norm, skew
from statistics import mode

# Machine Learning
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso, Ridge, RidgeCV, ElasticNet


from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge


    
train = pd.read_csv('train.csv')
    
test = pd.read_csv('test.csv')



In [6]:
train = train[:5000]

In [18]:
train_ID = train['soldierId']
test_ID = test['soldierId']

train.drop("soldierId", axis = 1, inplace = True)
test.drop("soldierId", axis = 1, inplace = True)



train.drop("shipId", axis = 1, inplace = True)
test.drop("shipId", axis = 1, inplace = True)
train.drop("attackId", axis = 1, inplace = True)
test.drop("attackId", axis = 1, inplace = True)

In [19]:
train['knockedOutSoldiers'].fillna(train['knockedOutSoldiers'].mode()[0], inplace=True)
train['horseRideDistance'].fillna(train['horseRideDistance'].mode()[0], inplace=True)
train['respectEarned'].fillna(train['respectEarned'].mode()[0], inplace=True)

# Watchout for null features

In [20]:
train.isnull().sum()

assists                0
greekFireItems         0
healthLost             0
knockedOutSoldiers     0
throatSlits            0
healingPotionsUsed     0
killRank               0
killPoints             0
enemiesKilled          0
killingStreaks         0
farthermostKill        0
numShips               0
numSaves               0
horseRideDistance      0
horseRideKills         0
swimmingDistance       0
friendlyKills          0
castleTowerDestroys    0
onFootDistance         0
weaponsUsed            0
respectEarned          0
bestSoldierPerc        0
dtype: int64

# Preparing X and y  dataset

In [21]:
x = train.drop(['bestSoldierPerc',],1)
y = train.bestSoldierPerc

# Ridge Regression

In [39]:
from sklearn import cross_validation
x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size =0.2)

from sklearn.linear_model import ridge
from sklearn.preprocessing import MinMaxScaler
 
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

linridge = Ridge(alpha=1).fit(X_train_scaled, y_train)

print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test_scaled, y_test)))


ridge regression linear model intercept: 0.5105995517843949
ridge regression linear model coeff:
[ 0.31647709  0.20115017  0.11536082 -0.00886926  0.01195346  0.04005453
 -0.60510074 -0.05759595 -0.08763255 -0.44075012 -0.07159974  0.12477566
  0.09910973  0.0647425   0.16966092  0.11379669 -0.1485357  -0.02972799
  1.1766115   0.46126802  0.14572295]
Number of non-zero features: 21
R-squared score (training): 0.793
R-squared score (test): 0.780


# Lasso Regression

In [82]:
from sklearn.linear_model import Lasso

alpha_lasso = [1e-20, 1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10]

train_score = []
test_score = []

for alpha in alpha_lasso:
    
    lasso_model= Lasso(alpha=alpha, max_iter=100000)
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size =0.2)
    lasso_model.fit(x_train, y_train)
    train_score.append(lasso_model.score(x_train, y_train))
    test_score.append(lasso_model.score(x_test, y_test))
lasso_scores = pd.DataFrame({'alpha_lasso':alpha_lasso, 'train_score':train_score, 'test_score':test_score})

# for index, coefficient in enumerate(lasso_model.coef_):
    



     alpha_lasso  train_score  test_score
0   1.000000e-20     0.789366    0.795891
1   1.000000e-15     0.792952    0.780651
2   1.000000e-10     0.786942    0.806365
3   1.000000e-08     0.794554    0.773992
4   1.000000e-05     0.787003    0.804539
5   1.000000e-04     0.795201    0.772055
6   1.000000e-03     0.791681    0.781448
7   1.000000e-02     0.776220    0.795328
8   1.000000e+00     0.713075    0.715367
9   5.000000e+00     0.672255    0.685048
10  1.000000e+01     0.660728    0.644450
intercept  0.25318805203354955


### Lasso Intercept and train, test score

In [None]:
print(lasso_scores.head(11))
print('intercept ',lasso_model.intercept_)

### Importance of features by lasso

In [91]:
df = pd.DataFrame({"features": x_train.columns, "coeficient": lasso_model.coef_} )
df.sort_values(by='coeficient', ascending= False)

Unnamed: 0,features,coeficient
18,onFootDistance,0.000207
2,healthLost,3.1e-05
13,horseRideDistance,1.2e-05
0,assists,0.0
11,numShips,0.0
19,weaponsUsed,0.0
17,castleTowerDestroys,-0.0
16,friendlyKills,-0.0
15,swimmingDistance,0.0
14,horseRideKills,0.0


# 

In [40]:
clf = LinearRegression()
clf.fit(x_train,y_train)
accuracy = clf.score(x_test,y_test)

In [43]:
pred_test = clf.predict(test)

In [61]:
clf.coef_

array([ 2.86516387e-02,  9.80931996e-03,  8.92999877e-05, -8.94610652e-03,
        2.56683216e-02,  3.44308922e-04, -5.48102298e-03, -1.53550441e-04,
       -8.02650882e-03, -8.62420953e-02, -1.40057895e-04,  1.62250147e-03,
        1.54626608e-02, -1.31714601e-06,  1.41787946e-01,  3.64862517e-04,
       -7.19374087e-02, -2.37457699e-02,  1.37218047e-04,  1.38473036e-02,
        7.34324466e-04])

In [62]:
clf.intercept_

-0.45071163503530387

In [44]:
pred_test = clf.predict(test)

In [45]:
submission=pd.read_csv("Sample_Submission.csv")

In [46]:
submission['bestSoldierPerc']=pred_test
submission['soldierId']=test_original['soldierId']

In [47]:
pd.DataFrame(submission, columns=['soldierId','bestSoldierPerc']).to_csv('sub.csv')

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeClassifier(random_state=101)

treeReg = DecisionTreeRegressor(random_state=0, max_depth=5)
modelRegTree = treeReg.fit(x,y)
# print(f'Decision tree has {treeReg.tree_.node_count} nodes with maximum depth {treeReg.tree_.max_depth}.')
# print('*'*40)
# print(f'Model Accuracy: {treeReg.score(x, y)}')

treeReg.score(x, y)

0.8325354984838659

In [49]:
pred_test_tree = treeReg.predict(test)

submission1=pd.read_csv("Sample_Submission.csv")

submission1['bestSoldierPerc']=pred_test_tree
submission1['soldierId']=test_original['soldierId']

pd.DataFrame(submission1, columns=['soldierId','bestSoldierPerc']).to_csv('sub_tree.csv')