In [7]:
# All project packages imported at the start

# Project packages
import pandas as pd
import numpy as np

# Visualisations
import matplotlib.pyplot as plt 
import seaborn as sns

# Statistics
from scipy import stats
from scipy.stats import norm, skew
from statistics import mode

# Machine Learning
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso, Ridge, RidgeCV, ElasticNet


from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge


    
train = pd.read_csv('train.csv')
    
test = pd.read_csv('test.csv')



In [8]:
from sklearn.utils import shuffle

train = shuffle(train,n_samples= 5000,)


In [9]:
train_ID = train['soldierId']
test_ID = test['soldierId']

train.drop("soldierId", axis = 1, inplace = True)
test.drop("soldierId", axis = 1, inplace = True)



train.drop("shipId", axis = 1, inplace = True)
test.drop("shipId", axis = 1, inplace = True)
train.drop("attackId", axis = 1, inplace = True)
test.drop("attackId", axis = 1, inplace = True)



In [10]:
train['knockedOutSoldiers'].fillna(train['knockedOutSoldiers'].mode()[0], inplace=True)
train['horseRideDistance'].fillna(train['horseRideDistance'].mode()[0], inplace=True)
train['respectEarned'].fillna(train['respectEarned'].mode()[0], inplace=True)

# Watchout for null features

In [11]:
train.isnull().sum()

assists                0
greekFireItems         0
healthLost             0
knockedOutSoldiers     0
throatSlits            0
healingPotionsUsed     0
killRank               0
killPoints             0
enemiesKilled          0
killingStreaks         0
farthermostKill        0
numShips               0
numSaves               0
horseRideDistance      0
horseRideKills         0
swimmingDistance       0
friendlyKills          0
castleTowerDestroys    0
onFootDistance         0
weaponsUsed            0
respectEarned          0
bestSoldierPerc        0
dtype: int64

# Preparing X and y  dataset

In [6]:
x = train.drop(['bestSoldierPerc',],1)
y = train.bestSoldierPerc

In [13]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size =0.2)

from sklearn.dummy import DummyRegressor
lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)

print('train score', lm_dummy_mean.score(X_train, y_train))
print('test score', lm_dummy_mean.score(X_test ,y_test))

train score 0.0
test score -4.527972761736443e-05


# Ridge Regression

In [14]:


x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.2)

from sklearn.linear_model import ridge
from sklearn.preprocessing import MinMaxScaler
 
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

linridge = Ridge(alpha=1).fit(X_train_scaled, y_train)

print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test_scaled, y_test)))


ridge regression linear model intercept: 0.4491486628154672
ridge regression linear model coeff:
[ 0.12030408  0.14158355  0.02725922 -0.00668278  0.01742404 -0.00713891
 -0.67866375 -0.04911489 -0.1550708  -0.49785268 -0.02576611  0.12007147
  0.10785212  0.02796122 -0.0320601   0.11456683 -0.08819847  0.06588871
  0.83056446  0.34133208  0.24598142]
Number of non-zero features: 21
R-squared score (training): 0.822
R-squared score (test): 0.816


# Lasso Regression

In [15]:
from sklearn.linear_model import Lasso

alpha_lasso = [ 1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2,]

train_score = []
test_score = []

for alpha in alpha_lasso:
    
    lasso_model= Lasso(alpha=alpha, max_iter=1000)
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.2)
    lasso_model.fit(x_train, y_train)
    train_score.append(lasso_model.score(x_train, y_train))
    test_score.append(lasso_model.score(x_test, y_test))
lasso_scores = pd.DataFrame({'alpha_lasso':alpha_lasso, 'train_score':train_score, 'test_score':test_score})

# for index, coefficient in enumerate(lasso_model.coef_):
    

In [16]:
lasso_scores

Unnamed: 0,alpha_lasso,train_score,test_score
0,1e-15,0.819394,0.827817
1,1e-10,0.821919,0.816167
2,1e-08,0.823179,0.812093
3,1e-05,0.82401,0.808895
4,0.0001,0.820189,0.824097
5,0.001,0.817214,0.833103
6,0.01,0.809758,0.807264


### Lasso Intercept and train, test score

In [17]:
print(lasso_scores.head(11))
print('intercept ',lasso_model.intercept_)

    alpha_lasso  train_score  test_score
0  1.000000e-15     0.819394    0.827817
1  1.000000e-10     0.821919    0.816167
2  1.000000e-08     0.823179    0.812093
3  1.000000e-05     0.824010    0.808895
4  1.000000e-04     0.820189    0.824097
5  1.000000e-03     0.817214    0.833103
6  1.000000e-02     0.809758    0.807264
intercept  0.1767152616664413


### Importance of features by lasso

In [18]:
df = pd.DataFrame({"features": x_train.columns, "coeficient": lasso_model.coef_} )
df.sort_values(by='coeficient', ascending= False)

Unnamed: 0,features,coeficient
19,weaponsUsed,0.013039
1,greekFireItems,0.007309
11,numShips,0.001185
20,respectEarned,0.000263
18,onFootDistance,0.000144
15,swimmingDistance,0.00013
13,horseRideDistance,6e-06
17,castleTowerDestroys,0.0
16,friendlyKills,-0.0
14,horseRideKills,-0.0


# Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


train_score = []
test_score = []


degree= [1, 2]



for i, degree in enumerate(degree):
    poly = PolynomialFeatures(degree = degree)
    x_poly = poly.fit_transform(x)
    X_train, X_test, y_train, y_test = train_test_split(x_poly, y, random_state=0)
    model = LinearRegression()
    model.fit(X_train, y_train)
    train_score.append(model.score(X_train, y_train))
    test_score.append(model.score(X_test, y_test))
    

    

In [20]:
df = pd.DataFrame({"degree": degree, "train_score": train_score, 'test_score': test_score} )
print(df)

   degree  train_score  test_score
0       2     0.824828    0.808536
1       2     0.896333    0.563574


# Decision tree 

In [21]:
from sklearn.tree import DecisionTreeRegressor

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)


decreg = DecisionTreeRegressor(max_depth=4)

decreg.fit(X_train, y_train)


feature_imp = []

for index, importance in enumerate(decreg.feature_importances_):
    feature_imp.append([importance ,X_train.columns[index]])
    
    
imp_df = pd.DataFrame({'features' : [sublist[1] for sublist in feature_imp] , 
                       'importance' : [sublist[0] for sublist in feature_imp]})

imp_df.sort_values(['importance'] ,ascending= False ,inplace= True)


print('train accuracy :', decreg.score(X_train, y_train))
print('test accuracy :', decreg.score(X_test, y_test))

train accuracy : 0.8355242891560588
test accuracy : 0.7893125711891572


In [22]:
print(imp_df.head())

          features  importance
18  onFootDistance    0.847777
6         killRank    0.139731
11        numShips    0.012492
0          assists    0.000000
19     weaponsUsed    0.000000


# Support vector Regression

In [18]:
from sklearn.model_selection import validation_curve
from sklearn.svm import SVR


X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

param_range =  np.logspace(-10,1,6) 



train_score, test_score = validation_curve(SVR() , X_train, y_train, param_name = 'gamma',
                                           param_range = param_range, cv= 3 )



result= (train_score.mean(axis=1), test_score.mean(axis=1))

In [19]:
print(result)

(array([0.34936128, 0.74305708, 0.82631018, 0.91209785, 0.90527244,
       0.90525877]), array([ 0.34523685,  0.73507572,  0.74580779,  0.36450174, -0.00287026,
       -0.00466934]))


# RandomForest Regression


In [20]:
from sklearn.ensemble import RandomForestRegressor


X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

param_range =  [0.2, 0.5, 0.8,1, None] 



train_score, test_score = validation_curve(RandomForestRegressor() , X_train, y_train, param_name = 'max_depth',
                                           param_range = param_range, cv= 3 )



result= (train_score.mean(axis=1), test_score.mean(axis=1))

max_depth = pd.DataFrame({'param_range': param_range , 'train': train_score.mean(axis=1), 'test': test_score.mean(axis=1)})

In [21]:
print(max_depth)

   param_range     train      test
0          0.2 -0.000343 -0.007105
1          0.5 -0.000202 -0.006038
2          0.8 -0.000117 -0.005365
3          1.0  0.627530  0.612822
4          NaN  0.966868  0.813262


In [22]:
from sklearn.ensemble import RandomForestRegressor


X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

param_range =  [1,5,10,20,30,40,50,60,80,100,120] 



train_score, test_score = validation_curve(RandomForestRegressor() , X_train, y_train, param_name = 'min_samples_leaf',
                                           param_range = param_range, cv= 3 )


result= (train_score.mean(axis=1), test_score.mean(axis=1))

min_sample = pd.DataFrame({'param_range': param_range , 'train': train_score.mean(axis=1), 'test': test_score.mean(axis=1)})


print(min_sample)

    param_range     train      test
0             1  0.967166  0.819845
1             5  0.910789  0.828286
2            10  0.876544  0.819468
3            20  0.836137  0.807423
4            30  0.811028  0.788147
5            40  0.792474  0.777087
6            50  0.783633  0.770990
7            60  0.770532  0.757817
8            80  0.730260  0.715493
9           100  0.661128  0.646082
10          120  0.624587  0.616077


# KNN

In [43]:
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)



n_neighbors  =  [1,2,3,4,5,6,7,9,11,13,15,17,19,21,23,25,27,29,35,41,49,55] 



train_score, test_score = validation_curve(KNeighborsRegressor() , X_train, y_train, param_name = 'n_neighbors',
                                           param_range = n_neighbors, cv= 3 )


result= (train_score.mean(axis=1), test_score.mean(axis=1))

min_sample = pd.DataFrame({'param_range': n_neighbors , 'train': train_score.mean(axis=1), 'test': test_score.mean(axis=1)})



In [44]:
min_sample

Unnamed: 0,param_range,train,test
0,1,1.0,0.540939
1,2,0.891359,0.649675
2,3,0.844431,0.69209
3,4,0.823297,0.709446
4,5,0.810421,0.712593
5,6,0.799984,0.716525
6,7,0.791709,0.722808
7,9,0.779799,0.721363
8,11,0.774404,0.724486
9,13,0.768805,0.728043


In [None]:
submission=pd.read_csv("Sample_Submission.csv")

In [None]:
submission['bestSoldierPerc']=pred_test
submission['soldierId']=test_original['soldierId']

In [None]:
pd.DataFrame(submission, columns=['soldierId','bestSoldierPerc']).to_csv('sub.csv')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeClassifier(random_state=101)

treeReg = DecisionTreeRegressor(random_state=0, max_depth=5)
modelRegTree = treeReg.fit(x,y)
# print(f'Decision tree has {treeReg.tree_.node_count} nodes with maximum depth {treeReg.tree_.max_depth}.')
# print('*'*40)
# print(f'Model Accuracy: {treeReg.score(x, y)}')

treeReg.score(x, y)

In [None]:
pred_test_tree = treeReg.predict(test)

submission1=pd.read_csv("Sample_Submission.csv")

submission1['bestSoldierPerc']=pred_test_tree
submission1['soldierId']=test_original['soldierId']

pd.DataFrame(submission1, columns=['soldierId','bestSoldierPerc']).to_csv('sub_tree.csv')