In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('blackwater').getOrCreate()

In [2]:
from pyspark.ml.regression import RandomForestRegressor

In [3]:
input_data = spark.read.csv('./data/train.csv', header=True, inferSchema=True)

In [102]:
input_test_data = spark.read.csv('./data/test.csv', header=True, inferSchema=True)

In [8]:
input_data.count()

4357336

In [4]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [5]:
input_data.printSchema()

root
 |-- soldierId: double (nullable = true)
 |-- shipId: double (nullable = true)
 |-- attackId: double (nullable = true)
 |-- assists: double (nullable = true)
 |-- greekFireItems: double (nullable = true)
 |-- healthLost: double (nullable = true)
 |-- knockedOutSoldiers: double (nullable = true)
 |-- throatSlits: double (nullable = true)
 |-- healingPotionsUsed: double (nullable = true)
 |-- killRank: double (nullable = true)
 |-- killPoints: double (nullable = true)
 |-- enemiesKilled: double (nullable = true)
 |-- killingStreaks: double (nullable = true)
 |-- farthermostKill: double (nullable = true)
 |-- numShips: double (nullable = true)
 |-- numSaves: double (nullable = true)
 |-- horseRideDistance: double (nullable = true)
 |-- horseRideKills: double (nullable = true)
 |-- swimmingDistance: double (nullable = true)
 |-- friendlyKills: double (nullable = true)
 |-- castleTowerDestroys: double (nullable = true)
 |-- onFootDistance: double (nullable = true)
 |-- weaponsUsed: dou

In [6]:
assembler = VectorAssembler(inputCols=['shipId', 'attackId', 'assists', 'greekFireItems', 'healthLost', 'knockedOutSoldiers', 'throatSlits', 'healingPotionsUsed', 'killRank', 'killPoints', 'enemiesKilled', 'killingStreaks', 'farthermostKill', 'numShips', 'numSaves', 'horseRideDistance', 'horseRideKills', 'swimmingDistance', 'friendlyKills', 'castleTowerDestroys', 'onFootDistance', 'weaponsUsed', 'respectEarned'], outputCol='features')

In [7]:
output_data = assembler.transform(input_data)

In [103]:
output_test_data = assembler.transform(input_test_data)

In [8]:
output_data.printSchema()

root
 |-- soldierId: double (nullable = true)
 |-- shipId: double (nullable = true)
 |-- attackId: double (nullable = true)
 |-- assists: double (nullable = true)
 |-- greekFireItems: double (nullable = true)
 |-- healthLost: double (nullable = true)
 |-- knockedOutSoldiers: double (nullable = true)
 |-- throatSlits: double (nullable = true)
 |-- healingPotionsUsed: double (nullable = true)
 |-- killRank: double (nullable = true)
 |-- killPoints: double (nullable = true)
 |-- enemiesKilled: double (nullable = true)
 |-- killingStreaks: double (nullable = true)
 |-- farthermostKill: double (nullable = true)
 |-- numShips: double (nullable = true)
 |-- numSaves: double (nullable = true)
 |-- horseRideDistance: double (nullable = true)
 |-- horseRideKills: double (nullable = true)
 |-- swimmingDistance: double (nullable = true)
 |-- friendlyKills: double (nullable = true)
 |-- castleTowerDestroys: double (nullable = true)
 |-- onFootDistance: double (nullable = true)
 |-- weaponsUsed: dou

In [9]:
output_data.head(2)

[Row(soldierId=0.0, shipId=24.0, attackId=0.0, assists=0.0, greekFireItems=5.0, healthLost=247.3, knockedOutSoldiers=2.0, throatSlits=0.0, healingPotionsUsed=4.0, killRank=17.0, killPoints=1050.0, enemiesKilled=2.0, killingStreaks=1.0, farthermostKill=65.32, numShips=28.0, numSaves=1.0, horseRideDistance=591.3, horseRideKills=0.0, swimmingDistance=0.0, friendlyKills=0.0, castleTowerDestroys=0.0, onFootDistance=782.4, weaponsUsed=4.0, respectEarned=1458.0, bestSoldierPerc=0.8571, features=DenseVector([24.0, 0.0, 0.0, 5.0, 247.3, 2.0, 0.0, 4.0, 17.0, 1050.0, 2.0, 1.0, 65.32, 28.0, 1.0, 591.3, 0.0, 0.0, 0.0, 0.0, 782.4, 4.0, 1458.0])),
 Row(soldierId=1.0, shipId=440875.0, attackId=1.0, assists=1.0, greekFireItems=0.0, healthLost=37.65, knockedOutSoldiers=1.0, throatSlits=1.0, healingPotionsUsed=0.0, killRank=45.0, killPoints=1072.0, enemiesKilled=1.0, killingStreaks=1.0, farthermostKill=-13.55, numShips=23.0, numSaves=0.0, horseRideDistance=0.0, horseRideKills=0.0, swimmingDistance=0.0, f

In [104]:
final_data = output_data.select('features', 'bestSoldierPerc')

In [107]:
final_test_data = output_test_data.select('features')

In [11]:
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- bestSoldierPerc: double (nullable = true)



In [48]:
train,test,cv = final_data.randomSplit([0.4,0.3,0.3])

In [49]:
train.count()

1743453

In [108]:
#model = RandomForestRegressor(labelCol='bestSoldierPerc')
model = RandomForestRegressor(labelCol='bestSoldierPerc',numTrees=4, featureSubsetStrategy="all",impurity='variance', maxDepth=9, maxBins=64)

In [109]:
model=model.fit(final_data)

In [16]:
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.evaluation import RegressionEvaluator

In [110]:
predictions = model.transform(final_test_data)

In [111]:
predictions.printSchema()

root
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [100]:
evaluator = RegressionEvaluator(labelCol="bestSoldierPerc", predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)

In [101]:
print(mae)

0.07486700579683114


In [112]:
res = predictions.select('prediction')

In [113]:
res.write.csv('out.csv')

In [114]:
sid = input_test_data.select('soldierId')

In [115]:
sid.write.csv('sid.csv')

In [83]:
test.count()

1307262

In [117]:
import pandas as pd

In [119]:
df_test = pd.read_csv('./data/test.csv')

In [120]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,index,soldierId,shipId,attackId,assists,greekFireItems,healthLost,knockedOutSoldiers,throatSlits,...,numShips,numSaves,horseRideDistance,horseRideKills,swimmingDistance,friendlyKills,castleTowerDestroys,onFootDistance,weaponsUsed,respectEarned
0,0,0,47734.0,1659463.0,47734.0,0.0,0.0,100.0,1.0,0.0,...,45.0,0.0,0.0,0.0,0.0,1.0,0.0,421.5,7.0,1500.0
1,1,1,47735.0,1659508.0,47735.0,0.0,1.0,400.0,2.0,0.0,...,47.0,1.0,0.0,0.0,0.0,0.0,0.0,655.8,4.0,1526.0
2,2,2,47736.0,1659555.0,47736.0,0.0,0.0,0.0,0.0,0.0,...,28.0,0.0,0.0,0.0,0.0,0.0,0.0,74.58,1.0,1475.0
3,3,3,47737.0,1659621.0,47737.0,0.0,0.0,68.6,0.0,0.0,...,92.0,0.0,0.0,0.0,0.0,0.0,0.0,167.2,2.0,1464.0
4,4,4,47738.0,1659675.0,47738.0,0.0,1.0,370.5,3.0,0.0,...,26.0,0.0,0.0,0.0,0.0,0.0,0.0,146.7,3.0,1505.0


In [128]:
df_test['soldierId'] = df_test['soldierId'].astype(int)

In [129]:
df_test.dtypes

Unnamed: 0               int64
index                    int64
soldierId                int64
shipId                 float64
attackId               float64
assists                float64
greekFireItems         float64
healthLost             float64
knockedOutSoldiers     float64
throatSlits            float64
healingPotionsUsed     float64
killRank               float64
killPoints             float64
enemiesKilled          float64
killingStreaks         float64
farthermostKill        float64
numShips               float64
numSaves               float64
horseRideDistance      float64
horseRideKills         float64
swimmingDistance       float64
friendlyKills          float64
castleTowerDestroys    float64
onFootDistance         float64
weaponsUsed            float64
respectEarned          float64
dtype: object

In [131]:
df_test['soldierId'].to_csv('soid.csv', index=False)

  """Entry point for launching an IPython kernel.
