In [1]:
# Load the "pugs" table as a DataFrame. `spark` is not defined in this notebook;
# presumably it is the SparkSession injected by the notebook environment.
dataset = spark.table("pugs")
# Names of all columns in the raw table (not read by any cell visible here).
cols = dataset.columns

In [2]:
# Show the raw table. display() is not defined in this notebook; presumably it is
# the rich-rendering helper supplied by the notebook environment.
display(dataset)

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Numeric input columns that are packed into the single "features" vector.
numericCols = [
    "solo_KillDeathRatio",
    "solo_TimeSurvived",
    "solo_RoundsPlayed",
    "solo_WinTop10Ratio",
    "solo_Top10s",
    "solo_Top10Ratio",
    "solo_Rating",
    "solo_BestRating",
    "solo_DamagePg",
    "solo_HeadshotKillsPg",
    "solo_KillsPg",
    "solo_RoadKillsPg",
    "solo_TeamKillsPg",
    "solo_TimeSurvivedPg",
    "solo_Top10sPg",
    "solo_Kills",
    "solo_Assists",
    "solo_HeadshotKills",
    "solo_HeadshotKillRatio",
    "solo_VehicleDestroys",
    "solo_RoadKills",
    "solo_DailyKills",
    "solo_WeeklyKills",
    "solo_RoundMostKills",
    "solo_LongestTimeSurvived",
    "solo_RideDistance",
    "solo_LongestKill",
    "solo_Heals",
    "solo_Revives",
    "solo_Boosts",
    "solo_DamageDealt",
    "solo_DBNOs",
]

# The only pipeline stage: assemble the numeric columns into one vector column.
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
stages = [assembler]  # stages in our Pipeline

In [4]:
# Build the pipeline from the stages prepared above and fit it on the loaded table.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)

# Keep only the label, the assembled feature vector and the raw numeric inputs.
selectedcols = ["solo_Wins", "features"]
selectedcols.extend(numericCols)

# NOTE(review): `dataset` is re-bound to the transformed/selected frame, so this cell
# is not idempotent -- re-running it feeds the already-transformed frame back in.
dataset = pipelineModel.transform(dataset).select(selectedcols)
display(dataset)

In [5]:
# Show `dataset` again; the previous cell already ends by displaying this same frame.
display(dataset)

In [6]:
# Randomly split the prepared frame into training and test sets with weights
# 0.9 / 0.1; the fixed seed keeps the split reproducible across runs.
(trainingData, testData) = dataset.randomSplit([0.9, 0.1], seed=100)

# Report the row count of each split. print(...) with a single argument behaves the
# same on Python 2 and Python 3, unlike the Python 2-only print statement.
print(trainingData.count())
print(testData.count())

In [7]:
from pyspark.ml.classification import LogisticRegression

# Configure the logistic regression: "solo_Wins" is the label, the assembled
# "features" vector is the input, and the optimizer is capped at 10 iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="solo_Wins",
    maxIter=10,
)

# Fit on the training split only; the test split is kept for scoring.
lrModel = lr.fit(trainingData)

In [8]:
# Score the held-out test split with the fitted model bound to `lrModel`.
predictions = lrModel.transform(testData)

In [9]:
# Print the schema of the scored frame to inspect the columns available downstream.
predictions.printSchema()

In [10]:
# Put the training label next to the model's predicted value for a quick visual check.
# (An earlier, disabled variant of this cell also selected solo_Suicides,
# solo_TimeSurvived, solo_WinRatio, solo_DamagePg and rawPrediction.)
comparisonCols = ["solo_Wins", "prediction"]
selected = predictions.select(comparisonCols)
display(selected)

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the logistic-regression predictions against the label the model was
# trained on. The label must be "solo_Wins": it is the labelCol used when fitting,
# and "solo_Suicides" is not among the columns kept by the earlier select, so it is
# absent from `predictions`.
# NOTE(review): BinaryClassificationEvaluator expects a two-class (0/1) label --
# confirm solo_Wins is binary; otherwise a multiclass evaluator is needed.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="solo_Wins",
)
evaluator.evaluate(predictions)

In [12]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure a depth-3 decision tree on the same label and feature vector as the
# logistic regression. "solo_Wins" is the only label column present in the training
# split; "solo_Suicides" was dropped by the earlier column selection.
# NOTE(review): assumes solo_Wins holds non-negative integer class indices -- confirm.
dt = DecisionTreeClassifier(labelCol="solo_Wins", featuresCol="features", maxDepth=3)

# Train on the training split. The result is bound to `dtModel` so the fitted
# logistic-regression model in `lrModel` is not overwritten by a tree model.
dtModel = dt.fit(trainingData)

In [13]:
# The following cells train decision tree classifiers.

In [14]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)

# Train model with Training Data
dtModel = dt.fit(trainingData)

In [15]:
# Report the size of the fitted tree. A single pre-formatted string passed to
# print(...) runs on both Python 2 and Python 3; the two spaces after "=" reproduce
# the output of the original comma-separated print statement exactly.
print("numNodes =  {}".format(dtModel.numNodes))
print("depth =  {}".format(dtModel.depth))

In [16]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="solo_Suicides", featuresCol="features", maxDepth=3)

# Train model with Training Data
lrModel = dt.fit(trainingData)