# Benchmark Model

Our initial benchmark model is a logistic regression. 

In [6]:
file_path = '/home/zrc3hc/Chess/2. Models/combined_saved_games.csv'

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("benchmarkmodel").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = spark.read.csv(file_path, header = True, inferSchema = True)


spark = SparkSession.builder.appName("benchmarkmodel").getOrCreate()

df = spark.read.csv(file_path, header = True, inferSchema = True)



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 09:32:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/01 09:32:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [11]:
df.show(2)

+----+-------+---------+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|Move|game_id|next_move|result| a1| b1| c1| d1| e1| f1| g1| h1| a2| b2| c2| d2| e2| f2| g2| h2| a3| b3| c3| d3| e3| f3| g3| h3| a4| b4| c4| d4| e4| f4| g4| h4| a5| b5| c5| d5| e5| f5| g5| h5| a6| b6| c6| d6| e6| f6| g6| h6| a7| b7| c7| d7| e7| f7| g7| h7| a8| b8| c8| d8| e8| f8| g8| h8|
+----+-------+---------+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|  83|  15380|     h7g8|     1|  0|  0|  0|  0|  0|  0|  0|  5| -5|  0|  0|  0|  0|  1|  0| 10|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0| 

In [10]:
df.count()

10018



# **Chess Piece Values**

| **Chess Piece**     | **Value** |
|----------------------|-----------|
| White Rook 1         | `5`       |
| White Rook 2         | `5`       |
| White Knight 1       | `3`       |
| White Knight 2       | `3`       |
| White Bishop 1       | `3`       |
| White Bishop 2       | `3`       |
| White Queen          | `9`       |
| White King           | `10`      |
| White Pawn 1–8       | `1`       |

**Note:** Black pieces have the same values as white pieces but are negative. 

**Note:** If the result is labeled 1, that means white won. If black won, result is labeled 0.


In [21]:
## Creating training/validation split

training_data, validation_data = df.randomSplit([0.9, 0.1], seed=42)

#specifying feature space

board_spots = [col for col in df.columns if col not in ['Move', 'game_id', 'next_move', 'result']]

vector_assembler = VectorAssembler(inputCols=board_spots, outputCol="features")
training_data = vector_assembler.transform(training_data)
validation_data = vector_assembler.transform(validation_data)

In [22]:
# Basic Logistic Model

logistic_regression = LogisticRegression(featuresCol="features", labelCol="result", maxIter=1000)
lr_model = logistic_regression.fit(training_data)
predictions = lr_model.transform(validation_data)
predictions.select("features", "result", "prediction", "probability").show(5)


In [24]:
evaluator = BinaryClassificationEvaluator(labelCol="result", rawPredictionCol="prediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"Area Under ROC Curve (AUC): {auc}")

Area Under ROC Curve (AUC): 0.7084113980627702


In [None]:
# Random Forest

# Set up the Random Forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=10, seed=42)

# Train the model
model = rf.fit(train)

# Make predictions
predictions = model.transform(test)
predictions.select("features", "result", "prediction", "probability").show(5)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

In [None]:
# Champion Model, Gradient Boosting Machine

GBT = GBTClassifier(featuresCol="features", labelCol="result", maxIter=200)
GBT_model = GBT.fit(training_data)

# Predictions on Validation Set

predictions_GBT = GBT_model.transform(validation_data)
predictions_GBT.select("features", "result", "prediction", "probability").show(5)



In [None]:
# Evaluation of Winning Model

evaluator = BinaryClassificationEvaluator(labelCol="result", rawPredictionCol="prediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions_GBT)
print(f"Area Under ROC Curve (AUC): {auc}")

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="result", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_GBT)
print(f"Accuracy: {accuracy}")