In [151]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, log

In [152]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('Scores')
    .getOrCreate()
)

# Load the player CSV: first row is the header, column types are inferred.
dataset = spark.read.csv(
    "Player_List_Cleaned_Data.csv",
    header=True,
    inferSchema=True,
)

dataset.printSchema()

root
 |-- Player: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Overall_Score: double (nullable = true)
 |-- Potential_Score: double (nullable = true)
 |-- Market_Value: double (nullable = true)
 |-- Weekly_Salary: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Preferred_Foot: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Player_Work_Rate: string (nullable = true)
 |-- Kit_Number: double (nullable = true)
 |-- Joined_Club: string (nullable = true)
 |-- Contract_Length: double (nullable = true)
 |-- Ball_Skills: double (nullable = true)
 |-- Defence: double (nullable = true)
 |-- Mental: double (nullable = true)
 |-- Passing: double (nullable = true)
 |-- Physical: double (nullable = true)
 |-- Shooting: double (nullable = true)
 |-- Goalkeeping: double (nullable = true)
 |-- Description_Player: string (nullable 

In [153]:
# Preview the raw frame (first 20 rows, long cell values truncated).
dataset.show(n=20, truncate=True)

+--------------------+--------------+-------------------+-------------+---------------+------------+-------------+------+------+----+--------------+--------------+----------------+----------+--------------+---------------+-----------+------------------+------------------+------------------+-----------------+--------+-----------+--------------------+
|              Player|       Country|               Club|Overall_Score|Potential_Score|Market_Value|Weekly_Salary|Height|Weight| Age|Preferred_Foot|      Position|Player_Work_Rate|Kit_Number|   Joined_Club|Contract_Length|Ball_Skills|           Defence|            Mental|           Passing|         Physical|Shooting|Goalkeeping|  Description_Player|
+--------------------+--------------+-------------------+-------------+---------------+------------+-------------+------+------+----+--------------+--------------+----------------+----------+--------------+---------------+-----------+------------------+------------------+------------------+-----

In [154]:
# Columns removed before modelling. The raw target `Market_Value` is among
# them because it is replaced below by its natural log, `Market_Value_Log`.
list_col = ['Market_Value', 'Player', 'Country', 'Club', 'Overall_Score',
            'Potential_Score', 'Weekly_Salary', 'Height', 'Weight',
            'Preferred_Foot', 'Position', 'Player_Work_Rate', 'Kit_Number',
            'Joined_Club', 'Contract_Length', 'Description_Player']

# Rows whose Market_Value equals this number are excluded from the model data.
# NOTE(review): presumably a placeholder/fill value in the cleaned CSV - confirm.
EXCLUDED_MARKET_VALUE = 1000

# This cell rebinds `dataset` to a frame that no longer has `Market_Value`,
# so running it a second time used to fail when resolving that column.
# The guard makes a re-run a no-op instead of an error.
if 'Market_Value' in dataset.columns:
    dataset = (
        dataset
        .filter(col('Market_Value') != EXCLUDED_MARKET_VALUE)
        # Single-argument `log` is the natural logarithm.
        .withColumn('Market_Value_Log', log(col('Market_Value')))
        .drop(*list_col)
    )

dataset.show(20)

+----+-----------+------------------+------------------+------------------+-----------------+--------+-----------+------------------+
| Age|Ball_Skills|           Defence|            Mental|           Passing|         Physical|Shooting|Goalkeeping|  Market_Value_Log|
+----+-----------+------------------+------------------+------------------+-----------------+--------+-----------+------------------+
|34.0|       96.0| 26.33333333333333|              77.0|              89.0|80.85714285714286|  86.875|       10.8|18.172219384653864|
|33.0|       86.5|              32.0| 81.16666666666667| 75.33333333333333|80.28571428571429|    88.0|       10.2| 18.59882692933584|
|23.0|       92.0| 30.66666666666667| 75.83333333333333|              78.0|87.42857142857143|    80.5|        8.4|19.083368717027604|
|28.0|       21.0|              19.0|              47.5|              32.0|59.42857142857143|    18.5|       87.4|18.534009429259367|
|30.0|       89.0|              62.0| 83.83333333333333| 93.33

In [155]:
# Summary statistics (count, mean, stddev, ...) for the remaining columns.
dataset.describe().show(n=20, truncate=True)

+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+
|summary|               Age|       Ball_Skills|           Defence|            Mental|          Passing|          Physical|          Shooting|      Goalkeeping|  Market_Value_Log|
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+
|  count|             18489|             18489|             18489|             18489|            18489|             18489|             18489|            18489|             18489|
|   mean|25.634214938612146|57.333576721293745| 47.21771503777027| 54.67234571907625|54.16207835289443| 64.44685256870905| 48.01286575801828|16.28146465465954|13.897814814212955|
| stddev| 4.729868608623024|17.384386468331954|20.359235083089224|11.518316179421046|14.70088090841095|10

In [156]:
# Pack the eight predictor columns into a single vector column, `features`.
assembler = VectorAssembler(
    inputCols=[
        'Age', 'Ball_Skills', 'Defence', 'Mental',
        'Passing', 'Physical', 'Shooting', 'Goalkeeping',
    ],
    outputCol='features',
)

# `transform` returns the input frame with the `features` column appended.
data_set = assembler.transform(dataset)
data_set.select('features', 'Market_Value_Log').show(n=20)

+--------------------+------------------+
|            features|  Market_Value_Log|
+--------------------+------------------+
|[34.0,96.0,26.333...|18.172219384653864|
|[33.0,86.5,32.0,8...| 18.59882692933584|
|[23.0,92.0,30.666...|19.083368717027604|
|[28.0,21.0,19.0,4...|18.534009429259367|
|[30.0,89.0,62.0,8...| 18.64781631653611|
|[29.0,95.0,32.0,7...|18.675322962325946|
|[36.0,88.0,26.666...|17.622173047734595|
|[28.0,84.0,41.333...|18.679191439103867|
|[22.0,29.0,16.666...| 18.59882692933584|
|[29.0,28.5,16.666...|18.222229805228526|
|[26.0,84.0,83.0,8...|18.497641785088494|
|[28.0,31.5,17.333...|18.358805340234277|
|[29.0,89.0,40.666...|18.430631074805532|
|[30.0,73.0,90.0,7...|18.269857854217783|
|[29.0,74.0,86.666...| 18.29284737244248|
|[29.0,18.0,18.0,4...| 18.26402693390699|
|[24.0,69.0,88.0,7...|18.529535148864447|
|[21.0,81.5,39.0,7...| 18.78184559316395|
|[27.0,84.0,69.0,8...| 18.49300140553199|
|[27.0,74.0,89.333...|18.455082170669698|
+--------------------+------------

In [157]:
# 80/20 train/test split, seeded with 100.
train_data, test_data = data_set.randomSplit(weights=[0.8, 0.2], seed=100)

In [158]:
# Preview the training split (first 20 rows, long cell values truncated).
train_data.show(n=20, truncate=True)

+----+-----------+------------------+-----------------+-----------------+-----------------+--------+-----------+------------------+--------------------+
| Age|Ball_Skills|           Defence|           Mental|          Passing|         Physical|Shooting|Goalkeeping|  Market_Value_Log|            features|
+----+-----------+------------------+-----------------+-----------------+-----------------+--------+-----------+------------------+--------------------+
|17.0|       55.5| 65.66666666666667|56.33333333333334|55.33333333333334|59.57142857142857|   40.75|        8.6|14.151982794585487|[17.0,55.5,65.666...|
|17.0|       62.0|              17.0|50.16666666666666|44.33333333333334|65.42857142857143|    57.5|        9.2|14.403297222866392|[17.0,62.0,17.0,5...|
|17.0|       65.5|              35.0|56.33333333333334|57.33333333333334|66.28571428571429|  55.875|       11.8|14.403297222866392|[17.0,65.5,35.0,5...|
|17.0|       70.0| 62.33333333333334|63.66666666666666|             64.0|         

In [159]:
# Preview the test split (first 20 rows, long cell values truncated).
test_data.show(n=20, truncate=True)

+----+-----------+------------------+-----------------+-----------------+-----------------+--------+-----------+------------------+--------------------+
| Age|Ball_Skills|           Defence|           Mental|          Passing|         Physical|Shooting|Goalkeeping|  Market_Value_Log|            features|
+----+-----------+------------------+-----------------+-----------------+-----------------+--------+-----------+------------------+--------------------+
|17.0|       69.5| 63.66666666666666|             60.5|             64.0|             60.0|    43.0|        9.2|14.690979295318174|[17.0,69.5,63.666...|
|18.0|       56.0| 64.66666666666667|56.16666666666666|49.66666666666666|67.42857142857143|    36.0|        9.0|14.346138809026444|[18.0,56.0,64.666...|
|18.0|       58.5| 63.33333333333334|55.33333333333334|64.66666666666667|63.14285714285714|  47.375|       12.0|14.457364444136669|[18.0,58.5,63.333...|
|18.0|       63.5|              31.0|             50.5|58.66666666666666|68.428571

In [160]:
# Linear regression of `Market_Value_Log` on the assembled `features` vector.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Market_Value_Log',
)
lrModel = lr.fit(train_data)

# Coefficients are listed in the order of the assembler's input columns.
print(f"Coefficients: {lrModel.coefficients}")
print(f"Intercept: {lrModel.intercept}")

Coefficients: [-0.10025385910045398,0.038349585478780535,0.008372552390026681,0.07573511017891024,-0.008129940334191132,0.016596978949259295,0.012000174167611363,0.07388013015310238]
Intercept: 7.327116958831785


In [161]:
# Evaluate on the held-out split. The label is `Market_Value_Log`, so the
# error metrics below are in log units, not in the original value units.
test_stats = lrModel.evaluate(test_data)
print(
    f"RMSE: {test_stats.rootMeanSquaredError}",
    f"R2_Score: {test_stats.r2}",
    f"meanSquaredError: {test_stats.meanSquaredError}",
    sep="\n",
)

RMSE: 0.7546412788560665
R2_Score: 0.6373100542484016
meanSquaredError: 0.5694834597535194


In [162]:
# Score the test split and compare the actual label with the model output.
predict_data = lrModel.transform(test_data)
selected = predict_data.select(['Market_Value_Log', 'prediction'])
selected.show(n=20, truncate=True)

+------------------+------------------+
|  Market_Value_Log|        prediction|
+------------------+------------------+
|14.690979295318174|15.074331455071572|
|14.346138809026444|14.277589010164903|
|14.457364444136669|14.464250335679875|
|14.346138809026444|13.820166508931486|
| 14.28551418721001|14.110008152856874|
| 14.22097566607244|13.888160631709217|
|14.403297222866392|14.058422776491707|
| 14.22097566607244| 14.20971123447475|
| 15.03928598958639|15.628530684892795|
|14.346138809026444|13.702111591108284|
| 14.22097566607244|14.799122511174108|
|15.068273526459642|15.237647454885469|
|14.690979295318174|14.743242983965878|
| 14.22097566607244|14.440329968381949|
|15.123843377614453|15.187223824831683|
|15.761420707019587|15.310283399581898|
|14.557447902693651| 14.28671533058127|
|14.457364444136669| 13.67088163309209|
|14.648419680899378| 13.86030608758782|
|15.226497531674536|14.839648255288921|
+------------------+------------------+
only showing top 20 rows

