In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j
!pip install langchain-huggingface

import os
import sys

import findspark
findspark.init()
findspark.find()

from pyspark import SparkContext

from pyspark.sql import SparkSession, DataFrame

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import functions as F
from pyspark.sql import types as T

from pyspark.ml.feature import StringIndexer, Imputer
from pyspark.ml.feature import VectorAssembler, ChiSqSelector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,619 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,506 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy

In [2]:
spark = SparkSession.builder \
               .appName('Proyecto_final') \
               .getOrCreate()

In [3]:
spark

In [4]:
#Se especifica la ubicación del archivo donde se tienen los datos
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
#Se abre el archivo
%cd /content/drive/My Drive/
#df = pd.read_csv('events.csv')
df = spark.read.csv(path = '/content/drive/My Drive/events.csv', header = True)

/content/drive/My Drive


In [None]:
#Se imprime el schema de los datos
df.printSchema()

root
 |-- id_odsp: string (nullable = true)
 |-- id_event: string (nullable = true)
 |-- sort_order: string (nullable = true)
 |-- time: string (nullable = true)
 |-- text: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_type2: string (nullable = true)
 |-- side: string (nullable = true)
 |-- event_team: string (nullable = true)
 |-- opponent: string (nullable = true)
 |-- player: string (nullable = true)
 |-- player2: string (nullable = true)
 |-- player_in: string (nullable = true)
 |-- player_out: string (nullable = true)
 |-- shot_place: string (nullable = true)
 |-- shot_outcome: string (nullable = true)
 |-- is_goal: string (nullable = true)
 |-- location: string (nullable = true)
 |-- bodypart: string (nullable = true)
 |-- assist_method: string (nullable = true)
 |-- situation: string (nullable = true)
 |-- fast_break: string (nullable = true)



In [None]:
#Se guardan los nombres de las columnas en una variable
columns = df.columns
columns

['id_odsp',
 'id_event',
 'sort_order',
 'time',
 'text',
 'event_type',
 'event_type2',
 'side',
 'event_team',
 'opponent',
 'player',
 'player2',
 'player_in',
 'player_out',
 'shot_place',
 'shot_outcome',
 'is_goal',
 'location',
 'bodypart',
 'assist_method',
 'situation',
 'fast_break']

In [None]:
#Se hace una variable con las columnas que son de tipo string para quitarlas de la variable de las columnas
string_columns = ['id_odsp', 'id_event', 'text', 'event_team', 'opponent', 'player', 'player2', 'player_in', 'player_out']
for column in string_columns:
  columns.remove(column)

In [None]:
#Se cambian las variables que son numericas a tipo int
for column in columns:
  df = df.withColumn(column, F.col(column).cast(T.IntegerType()))

In [None]:
#Se imprime el schema con los nuevos tipos de variables.
df.printSchema()

root
 |-- id_odsp: string (nullable = true)
 |-- id_event: string (nullable = true)
 |-- sort_order: integer (nullable = true)
 |-- time: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- event_type: integer (nullable = true)
 |-- event_type2: integer (nullable = true)
 |-- side: integer (nullable = true)
 |-- event_team: string (nullable = true)
 |-- opponent: string (nullable = true)
 |-- player: string (nullable = true)
 |-- player2: string (nullable = true)
 |-- player_in: string (nullable = true)
 |-- player_out: string (nullable = true)
 |-- shot_place: integer (nullable = true)
 |-- shot_outcome: integer (nullable = true)
 |-- is_goal: integer (nullable = true)
 |-- location: integer (nullable = true)
 |-- bodypart: integer (nullable = true)
 |-- assist_method: integer (nullable = true)
 |-- situation: integer (nullable = true)
 |-- fast_break: integer (nullable = true)



In [None]:
df.count()

941009

In [None]:
#Se dejan solo aquellos eventos que pertenecen a un gol

df_ml = df.filter(df.is_goal == 1).select('is_goal', 'time', 'event_team', 'opponent')

In [None]:
df_ml.show()

+-------+----+--------------------+--------------------+
|is_goal|time|          event_team|            opponent|
+-------+----+--------------------+--------------------+
|      1|  17|   Borussia Dortmund|          Hamburg SV|
|      1|  29|   Borussia Dortmund|          Hamburg SV|
|      1|  49|   Borussia Dortmund|          Hamburg SV|
|      1|  80|          Hamburg SV|   Borussia Dortmund|
|      1|  48|         SC Freiburg|         FC Augsburg|
|      1|  53|         FC Augsburg|         SC Freiburg|
|      1|  55|         SC Freiburg|         FC Augsburg|
|      1|  81|         FC Augsburg|         SC Freiburg|
|      1|  60|       Werder Bremen|      Kaiserslautern|
|      1|  81|       Werder Bremen|      Kaiserslautern|
|      1|  28|             Lorient| Paris Saint-Germain|
|      1|  34|                Caen|        Valenciennes|
|      1|  80|            Nurnberg|       Hertha Berlin|
|      1|  17|Evian Thonon Gail...|               Brest|
|      1|  20|Evian Thonon Gail

In [None]:
df_ml_time_sections = df_ml.withColumn('time', F.when((df_ml.time >= 0) & (df_ml.time <= 15), 1) \
                                       .when((df_ml.time > 15) & (df_ml.time <= 30), 2) \
                                       .when((df_ml.time > 30) & (df_ml.time <= 45), 3) \
                                       .when((df_ml.time > 45) & (df_ml.time <= 60), 4) \
                                       .when((df_ml.time > 60) & (df_ml.time <= 75), 5) \
                                       .when(df_ml.time > 75, 6)
                                       .otherwise(0))

In [None]:
df_ml_time_sections.show()

+-------+----+--------------------+--------------------+
|is_goal|time|          event_team|            opponent|
+-------+----+--------------------+--------------------+
|      1|   2|   Borussia Dortmund|          Hamburg SV|
|      1|   2|   Borussia Dortmund|          Hamburg SV|
|      1|   4|   Borussia Dortmund|          Hamburg SV|
|      1|   6|          Hamburg SV|   Borussia Dortmund|
|      1|   4|         SC Freiburg|         FC Augsburg|
|      1|   4|         FC Augsburg|         SC Freiburg|
|      1|   4|         SC Freiburg|         FC Augsburg|
|      1|   6|         FC Augsburg|         SC Freiburg|
|      1|   4|       Werder Bremen|      Kaiserslautern|
|      1|   6|       Werder Bremen|      Kaiserslautern|
|      1|   2|             Lorient| Paris Saint-Germain|
|      1|   3|                Caen|        Valenciennes|
|      1|   6|            Nurnberg|       Hertha Berlin|
|      1|   2|Evian Thonon Gail...|               Brest|
|      1|   2|Evian Thonon Gail

In [None]:
teams_scoring_times = df_ml_time_sections.groupby('event_team', 'time').count()
teams_scoring_times.show()

+--------------------+----+-----+
|          event_team|time|count|
+--------------------+----+-----+
|             Levante|   6|   42|
| Deportivo La Coruna|   1|   17|
|        Norwich City|   3|   10|
|       Werder Bremen|   3|   52|
|        Valenciennes|   3|   18|
|          Hamburg SV|   3|   24|
|Evian Thonon Gail...|   6|   39|
|         Aston Villa|   3|   21|
|      Kaiserslautern|   5|    6|
|       Hellas Verona|   4|   21|
|           Barcelona|   4|  106|
|            Guingamp|   5|   32|
|             Cardiff|   3|    2|
|             Sochaux|   5|   20|
|            Mallorca|   5|   14|
|            Juventus|   6|   78|
|       Real Zaragoza|   2|   15|
|              Torino|   2|   35|
|       Hertha Berlin|   5|   29|
|             Sochaux|   4|   22|
+--------------------+----+-----+
only showing top 20 rows



In [None]:
teams_scoring_times.printSchema()

root
 |-- event_team: string (nullable = true)
 |-- time: integer (nullable = false)
 |-- count: long (nullable = false)



In [None]:
teams_scoring_times = teams_scoring_times.withColumn('count', F.col('count').cast(T.IntegerType()))

In [None]:
teams_scoring_times.printSchema()

root
 |-- event_team: string (nullable = true)
 |-- time: integer (nullable = false)
 |-- count: integer (nullable = false)



In [None]:
teams_scoring_times = teams_scoring_times.withColumn('count', F.ceil(F.col('count')/6))
teams_scoring_times.show()

+--------------------+----+-----+
|          event_team|time|count|
+--------------------+----+-----+
|             Levante|   6|    7|
| Deportivo La Coruna|   1|    3|
|        Norwich City|   3|    2|
|       Werder Bremen|   3|    9|
|        Valenciennes|   3|    3|
|          Hamburg SV|   3|    4|
|Evian Thonon Gail...|   6|    7|
|         Aston Villa|   3|    4|
|      Kaiserslautern|   5|    1|
|       Hellas Verona|   4|    4|
|           Barcelona|   4|   18|
|            Guingamp|   5|    6|
|             Cardiff|   3|    1|
|             Sochaux|   5|    4|
|            Mallorca|   5|    3|
|            Juventus|   6|   13|
|       Real Zaragoza|   2|    3|
|              Torino|   2|    6|
|       Hertha Berlin|   5|    5|
|             Sochaux|   4|    4|
+--------------------+----+-----+
only showing top 20 rows



In [None]:
teams_conceding_times = df_ml_time_sections.groupby('opponent', 'time').count()
teams_conceding_times.show()

+--------------------+----+-----+
|            opponent|time|count|
+--------------------+----+-----+
|             Levante|   6|   71|
| Deportivo La Coruna|   1|   22|
|        Norwich City|   3|   19|
|        Valenciennes|   3|   28|
|          Hamburg SV|   3|   49|
|       Werder Bremen|   3|   55|
|Evian Thonon Gail...|   6|   49|
|         Aston Villa|   3|   31|
|      Kaiserslautern|   5|   13|
|       Hellas Verona|   4|   33|
|           Barcelona|   4|   35|
|             Cardiff|   3|   10|
|            Guingamp|   5|   27|
|             Sochaux|   5|   33|
|       Real Zaragoza|   2|   23|
|            Juventus|   6|   27|
|            Mallorca|   5|   21|
|              Torino|   2|   28|
|       Hertha Berlin|   5|   33|
|               Mainz|   3|   38|
+--------------------+----+-----+
only showing top 20 rows



In [None]:
teams_conceding_times.printSchema()

root
 |-- opponent: string (nullable = true)
 |-- time: integer (nullable = false)
 |-- count: long (nullable = false)



In [None]:
teams_conceding_times = teams_conceding_times.withColumn('count', F.col('count').cast(T.IntegerType()))
teams_conceding_times.printSchema()

root
 |-- opponent: string (nullable = true)
 |-- time: integer (nullable = false)
 |-- count: integer (nullable = false)



In [None]:
teams_conceding_times = teams_conceding_times.withColumn('count', F.ceil(F.col('count')/6))
teams_conceding_times.show()

+--------------------+----+-----+
|            opponent|time|count|
+--------------------+----+-----+
|             Levante|   6|   12|
| Deportivo La Coruna|   1|    4|
|        Norwich City|   3|    4|
|        Valenciennes|   3|    5|
|          Hamburg SV|   3|    9|
|       Werder Bremen|   3|   10|
|Evian Thonon Gail...|   6|    9|
|         Aston Villa|   3|    6|
|      Kaiserslautern|   5|    3|
|       Hellas Verona|   4|    6|
|           Barcelona|   4|    6|
|             Cardiff|   3|    2|
|            Guingamp|   5|    5|
|             Sochaux|   5|    6|
|       Real Zaragoza|   2|    4|
|            Juventus|   6|    5|
|            Mallorca|   5|    4|
|              Torino|   2|    5|
|       Hertha Berlin|   5|    6|
|               Mainz|   3|    7|
+--------------------+----+-----+
only showing top 20 rows



In [None]:
#Alaves se mostraba como nulo para goles concedidos en los primero 15 minutos por lo que se agrega al DF con un total de 0.

Alaves = spark.createDataFrame([('Alaves', 1, 0)], ['opponent', 'time', 'count'])
Alaves.show()

+--------+----+-----+
|opponent|time|count|
+--------+----+-----+
|  Alaves|   1|    0|
+--------+----+-----+



In [None]:
teams_conceding_times = teams_conceding_times.union(Alaves)

In [None]:
teams_scoring_1 = teams_scoring_times.filter(teams_scoring_times.time == 1).select('event_team', 'count').withColumnRenamed("count", "scored_0to15")
teams_scoring_2 = teams_scoring_times.filter(teams_scoring_times.time == 2).select('event_team', 'count').withColumnRenamed("count", "scored_15to30").withColumnRenamed("event_team", "team")
teams_scoring_3 = teams_scoring_times.filter(teams_scoring_times.time == 3).select('event_team', 'count').withColumnRenamed("count", "scored_30to45").withColumnRenamed("event_team", "team")
teams_scoring_4 = teams_scoring_times.filter(teams_scoring_times.time == 4).select('event_team', 'count').withColumnRenamed("count", "scored_45to60").withColumnRenamed("event_team", "team")
teams_scoring_5 = teams_scoring_times.filter(teams_scoring_times.time == 5).select('event_team', 'count').withColumnRenamed("count", "scored_60to75").withColumnRenamed("event_team", "team")
teams_scoring_6 = teams_scoring_times.filter(teams_scoring_times.time == 6).select('event_team', 'count').withColumnRenamed("count", "scored_75to90").withColumnRenamed("event_team", "team")
teams_scoring_6.show()

+--------------------+-------------+
|                team|scored_75to90|
+--------------------+-------------+
|             Levante|            7|
|Evian Thonon Gail...|            7|
|            Juventus|           13|
|      Sporting Gijon|            4|
|        Stade Rennes|           10|
|                Nice|            9|
|               Siena|            3|
|              Nantes|            5|
|            Toulouse|           10|
|    Bayer Leverkusen|           11|
|               Elche|            3|
|            Nurnberg|            4|
|             AS Roma|           14|
|       Bayern Munich|           16|
|            Atalanta|            9|
|           Dijon FCO|            3|
|          Stoke City|            6|
|            Mallorca|            3|
|             Livorno|            1|
|             Almeria|            3|
+--------------------+-------------+
only showing top 20 rows



In [None]:
scoring_times = teams_scoring_1
df_to_join = [teams_scoring_2, teams_scoring_3, teams_scoring_4, teams_scoring_5, teams_scoring_6]

for DF in df_to_join:
  scoring_times = scoring_times.join(DF, scoring_times.event_team == DF.team, 'outer').drop('team')
scoring_times.show()

+-----------------+------------+-------------+-------------+-------------+-------------+-------------+
|       event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|
+-----------------+------------+-------------+-------------+-------------+-------------+-------------+
|       AC Ajaccio|           4|            2|            4|            2|            3|            6|
|         AC Milan|           7|            8|            9|           11|            9|           13|
|       AJ Auxerre|           1|            1|            2|            1|            2|            3|
|        AS Monaco|           6|            6|            7|            5|            8|           10|
|AS Nancy Lorraine|           3|            2|            3|            2|            3|            4|
|          AS Roma|          10|           10|           10|           10|           11|           14|
|           Alaves|           1|            1|            1|            1

In [None]:
teams_conceding_1 = teams_conceding_times.filter(teams_conceding_times.time == 1).select('opponent', 'count').withColumnRenamed("count", "conceded_0to15")
teams_conceding_2 = teams_conceding_times.filter(teams_conceding_times.time == 2).select('opponent', 'count').withColumnRenamed("count", "conceded_15to30").withColumnRenamed("opponent", "team")
teams_conceding_3 = teams_conceding_times.filter(teams_conceding_times.time == 3).select('opponent', 'count').withColumnRenamed("count", "conceded_30to45").withColumnRenamed("opponent", "team")
teams_conceding_4 = teams_conceding_times.filter(teams_conceding_times.time == 4).select('opponent', 'count').withColumnRenamed("count", "conceded_45to60").withColumnRenamed("opponent", "team")
teams_conceding_5 = teams_conceding_times.filter(teams_conceding_times.time == 5).select('opponent', 'count').withColumnRenamed("count", "conceded_60to75").withColumnRenamed("opponent", "team")
teams_conceding_6 = teams_conceding_times.filter(teams_conceding_times.time == 6).select('opponent', 'count').withColumnRenamed("count", "conceded_75to90").withColumnRenamed("opponent", "team")
teams_conceding_1.show()

+-------------------+--------------+
|           opponent|conceded_0to15|
+-------------------+--------------+
|Deportivo La Coruna|             4|
|            Udinese|             6|
|         AC Ajaccio|             4|
|     Internazionale|             7|
|           Atalanta|             5|
|        FC Augsburg|             6|
|              Lazio|             6|
|              Parma|             5|
|          AS Monaco|             3|
|               Nice|             6|
|           Valencia|             6|
|               Caen|             4|
|            Catania|             3|
|              Lecce|             2|
|           Cagliari|             7|
|         Hamburg SV|             5|
|           Nurnberg|             3|
|  Borussia Dortmund|             5|
|            Sochaux|             4|
|     TSG Hoffenheim|             7|
+-------------------+--------------+
only showing top 20 rows



In [None]:
conceding_times = teams_conceding_1
df_to_join = [teams_conceding_2, teams_conceding_3, teams_conceding_4, teams_conceding_5, teams_conceding_6]

for DF in df_to_join:
  conceding_times = conceding_times.join(DF, conceding_times.opponent == DF.team, 'outer').drop('team')
conceding_times.show()

+-----------------+--------------+---------------+---------------+---------------+---------------+---------------+
|         opponent|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|
+-----------------+--------------+---------------+---------------+---------------+---------------+---------------+
|       AC Ajaccio|             4|              4|              6|              6|              6|              7|
|         AC Milan|             6|              7|              5|              7|              8|              8|
|       AJ Auxerre|             2|              2|              3|              1|              2|              3|
|        AS Monaco|             3|              4|              3|              3|              5|              6|
|AS Nancy Lorraine|             2|              4|              3|              4|              3|              6|
|          AS Roma|             5|              6|              6|              

In [None]:
goals = scoring_times.join(conceding_times, scoring_times.event_team == conceding_times.opponent, 'outer').drop('opponent')
goals.show()

+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+
|       event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|
+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+
|       AC Ajaccio|           4|            2|            4|            2|            3|            6|             4|              4|              6|              6|              6|              7|
|         AC Milan|           7|            8|            9|           11|            9|           13|             6|              7|              5|              7|              8|              8|
|       AJ

In [None]:
input_columns = goals.columns
input_columns.pop(0)
input_columns

['scored_0to15',
 'scored_15to30',
 'scored_30to45',
 'scored_45to60',
 'scored_60to75',
 'scored_75to90',
 'conceded_0to15',
 'conceded_15to30',
 'conceded_30to45',
 'conceded_45to60',
 'conceded_60to75',
 'conceded_75to90']

In [None]:
#Se genera la columna de features para los minutos con gole anotados
assembler = VectorAssembler(inputCols = input_columns, outputCol = "features")
goals_ml= assembler.transform(goals)
goals_ml.show()

+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+
|       event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|
+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+
|       AC Ajaccio|           4|            2|            4|            2|            3|            6|             4|              4|              6|              6|              6|              7|[4.0,2.0,4.0,2.0,...|
|         AC Milan|           7|            8|            9|           11|            9|           13|             6|       

In [None]:
#Se escalan las features
scaler = StandardScaler(inputCol= 'features', outputCol= 'features_scaled')
goals_ml = scaler.fit(goals_ml).transform(goals_ml)
goals_ml.show()

+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+
|       event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|
+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+
|       AC Ajaccio|           4|            2|            4|            2|            3|            6|             4|              4|              6|              6|              6|              7|[4.0,2.0,4.0,2.0,...|[1.62925428454702...|
|         AC Milan|           7|        

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans(featuresCol = 'features', k = 4)

model = kmeans.fit(goals_ml)

predictions = model.transform(goals_ml)


In [None]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[ 9.5 11.2 12.6 12.2 13.  16.2  4.5  4.7  5.5  5.3  5.5  6.9]
[ 5.52173913  6.52173913  7.32608696  7.39130435  7.52173913  9.82608696
  5.89130435  6.67391304  7.5         7.41304348  7.86956522 10.08695652]
[1.65957447 1.57446809 1.78723404 1.82978723 1.78723404 2.29787234
 2.0212766  2.44680851 2.85106383 2.72340426 2.76595745 3.40425532]
[3.71794872 4.23076923 4.8974359  4.61538462 5.15384615 6.41025641
 4.07692308 4.94871795 5.43589744 5.28205128 5.48717949 7.35897436]


In [None]:
predictions.show(20)

+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|       event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|       AC Ajaccio|           4|            2|            4|            2|            3|            6|             4|              4|              6|              6|              6|              7|[4.0,2.0,4.0,2.0,...|[1.62925428454702...|        

In [None]:
#Los mejores equipos, juegan a ganar
group_0 = predictions.filter(predictions.prediction == 0).select('event_team', 'prediction')
group_0.show(100)

+-------------------+----------+
|         event_team|prediction|
+-------------------+----------+
|            AS Roma|         0|
|    Atletico Madrid|         0|
|          Barcelona|         0|
|      Bayern Munich|         0|
|  Borussia Dortmund|         0|
|           Juventus|         0|
|               Lyon|         0|
|             Napoli|         0|
|Paris Saint-Germain|         0|
|        Real Madrid|         0|
+-------------------+----------+



In [None]:
#Equipos balanceados, pueden llegar a ganarle a los mejores
group_1 = predictions.filter(predictions.prediction == 1).select('event_team', 'prediction')
group_1.show(100)

+--------------------+----------+
|          event_team|prediction|
+--------------------+----------+
|            AC Milan|         1|
|            Atalanta|         1|
|     Athletic Bilbao|         1|
|    Bayer Leverkusen|         1|
|            Bordeaux|         1|
|Borussia Moncheng...|         1|
|            Cagliari|         1|
|          Celta Vigo|         1|
|       Chievo Verona|         1|
|            Espanyol|         1|
|         FC Augsburg|         1|
|          Fiorentina|         1|
|               Genoa|         1|
|              Getafe|         1|
|             Granada|         1|
|          Hamburg SV|         1|
|         Hannover 96|         1|
|      Internazionale|         1|
|               Lazio|         1|
|             Levante|         1|
|               Lille|         1|
|           Liverpool|         1|
|             Lorient|         1|
|               Mainz|         1|
|              Malaga|         1|
|     Manchester City|         1|
|           Ma

In [None]:
#Equipos que descienden
group_2 = predictions.filter(predictions.prediction == 2).select('event_team', 'prediction')
group_2.show(100)

+--------------------+----------+
|          event_team|prediction|
+--------------------+----------+
|          AJ Auxerre|         2|
|   AS Nancy Lorraine|         2|
|              Alaves|         2|
|             Almeria|         2|
|              Angers|         2|
|         Bournemouth|         2|
|               Brest|         2|
|             Burnley|         2|
|             Cardiff|         2|
|               Carpi|         2|
|              Cesena|         2|
|             Cordoba|         2|
|             Crotone|         2|
|           Dijon FCO|         2|
|               Eibar|         2|
|               Elche|         2|
|              Empoli|         2|
|    FC Ingolstadt 04|         2|
|  Fortuna Dusseldorf|         2|
|           Frosinone|         2|
|              Fulham|         2|
|         GFC Ajaccio|         2|
|                Hull|         2|
|      Kaiserslautern|         2|
|          Las Palmas|         2|
|               Lecce|         2|
|             

In [None]:
#Equipos debiles, por lo general juegan a no perder.
group_3 = predictions.filter(predictions.prediction == 3).select('event_team', 'prediction')
group_3.show(100)

+--------------------+----------+
|          event_team|prediction|
+--------------------+----------+
|          AC Ajaccio|         3|
|           AS Monaco|         3|
|             Arsenal|         3|
|         Aston Villa|         3|
|              Bastia|         3|
|             Bologna|         3|
|                Caen|         3|
|             Catania|         3|
|             Chelsea|         3|
|      Crystal Palace|         3|
| Deportivo La Coruna|         3|
| Eintracht Frankfurt|         3|
|             Everton|         3|
|Evian Thonon Gail...|         3|
|          FC Cologne|         3|
|            Guingamp|         3|
|       Hellas Verona|         3|
|       Hertha Berlin|         3|
|      Leicester City|         3|
|      Manchester Utd|         3|
|              Nantes|         3|
|           Newcastle|         3|
|            Nurnberg|         3|
|             Osasuna|         3|
|               Parma|         3|
|         SC Freiburg|         3|
|            S

In [None]:
model.save('/content/drive/My Drive/KM_PF')

#Pruebas para montar la aplicación en hugging face

In [None]:
from pyspark.ml.clustering import KMeansModel
model_loaded = KMeansModel.load('/content/drive/My Drive/KM_PF')

In [None]:
Test_model = model_loaded.transform(goals_ml)

In [None]:
Test_model.show()

+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|       event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|       AC Ajaccio|           4|            2|            4|            2|            3|            6|             4|              4|              6|              6|              6|              7|[4.0,2.0,4.0,2.0,...|[1.62925428454702...|        

In [None]:
Test_model.toPandas().to_csv('/content/drive/My Drive/DM_PF_clusters.csv', index=False)

In [6]:
clusters_csv = spark.read.csv(path = '/content/drive/My Drive/DM_PF_clusters.csv', header = True)
clusters_csv.show()

+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|       event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+-----------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|       AC Ajaccio|           4|            2|            4|            2|            3|            6|             4|              4|              6|              6|              6|              7|[4.0,2.0,4.0,2.0,...|[1.62925428454702...|        

In [None]:
Test_input = spark.createDataFrame([('Test_team', 8, 3, 12, 2, 5, 15, 3, 5, 3, 6, 2, 4)], ['event_team', 'scored_0to15', 'scored_15to30', 'scored_30to45', 'scored_45to60', 'scored_60to75', 'scored_75to90', 'conceded_0to15', 'conceded_15to30', 'conceded_30to45', 'conceded_45to60', 'conceded_60to75', 'conceded_75to90'])
Test_input.show()

+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+
|event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|
+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+
| Test_team|           8|            3|           12|            2|            5|           15|             3|              5|              3|              6|              2|              4|
+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+



In [None]:
input_columns = Test_input.columns
input_columns.pop(0)
input_columns
assembler = VectorAssembler(inputCols = input_columns, outputCol = "features")
Test_input_pred= assembler.transform(Test_input)
Test_input_pred.show()

+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+
|event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|
+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+
| Test_team|           8|            3|           12|            2|            5|           15|             3|              5|              3|              6|              2|              4|[8.0,3.0,12.0,2.0...|
+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+----------

In [None]:
Test_input_pred.printSchema()

root
 |-- event_team: string (nullable = true)
 |-- scored_0to15: long (nullable = true)
 |-- scored_15to30: long (nullable = true)
 |-- scored_30to45: long (nullable = true)
 |-- scored_45to60: long (nullable = true)
 |-- scored_60to75: long (nullable = true)
 |-- scored_75to90: long (nullable = true)
 |-- conceded_0to15: long (nullable = true)
 |-- conceded_15to30: long (nullable = true)
 |-- conceded_30to45: long (nullable = true)
 |-- conceded_45to60: long (nullable = true)
 |-- conceded_60to75: long (nullable = true)
 |-- conceded_75to90: long (nullable = true)
 |-- features: vector (nullable = true)



In [None]:
Test_model_2 = model_loaded.transform(Test_input_pred)

In [None]:
Test_model_2.show()

+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+----------+
|event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|prediction|
+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+----------+
| Test_team|           8|            3|           12|            2|            5|           15|             3|              5|              3|              6|              2|              4|[8.0,3.0,12.0,2.0...|         3|
+----------+------------+-------------+-------------+-------------+-------------+-------------+-------------

In [None]:
cluster_0 = clusters_csv.filter(clusters_csv.prediction == 0)
cluster_0.show()

+-------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|         event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+-------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|            AS Roma|          10|           10|           10|           10|           11|           14|             5|              6|              6|              7|              5|             10|[10.0,10.0,10.0,1...|[4.07313571136755...|

In [None]:
cluster_0_means = cluster_0.select(F.mean(cluster_0.scored_0to15), F.mean(cluster_0.scored_15to30), F.mean(cluster_0.scored_30to45), F.mean(cluster_0.scored_45to60), F.mean(cluster_0.scored_60to75), F.mean(cluster_0.scored_75to90), F.mean(cluster_0.conceded_0to15), F.mean(cluster_0.conceded_15to30), F.mean(cluster_0.conceded_30to45), F.mean(cluster_0.conceded_45to60), F.mean(cluster_0.conceded_60to75), F.mean(cluster_0.conceded_75to90))
cluster_0_means.show()

+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|avg(scored_0to15)|avg(scored_15to30)|avg(scored_30to45)|avg(scored_45to60)|avg(scored_60to75)|avg(scored_75to90)|avg(conceded_0to15)|avg(conceded_15to30)|avg(conceded_30to45)|avg(conceded_45to60)|avg(conceded_60to75)|avg(conceded_75to90)|
+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              9.5|              11.2|              12.6|              12.2|              13.0|              16.2|                4.5|                 4.7|                 5.5|                 5.3|                 5.5|                 6.9|
+-----------------+------------------+--

In [None]:
cluster_1 = clusters_csv.filter(clusters_csv.prediction == 1)
cluster_1.show()

+--------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|          event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+--------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|            AC Milan|           7|            8|            9|           11|            9|           13|             6|              7|              5|              7|              8|              8|[7.0,8.0,9.0,11.0...|[2.85119499795728

In [None]:
cluster_1_means = cluster_1.select(F.mean(cluster_1.scored_0to15), F.mean(cluster_1.scored_15to30), F.mean(cluster_1.scored_30to45), F.mean(cluster_1.scored_45to60), F.mean(cluster_1.scored_60to75), F.mean(cluster_1.scored_75to90), F.mean(cluster_1.conceded_0to15), F.mean(cluster_1.conceded_15to30), F.mean(cluster_1.conceded_30to45), F.mean(cluster_1.conceded_45to60), F.mean(cluster_1.conceded_60to75), F.mean(cluster_1.conceded_75to90))
cluster_1_means.show()

+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|avg(scored_0to15)|avg(scored_15to30)|avg(scored_30to45)|avg(scored_45to60)|avg(scored_60to75)|avg(scored_75to90)|avg(conceded_0to15)|avg(conceded_15to30)|avg(conceded_30to45)|avg(conceded_45to60)|avg(conceded_60to75)|avg(conceded_75to90)|
+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|5.521739130434782| 6.521739130434782| 7.326086956521739| 7.391304347826087| 7.521739130434782| 9.826086956521738|  5.891304347826087|   6.673913043478261|                 7.5|   7.413043478260869|   7.869565217391305|   10.08695652173913|
+-----------------+------------------+--

In [None]:
cluster_2 = clusters_csv.filter(clusters_csv.prediction == 2)
cluster_2.show()

+------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|        event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|        AJ Auxerre|           1|            1|            2|            1|            2|            3|             2|              2|              3|              1|              2|              3|[1.0,1.0,2.0,1.0,...|[0.40731357113675...|    

In [None]:
cluster_2_means = cluster_2.select(F.mean(cluster_2.scored_0to15), F.mean(cluster_2.scored_15to30), F.mean(cluster_2.scored_30to45), F.mean(cluster_2.scored_45to60), F.mean(cluster_2.scored_60to75), F.mean(cluster_2.scored_75to90), F.mean(cluster_2.conceded_0to15), F.mean(cluster_2.conceded_15to30), F.mean(cluster_2.conceded_30to45), F.mean(cluster_2.conceded_45to60), F.mean(cluster_2.conceded_60to75), F.mean(cluster_2.conceded_75to90))
cluster_2_means.show()

+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| avg(scored_0to15)|avg(scored_15to30)|avg(scored_30to45)|avg(scored_45to60)|avg(scored_60to75)|avg(scored_75to90)|avg(conceded_0to15)|avg(conceded_15to30)|avg(conceded_30to45)|avg(conceded_45to60)|avg(conceded_60to75)|avg(conceded_75to90)|
+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|1.6595744680851063| 1.574468085106383|1.7872340425531914|1.8297872340425532|1.7872340425531914| 2.297872340425532|  2.021276595744681|  2.4468085106382977|   2.851063829787234|   2.723404255319149|   2.765957446808511|   3.404255319148936|
+------------------+----------------

In [None]:
cluster_3 = clusters_csv.filter(clusters_csv.prediction == 3)
cluster_3.show()

+--------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|          event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+--------------------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|          AC Ajaccio|           4|            2|            4|            2|            3|            6|             4|              4|              6|              6|              6|              7|[4.0,2.0,4.0,2.0,...|[1.62925428454702

In [None]:
cluster_3_means = cluster_3.select(F.mean(cluster_3.scored_0to15), F.mean(cluster_3.scored_15to30), F.mean(cluster_3.scored_30to45), F.mean(cluster_3.scored_45to60), F.mean(cluster_3.scored_60to75), F.mean(cluster_3.scored_75to90), F.mean(cluster_3.conceded_0to15), F.mean(cluster_3.conceded_15to30), F.mean(cluster_3.conceded_30to45), F.mean(cluster_3.conceded_45to60), F.mean(cluster_3.conceded_60to75), F.mean(cluster_3.conceded_75to90))
cluster_3_means.show()

+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|avg(scored_0to15)|avg(scored_15to30)|avg(scored_30to45)|avg(scored_45to60)|avg(scored_60to75)|avg(scored_75to90)|avg(conceded_0to15)|avg(conceded_15to30)|avg(conceded_30to45)|avg(conceded_45to60)|avg(conceded_60to75)|avg(conceded_75to90)|
+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|3.717948717948718| 4.230769230769231| 4.897435897435898| 4.615384615384615| 5.153846153846154| 6.410256410256411|  4.076923076923077|   4.948717948717949|   5.435897435897436|   5.282051282051282|   5.487179487179487|   7.358974358974359|
+-----------------+------------------+--

In [None]:
clusters_means = cluster_0_means
clusters_means.show()

+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|avg(scored_0to15)|avg(scored_15to30)|avg(scored_30to45)|avg(scored_45to60)|avg(scored_60to75)|avg(scored_75to90)|avg(conceded_0to15)|avg(conceded_15to30)|avg(conceded_30to45)|avg(conceded_45to60)|avg(conceded_60to75)|avg(conceded_75to90)|
+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              9.5|              11.2|              12.6|              12.2|              13.0|              16.2|                4.5|                 4.7|                 5.5|                 5.3|                 5.5|                 6.9|
+-----------------+------------------+--

In [None]:
clusters_means = clusters_means.union(cluster_1_means)
clusters_means = clusters_means.union(cluster_2_means)
clusters_means = clusters_means.union(cluster_3_means)
clusters_means.show()

+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| avg(scored_0to15)|avg(scored_15to30)|avg(scored_30to45)|avg(scored_45to60)|avg(scored_60to75)|avg(scored_75to90)|avg(conceded_0to15)|avg(conceded_15to30)|avg(conceded_30to45)|avg(conceded_45to60)|avg(conceded_60to75)|avg(conceded_75to90)|
+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               9.5|              11.2|              12.6|              12.2|              13.0|              16.2|                4.5|                 4.7|                 5.5|                 5.3|                 5.5|                 6.9|
| 5.521739130434782| 6.5217391304347

In [None]:
clusters_means.toPandas().to_csv('/content/drive/My Drive/clusters_means.csv', index=False)

In [None]:
clusters_means_csv = spark.read.csv(path = '/content/drive/My Drive/clusters_means.csv', header = True)
clusters_means_csv.show()

+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| avg(scored_0to15)|avg(scored_15to30)|avg(scored_30to45)|avg(scored_45to60)|avg(scored_60to75)|avg(scored_75to90)|avg(conceded_0to15)|avg(conceded_15to30)|avg(conceded_30to45)|avg(conceded_45to60)|avg(conceded_60to75)|avg(conceded_75to90)|
+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               9.5|              11.2|              12.6|              12.2|              13.0|              16.2|                4.5|                 4.7|                 5.5|                 5.3|                 5.5|                 6.9|
| 5.521739130434782| 6.5217391304347

In [None]:
a = clusters_means.collect()[0]
a

Row(avg(scored_0to15)=9.5, avg(scored_15to30)=11.2, avg(scored_30to45)=12.6, avg(scored_45to60)=12.2, avg(scored_60to75)=13.0, avg(scored_75to90)=16.2, avg(conceded_0to15)=4.5, avg(conceded_15to30)=4.7, avg(conceded_30to45)=5.5, avg(conceded_45to60)=5.3, avg(conceded_60to75)=5.5, avg(conceded_75to90)=6.9)

In [None]:
a[0]

9.5

In [12]:
b = list(clusters_csv.select('event_team').toPandas()['event_team'])
print(b)

['AC Ajaccio', 'AC Milan', 'AJ Auxerre', 'AS Monaco', 'AS Nancy Lorraine', 'AS Roma', 'Alaves', 'Almeria', 'Angers', 'Arsenal', 'Aston Villa', 'Atalanta', 'Athletic Bilbao', 'Atletico Madrid', 'Barcelona', 'Bastia', 'Bayer Leverkusen', 'Bayern Munich', 'Bologna', 'Bordeaux', 'Borussia Dortmund', 'Borussia Monchengladbach', 'Bournemouth', 'Brest', 'Burnley', 'Caen', 'Cagliari', 'Cardiff', 'Carpi', 'Catania', 'Celta Vigo', 'Cesena', 'Chelsea', 'Chievo Verona', 'Cordoba', 'Crotone', 'Crystal Palace', 'Deportivo La Coruna', 'Dijon FCO', 'Eibar', 'Eintracht Frankfurt', 'Elche', 'Empoli', 'Espanyol', 'Everton', 'Evian Thonon Gaillard', 'FC Augsburg', 'FC Cologne', 'FC Ingolstadt 04', 'Fiorentina', 'Fortuna Dusseldorf', 'Frosinone', 'Fulham', 'GFC Ajaccio', 'Genoa', 'Getafe', 'Granada', 'Guingamp', 'Hamburg SV', 'Hannover 96', 'Hellas Verona', 'Hertha Berlin', 'Hull', 'Internazionale', 'Juventus', 'Kaiserslautern', 'Las Palmas', 'Lazio', 'Lecce', 'Leganes', 'Leicester City', 'Lens', 'Levante'

[Column<'event_team'>]


In [13]:
c = clusters_csv.filter(clusters_csv['event_team'] == 'Alaves')
c.show()

+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|event_team|scored_0to15|scored_15to30|scored_30to45|scored_45to60|scored_60to75|scored_75to90|conceded_0to15|conceded_15to30|conceded_30to45|conceded_45to60|conceded_60to75|conceded_75to90|            features|     features_scaled|prediction|
+----------+------------+-------------+-------------+-------------+-------------+-------------+--------------+---------------+---------------+---------------+---------------+---------------+--------------------+--------------------+----------+
|    Alaves|           1|            1|            1|            1|            1|            1|             0|              1|              1|              1|              1|              2|[1.0,1.0,1.0,1.0,...|[0.40731357113675...|         2|
+----------+------------

In [19]:
d = c.collect()[0]
print(d)

Row(event_team='Alaves', scored_0to15='1', scored_15to30='1', scored_30to45='1', scored_45to60='1', scored_60to75='1', scored_75to90='1', conceded_0to15='0', conceded_15to30='1', conceded_30to45='1', conceded_45to60='1', conceded_60to75='1', conceded_75to90='2', features='[1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0]', features_scaled='[0.40731357113675515,0.3355236448163077,0.2944322264782099,0.3026139324231408,0.2893013411012911,0.23055490463466763,0.0,0.4832502859328248,0.43276150855991563,0.4313756884340112,0.41002525822739155,0.6300871646251658]', prediction='2')
