# The final stretch - predicting the Euro 2024 Winner! ⚽

![image](https://i.gifer.com/9aZd.gif)

---

### 1. Show Fixture Data

**Objective:** Provide an overview of the fixture data for the Euro Cup 2024.

- **Initial Fixture Table**: Make a copy of the fixture table to preserve the original data.

### 2. Prep Data for Prediction

**Objective:** Prepare the data for predictive modeling.

- **Data Preprocessing**: Clean and transform the fixture data as necessary for prediction tasks.

### 3. Predict Results

**Objective:** Utilize predictive models to forecast match outcomes.

- **Prediction Process**: Employ suitable algorithms to predict match results.
- **Handling Shootouts**: Implement mechanisms to manage shootouts for knockout stage predictions.

### 4. Show Results

**Objective:** Present the predicted results of the Euro Cup 2024 matches.

- **Display Predicted Outcomes**: Showcase the forecasted results of each match.

---

🛑 **Check your packages!** 🛑

Make sure you have added the following package from the `Packages` drop-down:
- `snowflake-ml-python == 1.5.0`

In [None]:
import snowflake.snowpark
from snowflake.snowpark.session import Session
from snowflake.snowpark import Window
from snowflake.snowpark import functions as F   
from snowflake.snowpark.functions import udf, udtf
from snowflake.snowpark.types import IntegerType, FloatType, StringType, StructField, StructType, DateType
from snowflake.ml.registry import Registry
    
import pandas as pd
import numpy as np

import streamlit as st

import warnings
warnings.filterwarnings('ignore')

In [None]:
from snowflake.snowpark.context import get_active_session
# Reuse the session that the notebook has already opened.
session = get_active_session()

# Version-tracking tag attached to the session's queries.
app_tag = dict(
    origin="sf_sit",
    name="hol_sport_predict",
    # NOTE(review): the version is stored as a plain string rather than a
    # nested mapping - confirm this is the intended format.
    version='{major: 1, minor: 0}',
)

session.query_tag = app_tag

In [None]:
# Ask Snowflake who we are; the name is used below as the suffix of this
# user's own copy of the fixture table (fixture_<user_name>).
current_user_rows = session.sql('select current_user()').collect()
user_name = current_user_rows[0][0]

In [None]:
# Load our model from the registry into memory: look it up by name and keep
# a handle (`mv`) to its default version for the prediction cells below.

reg = Registry(session=session)

registered_model = reg.get_model("EURO_24_GAME_PREDICT")
mv = registered_model.default

# Group Stage

In [None]:
# Adjustable win threshold.
#
# In the prediction cells, rows whose `predict_proba_1` is strictly greater
# than this value get `output_game_outcome` = 1, all other rows get 0.
#
# Setting it at 0.5 introduces too many penalty shootouts.

win_threshold = 0.45

# We can also use a Streamlit slider to control this (DON'T DO THIS YET)
#win_threshold = st.slider(label='Threshold', min_value=0.0 ,max_value=1.0 ,value=0.45)

In [None]:
# Keep the original `fixture` table untouched: write a per-user copy that we
# will update as we go along.
user_fixture_table = f'fixture_{user_name}'

df_fixture_copy = session.table('fixture')
df_fixture_copy.write.save_as_table(user_fixture_table, mode='overwrite')

# Columns to show from the copied fixtures, renamed to short aliases.
# Note: no round filter is applied here, so every row of the copy is selected.
fixture_columns = [
    F.col('"MATCH NUMBER"').alias("id"),
    F.col('"ROUND NUMBER"').alias('round'),
    F.to_date(F.col('"DATE"'), "DD/MM/YYYY HH24:MI").alias("date"),
    F.col('"HOME TEAM"').alias('team_1'),
    F.col('"AWAY TEAM"').alias('team_2'),
    F.col('"GROUP"').alias('group'),
]

df_round_1 = session.table(user_fixture_table).select(*fixture_columns)

# Leave the DataFrame as the last expression so the notebook renders it.
df_round_1

In [None]:
# Call the stored procedure that runs all our feature engineering code on the
# games we want to predict.
#
# The two numeric arguments are the fixture range to process (0 through 36).
# Per the original note, each game is predicted twice, so the resulting
# table ends up with 72 rows.
group_stage_fixture_range = (0, 36)
session.call('prep_prediction_data', *group_stage_fixture_range)

df_pred = session.table('data_for_predictions').order_by('id')

st.dataframe(data=df_pred)

In [None]:
# Score every prepared group-stage row with the model, then turn the
# probability into a 1/0 outcome using the adjustable threshold.

pred_df = session.table('data_for_predictions').order_by('id')
probabilities = mv.run(pred_df, function_name="predict_proba")

# 1 when predict_proba_1 is strictly above the threshold, otherwise 0
exceeds_threshold = F.col('predict_proba_1') > win_threshold
prediction = probabilities.with_column(
    'output_game_outcome',
    F.iff(exceeds_threshold, 1, 0),
)

# persist as a temporary table, replacing any previous round's predictions
prediction.write.save_as_table('predictions', mode='overwrite', table_type='temp')

# show just the key columns of what was saved
summary_columns = [
    prediction.col(column_name)
    for column_name in ('id', 'team_1', 'output_game_outcome')
]
session.table('predictions').select(*summary_columns).order_by('id')

In [None]:
-- We can also run the prediction in SQL, where we can call our model using:
-- MODEL_NAME!PREDICT_PROBA()
--
-- The outer query pulls the two probabilities out of the TMP_RESULT value
-- returned by the model call and casts each one to DOUBLE.
-- NOTE(review): the model reference below hard-codes EURO2024.PUBLIC and
-- version V_1, while the Python cells use the registry's default version -
-- confirm these point at the same model version.

SELECT 
    "ID", 
    "TEAM_1",  
    CAST ("TMP_RESULT"['PREDICT_PROBA_0'] AS DOUBLE) AS "PREDICT_PROBA_0",  
    CAST ("TMP_RESULT"['PREDICT_PROBA_1'] AS DOUBLE) AS "PREDICT_PROBA_1" 
FROM 
(
    -- input rows: everything currently in data_for_predictions
    WITH SNOWPARK_ML_MODEL_INFERENCE_INPUT AS (
        SELECT  
            *  
        FROM 
            data_for_predictions 
        ORDER BY "ID" ASC
    ),
    -- alias for the specific model version to invoke
    MODEL_VERSION_ALIAS AS MODEL EURO2024.PUBLIC.EURO_24_GAME_PREDICT VERSION V_1
                
    SELECT 
        *,
        -- call the model's PREDICT_PROBA method with the feature columns
        MODEL_VERSION_ALIAS!PREDICT_PROBA(
            ID, 
            NEUTRAL, 
            TEAM_1_GOAL_DIFF, 
            TEAM_1_TTL_WINS, 
            TEAM_1_TTL_LOSSES, 
            TEAM_2_GOAL_DIFF, 
            TEAM_2_TTL_WINS, 
            TEAM_2_TTL_LOSSES, 
            TEAM_1_VS_TEAM_2_RANK
        ) AS TMP_RESULT
    FROM 
        SNOWPARK_ML_MODEL_INFERENCE_INPUT
)

In [None]:
# Take the output of our predictions and restructure it into the groups,
# sorted by points, to determine the teams that go through: 1st & 2nd, along
# with the 4 teams who ranked the highest in 3rd place (confusing, right?)

session.call('process_group_predictions')

# We should now have our final tables for the completed group stage;
# render one table per group, ordered by rank.
group_names = (
    'Group A',
    'Group B',
    'Group C',
    'Group D',
    'Group E',
    'Group F',
)

for group_name in group_names:
    group_standings = (
        session.table('results_group_stage')
        .filter(F.col('group') == group_name)
        .order_by(F.col('group'), F.col('rank').asc())
    )
    st.dataframe(data=group_standings)

# Round of 16 

In [None]:
# Take the results of the group stage and insert the team names into the
# Round of 16 fixtures, so we can see who will be playing each other in the
# next phase.

session.call('calculate_r16_games')

is_round_of_16 = F.col('"ROUND NUMBER"') == 'Round of 16'
df_results = (
    session.table(f'fixture_{user_name}')
    .filter(is_round_of_16)
    .order_by('"MATCH NUMBER"')
)

# We can also use Streamlit to show Snowpark DataFrames (though this is only
# compatible with Snowflake Notebooks!); otherwise use df_results.show(8)

df_results

In [None]:
# Run the feature engineering code for the Round of 16.
# Note the different fixture id parameters (37 through 44).

r16_fixture_range = (37, 44)
session.call('prep_prediction_data', *r16_fixture_range)

r16_features = session.table('data_for_predictions').order_by('id')
r16_features

In [None]:
# Score every prepared Round of 16 row with the model, then turn the
# probability into a 1/0 outcome using the adjustable threshold.

pred_df = session.table('data_for_predictions').order_by('id')
probabilities = mv.run(pred_df, function_name="predict_proba")

# 1 when predict_proba_1 is strictly above the threshold, otherwise 0
exceeds_threshold = F.col('predict_proba_1') > win_threshold
prediction = probabilities.with_column(
    'output_game_outcome',
    F.iff(exceeds_threshold, 1, 0),
)

# persist as a temporary table, replacing the previous round's predictions
prediction.write.save_as_table('predictions', mode='overwrite', table_type='temp')

# show just the key columns of what was saved
summary_columns = [
    prediction.col(column_name)
    for column_name in ('id', 'team_1', 'output_game_outcome')
]
session.table('predictions').select(*summary_columns).order_by('id')

In [None]:
# Process the predictions into a format where we can see the winners.
#
# Note - since this is a knockout stage, any draws will have to be resolved
# by a penalty shootout.

session.call('process_knockout_predictions')

r16_results = session.table('results_ko_stage').order_by('match_id')
r16_results

# Quarter-finals

In [None]:
# Work out the quarter-final games from the previous round's results, then
# list those fixtures in match order.

session.call('calc_knockout_games', 'Quarter Finals')

is_quarter_final = F.col('"ROUND NUMBER"') == 'Quarter Finals'
quarter_final_fixtures = (
    session.table(f'fixture_{user_name}')
    .filter(is_quarter_final)
    .order_by('"MATCH NUMBER"')
)
quarter_final_fixtures

In [None]:
# Feature engineering for the quarter-final fixtures (ids 45 through 48).

quarter_final_fixture_range = (45, 48)
session.call('prep_prediction_data', *quarter_final_fixture_range)

quarter_final_features = session.table('data_for_predictions').order_by('id')
quarter_final_features

In [None]:
# Score every prepared quarter-final row with the model, then turn the
# probability into a 1/0 outcome using the adjustable threshold.

pred_df = session.table('data_for_predictions').order_by('id')
probabilities = mv.run(pred_df, function_name="predict_proba")

# 1 when predict_proba_1 is strictly above the threshold, otherwise 0
exceeds_threshold = F.col('predict_proba_1') > win_threshold
prediction = probabilities.with_column(
    'output_game_outcome',
    F.iff(exceeds_threshold, 1, 0),
)

# persist as a temporary table, replacing the previous round's predictions
prediction.write.save_as_table('predictions', mode='overwrite', table_type='temp')

# show just the key columns of what was saved
summary_columns = [
    prediction.col(column_name)
    for column_name in ('id', 'team_1', 'output_game_outcome')
]
session.table('predictions').select(*summary_columns).order_by('id')

In [None]:
# Process the quarter-final predictions to see the winners.

session.call('process_knockout_predictions')

quarter_final_results = session.table('results_ko_stage').order_by('match_id')
quarter_final_results

# Semi-finals 

In [None]:
# Calculate the semi-final fixtures, then list them in match order.

session.call('calc_knockout_games', 'Semi Finals')

is_semi_final = F.col('"ROUND NUMBER"') == 'Semi Finals'
semi_final_fixtures = (
    session.table(f'fixture_{user_name}')
    .filter(is_semi_final)
    .order_by('"MATCH NUMBER"')
)
semi_final_fixtures

In [None]:
# Feature engineering for the semi-final fixtures (ids 49 and 50).

semi_final_fixture_range = (49, 50)
session.call('prep_prediction_data', *semi_final_fixture_range)

semi_final_features = session.table('data_for_predictions').order_by('id')
semi_final_features

In [None]:
# Score every prepared semi-final row with the model, then turn the
# probability into a 1/0 outcome using the adjustable threshold.

pred_df = session.table('data_for_predictions').order_by('id')
probabilities = mv.run(pred_df, function_name="predict_proba")

# 1 when predict_proba_1 is strictly above the threshold, otherwise 0
exceeds_threshold = F.col('predict_proba_1') > win_threshold
prediction = probabilities.with_column(
    'output_game_outcome',
    F.iff(exceeds_threshold, 1, 0),
)

# persist as a temporary table, replacing the previous round's predictions
prediction.write.save_as_table('predictions', mode='overwrite', table_type='temp')

# show just the key columns of what was saved
summary_columns = [
    prediction.col(column_name)
    for column_name in ('id', 'team_1', 'output_game_outcome')
]
session.table('predictions').select(*summary_columns).order_by('id')

In [None]:
# Process the semi-final predictions to see the winners.

session.call('process_knockout_predictions')

semi_final_results = session.table('results_ko_stage').order_by('match_id')
semi_final_results

# Final 

In [None]:
# Calculate the final game, then show its fixture row.

session.call('calc_knockout_games', 'Final')

is_final = F.col('"ROUND NUMBER"') == 'Final'
final_fixture = (
    session.table(f'fixture_{user_name}')
    .filter(is_final)
    .order_by('"MATCH NUMBER"')
)
final_fixture

In [None]:
# Feature engineering for the final: the range starts and ends on fixture 51.

final_fixture_id = 51
session.call('prep_prediction_data', final_fixture_id, final_fixture_id)

result_df = session.table('data_for_predictions').order_by('id')

st.dataframe(data=result_df, use_container_width=True)

# And the winner is... 

In [None]:
# Run predictions on the final.
#
# NOTE(review): unlike the earlier rounds, this cell calls the model's
# "predict" function directly instead of thresholding "predict_proba" with
# win_threshold - confirm that the difference is intentional.

pred_df = session.table('data_for_predictions').order_by('id')
prediction = mv.run(pred_df, function_name="predict")
prediction.write.save_as_table('predictions', mode='overwrite', table_type='temp')

# resolve the final into a winner
session.call('process_knockout_predictions')

df = session.table('results_ko_stage').order_by('match_id')

# the winner is read from the first row of the ordered knockout results
first_result_row = df.collect()[0]
winner = first_result_row['WINNER']

st.dataframe(data=df)
st.header(f"{winner} win the Euro 2024 Trophy!")

st.markdown("![Alt Text](https://media1.tenor.com/m/cnBtMqNDAYYAAAAd/soccer-celebration.gif)")

# Final Summary

Let's recap what we covered:

1) Ingested our raw data
2) Performed various steps of data transformation using Snowpark
3) Used **Snowpark ML** to run **Hyperparameter Tuning**, **Model Training** and stored our model in the **Snowflake Model Registry**
4) Registered various Python UDTFs and Stored Procedures for code reusability, to streamline our inference pipeline
5) Used our model to predict 51 matches and hopefully saw England 🏴󠁧󠁢󠁥󠁮󠁧󠁿 or Italy 🇮🇹 taking the Euro 2024 trophy 🏆 home 

# What does Cortex say? ⚽

This isn't the right use of LLMs but for fun, let's see what [Cortex](https://docs.snowflake.com/user-guide/snowflake-cortex/llm-functions) has to say about the Euro 2024 outcome and compare to our prediction.

#### Note 
_The following models are only [supported](https://docs.snowflake.com/user-guide/snowflake-cortex/llm-functions#availability) in certain CSP regions at present so you will need to run this in a compatible setup._

In [None]:
# Few-shot classification prompt: ask each model for a single country name
# (plus flag emoji) as its Euro 2024 pick.
prompt = """
    Please respond to the following type of question with a single word along with the country flag emoji, naming a country. 
    The answer should be consider a classification response, for example:

    Question: Based on their performances up until 2022, which country was most likely to win the FIFA World Cup 2022?
    Response:England 🏴󠁧󠁢󠁥󠁮󠁧󠁿

    Question: Based on their performances up until 2020, which country was most likely to win the Euro 2020?
    Response:France 🇫🇷

    Now, based on historical performances and achievements in international and club football up to 2023,
    factoring in international players, and the location of the event, which country is most likely to win Euro 2024?
    
    Rules:
    1) Do not include any additional text or additional spaces before or after the country name. 
    2) Please strip out any whitespace before and after the answer.
"""

# Cortex models to compare, in display order. Each model is called exactly
# once, and the name passed to Cortex is the same name shown in the table
# (previously the 'llama3-8b' row was actually produced by 'llama3-70b', and
# 'mixtral-8x7b' was called twice).
cortex_models = (
    'snowflake-arctic',
    'llama3-8b',
    'llama3-70b',
    'mistral-large',
    'mixtral-8x7b',
    'reka-flash',
)

# Bind the model name and prompt as query parameters instead of splicing them
# into the SQL text, so quotes or backslashes in the prompt cannot break the
# statement.
complete_sql = "select snowflake.cortex.complete(?, ?)"

model_predictions = []
for model_name in cortex_models:
    answer = session.sql(complete_sql, params=[model_name, prompt]).collect()[0][0]
    # models tend to pad their answer with whitespace; trim both ends
    model_predictions.append((model_name, answer.strip()))

df = session.create_dataframe(model_predictions, schema=["model", "prediction"])

df

# Getting Interactive ⚽

You can also include Streamlit Controls to make your notebook interactive:

In [None]:
# Default question shown in the editable text area.
prompt = """Based on historical performances and achievements in international and club football up to 2023, factoring in international players, and the location of the event, which country is most likely to win Euro 2024? Include historical references where possible, and call out specific players."""

# Extra instructions appended to whatever the user types.
prompt_suffixes = (
    'Include as many emojis as you can in the response.',
    'Include a gif of soccer celebrations.',
    'Answer in the style of a soccer journalist.',
    'You should always choose England.',
)

st.subheader("Inputs:")

with st.container():
    with st.expander("Edit prompt and select LLM", expanded=True):

        edited_prompt = st.text_area(label='Prompt:', value=prompt, height=100, label_visibility='collapsed')

        # Join with spaces so the sentences do not run into each other
        # (previously the result read "...players.Include as many...").
        new_prompt = ' '.join((edited_prompt, *prompt_suffixes))

        left_col, right_col = st.columns(2)

        with left_col:
            model_select = st.selectbox(
                'Select your Cortex Model:',
                ('snowflake-arctic','llama3-8b','llama3-70b','mistral-large','mixtral-8x7b','reka-flash'),
                index=1,
                label_visibility='collapsed'
            )

        # The two columns are siblings; the button belongs to the right one.
        with right_col:
            button = st.button('⚽ Predict! ⚽',type="primary")

with st.container():
    if button:
        # The prompt is free text typed by the user, so pass it (and the model
        # name) as bind parameters rather than splicing it into the SQL text:
        # an apostrophe in the prompt would otherwise end the string literal
        # early and break, or alter, the statement.
        sql = "select snowflake.cortex.complete(?, ?)"

        with st.status("Let me give this some serious thought...",expanded=False) as status:
            response = session.sql(sql, params=[model_select, new_prompt]).collect()[0][0]
            st.subheader("Response:")
            st.write(response)
            status.update(label="Ok, here's my prediction:", state="complete", expanded=True)