# Expected Goal Model using logistic regression

In [1]:
import sys
import os
import pandas as pd
import numpy as np

import math

project_root = os.path.abspath("..")

if project_root not in sys.path:
    sys.path.append(project_root)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import pyodbc
from sqlalchemy import create_engine
from preprocessing.event_data import add_position_in_meters


In [2]:
WINDOWS_HOST_IP = '172.24.176.1'
SERVER_PORT = 1433


query = """
WITH TAGS AS (
    SELECT 
        ET.eventRecordID,

        -- Flag cho các loại tag:
        MAX(CASE WHEN TN.Description = 'Goal' THEN 1 END) AS Goal,
        MAX(CASE WHEN TN.Description = 'Own goal' THEN 1 END) AS OwnGoal,
        MAX(CASE WHEN TN.Description = 'Counter attack' THEN 1 END) AS CounterAttack,

        -- Body part: lấy tagID (401,402,403) của cú dứt điểm
        MAX(CASE WHEN ET.tagID IN (401, 402, 403) THEN ET.tagID END) AS BodyPartTagID
    FROM EVENTTAGS ET
    LEFT JOIN TAGSNAME TN
        ON ET.tagID = TN.tagID
    GROUP BY ET.eventRecordID
)

SELECT
    EV.matchID,
    EV.matchPeriod,
    EV.eventSec,
    EN.eventName,
    EN.subEventName,
    EV.teamID,
    EV.posOrigX,
    EV.posOrigY,
    EV.posDestX,
    EV.posDestY,

    EV.playerID,
    P.Sname AS playerName,
    P.Prole AS playerPosition,
    P.foot AS playerStrongFoot,

    EV.teamID AS teamPossession,
    MT_home.teamID AS homeTeamId,
    MT_away.teamID AS awayTeamId,

    -- Flags
    ISNULL(T.Goal, 0) AS Goal,
    ISNULL(T.OwnGoal, 0) AS OwnGoal,
    ISNULL(T.CounterAttack, 0) AS CounterAttack,

    -----------------------------------------
    -- Body Part (KHÔNG nhân dòng)
    -----------------------------------------
    CASE
        WHEN T.BodyPartTagID = 401 THEN 'leftFoot'
        WHEN T.BodyPartTagID = 402 THEN 'rightFoot'
        WHEN T.BodyPartTagID = 403 THEN 'head/body'
        ELSE NULL
    END AS bodyPartShot,

    -- Body part code cho xG
    CASE
        WHEN T.BodyPartTagID = 403 THEN 0
        WHEN T.BodyPartTagID = 401 AND P.foot = 'right' THEN 1 --Weak foot
        WHEN T.BodyPartTagID = 402 AND P.foot = 'left' THEN 1 --Weak foot
        WHEN T.BodyPartTagID IN (401, 402) THEN 2 --Strong foot
        ELSE NULL
    END AS bodyPartShotCode

FROM EVENTS AS EV

-- Event name
LEFT JOIN EVENTSNAME AS EN
    ON EV.subEventID = EN.subEventID

-- Player
LEFT JOIN PLAYERS AS P
    ON EV.playerID = P.playerID

-- Home / Away teams
LEFT JOIN MATCHTEAMS AS MT_home
    ON EV.matchID = MT_home.matchID AND MT_home.side = 'home'

LEFT JOIN MATCHTEAMS AS MT_away
    ON EV.matchID = MT_away.matchID AND MT_away.side = 'away'

-- JOIN TAGS duy nhất → không bao giờ nhân dòng
LEFT JOIN TAGS AS T
    ON EV.eventRecordID = T.eventRecordID

ORDER BY EV.matchID, EV.eventSec;
"""


# Định dạng URI
# 'mssql+pyodbc://<UID>:<PWD>@<DSN>' hoặc
# 'mssql+pyodbc:///?odbc_connect=<CONNECTION_STRING_ĐƯỢC_URL_ENCODE>'

sql_uri = (
    f"mssql+pyodbc://WSL2_SQL_ServerPort_1433:Dat23012003"
    f"@{WINDOWS_HOST_IP}:{SERVER_PORT}/csdl_Soccer"
    f"?driver=ODBC+Driver+18+for+SQL+Server"
    f"&TrustServerCertificate=yes"
)

engine = create_engine(sql_uri)

# Kết nối và Truy vấn
try:
    # Sử dụng engine trực tiếp trong read_sql
    df= pd.read_sql(query, engine)
    print("Truy vấn thành công bằng SQLAlchemy!")
except Exception as e:
    print("Lỗi khi sử dụng SQLAlchemy:", e)

Truy vấn thành công bằng SQLAlchemy!


In [3]:
df_events = add_position_in_meters(df_events=df, cols_length=["posOrigX", "posDestX"], cols_width=["posOrigY", "posDestY"], field_length=105, field_width=68).copy()
df_events.head()

Unnamed: 0,matchID,matchPeriod,eventSec,eventName,subEventName,teamID,playerID,playerName,playerPosition,playerStrongFoot,...,awayTeamId,Goal,OwnGoal,CounterAttack,bodyPartShot,bodyPartShotCode,posOrigXMeters,posDestXMeters,posOrigYMeters,posDestYMeters
0,1694390,2H,0.814,Pass,Simple pass,11944,83753.0,N. Stanciu,Midfielder,right,...,11944,0,0,0,,,51.45,51.45,32.64,32.64
1,1694390,2H,0.814,Pass,Simple pass,11944,6165.0,F. Andone,Forward,right,...,11944,0,0,0,,,40.95,40.95,34.0,34.0
2,1694390,1H,1.25599,Pass,Simple pass,4418,26010.0,O. Giroud,Forward,left,...,11944,0,0,0,,,52.5,49.35,32.64,34.0
3,1694390,1H,2.351908,Pass,Simple pass,4418,3682.0,A. Griezmann,Forward,left,...,11944,0,0,0,,,49.35,43.05,34.0,32.64
4,1694390,2H,2.677,Pass,High pass,11944,83824.0,M. Pintilii,Midfielder,right,...,11944,0,0,0,,,72.45,32.55,9.52,58.48


In [4]:
df_shots = df_events[df_events['eventName'] == 'Shot'].copy()

In [5]:
df_shots.head()

Unnamed: 0,matchID,matchPeriod,eventSec,eventName,subEventName,teamID,playerID,playerName,playerPosition,playerStrongFoot,...,awayTeamId,Goal,OwnGoal,CounterAttack,bodyPartShot,bodyPartShotCode,posOrigXMeters,posDestXMeters,posOrigYMeters,posDestYMeters
24,1694390,1H,31.226217,Shot,Shot,4418,25437.0,B. Matuidi,Midfielder,left,...,11944,0,0,0,rightFoot,1.0,95.55,0.0,19.72,0.0
61,1694390,2H,100.604872,Shot,Shot,11944,83753.0,N. Stanciu,Midfielder,right,...,11944,0,0,0,rightFoot,2.0,105.0,21.0,68.0,45.56
85,1694390,2H,130.592908,Shot,Shot,11944,33235.0,B. Stancu,Forward,right,...,11944,0,0,0,rightFoot,2.0,105.0,0.0,68.0,0.0
89,1694390,1H,143.119551,Shot,Shot,11944,83824.0,M. Pintilii,Midfielder,right,...,11944,0,0,0,rightFoot,2.0,74.55,105.0,19.72,68.0
137,1694390,1H,219.576026,Shot,Shot,11944,33235.0,B. Stancu,Forward,right,...,11944,0,0,0,rightFoot,2.0,100.8,105.0,38.76,68.0


In [6]:
total_shots = len(df_shots)
print(f"Tổng số cú sút: {total_shots}")
print(f"Xác suất ghi bàn khi thực hiện cú sút: {df_shots['Goal'].mean()*100:.2f}%")

Tổng số cú sút: 43071
Xác suất ghi bàn khi thực hiện cú sút: 10.42%


In [7]:
df[(df['subEventName'] == 'Goal kick') & (df['playerID'] == 50849)].head(20)

Unnamed: 0,matchID,matchPeriod,eventSec,eventName,subEventName,teamID,posOrigX,posOrigY,posDestX,posDestY,...,playerPosition,playerStrongFoot,teamPossession,homeTeamId,awayTeamId,Goal,OwnGoal,CounterAttack,bodyPartShot,bodyPartShotCode
1974,1694391,2H,594.258748,Free Kick,Goal kick,8731,59,67,59,67,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2192,1694391,2H,972.681271,Free Kick,Goal kick,8731,67,62,67,62,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2193,1694391,1H,988.950778,Free Kick,Goal kick,8731,67,67,67,67,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2386,1694391,2H,1352.070483,Free Kick,Goal kick,8731,61,73,39,27,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2637,1694391,2H,1821.159704,Free Kick,Goal kick,8731,43,67,43,67,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2841,1694391,2H,2197.216352,Free Kick,Goal kick,8731,64,74,64,74,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2921,1694391,2H,2359.381294,Free Kick,Goal kick,8731,22,93,22,93,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2953,1694391,1H,2404.978511,Free Kick,Goal kick,8731,63,73,37,27,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
2976,1694391,1H,2456.406848,Free Kick,Goal kick,8731,61,71,39,29,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
3125,1694391,1H,2745.455884,Free Kick,Goal kick,8731,58,88,42,12,...,Goalkeeper,left,8731,8731,6697,0,0,0,,
