In [1]:
import geopy as geopy
from scipy.special import gamma
import numpy as np
import matplotlib.pyplot as plt
from functools import lru_cache
import math as math
from math import gamma
import datetime
import pandas as pd

## Translate Stadium to latitude and longitude to calculate the distance

In [2]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from geopy.distance import geodesic

# Successful lookups only, keyed by the exact search string, so each stadium is
# geocoded once per kernel session instead of once per match row.
_COORDINATE_CACHE = {}


def get_coordinates(stadium_name):
    """Geocode a stadium name with the Google Geocoding API.

    Parameters
    ----------
    stadium_name : str
        Free-text search string sent as the ``address`` query parameter.

    Returns
    -------
    tuple[float, float] | None
        ``(latitude, longitude)`` of the first result, or ``None`` when the API
        does not answer with status ``"OK"`` or returns no results.

    Raises
    ------
    RuntimeError
        If the ``GOOGLE_MAPS_API_KEY`` environment variable is not set.
    """
    import os

    if stadium_name in _COORDINATE_CACHE:
        return _COORDINATE_CACHE[stadium_name]

    # The key is read from the environment so it never ends up in the notebook or its outputs.
    api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the GOOGLE_MAPS_API_KEY environment variable to use the Google Geocoding API."
        )

    # `params` URL-encodes the values; names such as "St. James's Park" contain
    # spaces and apostrophes that must not be pasted into the URL verbatim.
    response = requests.get(
        "https://maps.googleapis.com/maps/api/geocode/json",
        params={"address": stadium_name, "key": api_key},
        timeout=10,  # fail instead of hanging the notebook on a stalled connection
    )
    data = response.json()

    if data.get("status") != "OK" or not data.get("results"):
        # Failures are deliberately not cached so a later call can retry.
        return None

    location = data["results"][0]["geometry"]["location"]
    coordinates = (location["lat"], location["lng"])
    _COORDINATE_CACHE[stadium_name] = coordinates
    return coordinates

def calculate_distance(home_stadium_name, away_stadium_name):
    """Return the distance in miles between two stadiums, or ``None``.

    Each name is resolved with ``get_coordinates``; the home stadium is looked
    up first. If a lookup fails, a message naming that stadium is printed and
    ``None`` is returned without resolving the remaining one.

    The distance comes from geopy's ``geodesic`` (an ellipsoidal-earth
    geodesic, not the spherical haversine formula).
    """
    endpoints = []
    for stadium in (home_stadium_name, away_stadium_name):
        point = get_coordinates(stadium)
        if point is None:
            print(f"Unable to find coordinates for {stadium}.")
            return None
        endpoints.append(point)

    return geodesic(*endpoints).miles

In [4]:
# Smoke test: geocode two stadium search strings and print the distance between them.
# This issues live geocoding requests through calculate_distance.
home_stadium_name = "West Bromwich Albion Stadium Megastore"
away_stadium_name = "Turf Moor Stadium"

distance = calculate_distance(home_stadium_name, away_stadium_name)
# calculate_distance returns None (after printing a message) when a lookup fails.
if distance is not None:
    print(f"The distance between {home_stadium_name} and {away_stadium_name} is approximately {distance:.2f} miles.")

The distance between West Bromwich Albion Stadium Megastore and Turf Moor Stadium is approximately 89.17 miles.


## Now let's import the dataset and preprocess the data

In [None]:
# Team name as it appears in the HomeTeam/AwayTeam columns -> search string used
# to geocode that team's ground. Kept in one place so the home and away lookups
# cannot drift apart.
STADIUM_BY_TEAM = {
    'Crystal Palace': 'Selhurst Park',
    'Fulham': 'Craven Cottage',
    'Bournemouth': 'Vitality Stadium',
    'Leeds': 'Elland Road',
    'Newcastle': "St. James's Park",
    'Tottenham': 'Tottenham Hotspur Stadium',
    'Everton': 'Goodison Park',
    'Leicester': 'King Power Stadium',
    'Man United': 'Old Trafford',
    'West Ham': 'London Stadium',
    'Brentford': 'Gtech Community Stadium',
    'Wolves': 'Molineux Stadium',
    'Southampton': "St. Mary's Stadium",
    'Arsenal': 'Emirates Stadium',
    'Brighton': 'Amex Stadium',
    'Aston Villa': 'Villa Park Reception',
    'Man City': 'Etihad Stadium',
    "Nott'm Forest": 'The Bridgford Stand',
    'Chelsea': 'Stamford Bridge',
    'Liverpool': 'Anfield',
    "Sheffield United": "Sheffield United Football Club Ticket Office",
    "West Brom": "West Bromwich Albion Stadium Megastore",
    "Cardiff": "Cardiff City Stadium",
    "Norwich": "Carrow Road Stadium",
    "Huddersfield": "Galpharm Stadium",
    "Watford": "Vicarage Road Stadium",
    "Burnley": "Turf Moor Stadium",
    "Stoke": "bet365 Stadium",
}

df = pd.read_csv('E0-2022.csv')

# Reverse the "/"-separated date parts so that a plain string sort orders the rows
# by date (assumes zero-padded day/month/year strings — TODO confirm against the CSV).
df["Date"] = df["Date"].str.split("/").str[::-1].apply("/".join)
df = df.sort_values(by="Date").reset_index(drop=True)

# Every 10 consecutive matches share a fixture number (presumably one match round).
df["fixture"] = df.index // 10

# Teams missing from the mapping keep their team name as the search string.
df["HomeStadium"] = df["HomeTeam"].replace(STADIUM_BY_TEAM)
df["AwayStadium"] = df["AwayTeam"].replace(STADIUM_BY_TEAM)

# One calculate_distance call per match; the result is None when a stadium cannot be geocoded.
df["Distance"] = df.apply(lambda row: calculate_distance(row["HomeStadium"], row["AwayStadium"]), axis=1)
# Distance relative to the average trip, so 1.0 means an average-length journey.
df["Distance_factor"] = df["Distance"]/df["Distance"].mean()

# Kick-off time "HH:MM" as the integer HHMM.
df['Time_num'] = (df['Time'].str.replace(':', '')).astype(int)

# Compact view that keeps the original kick-off string.
df_new = df[["fixture", "Time","Date", "HomeTeam","AwayTeam", "FTHG", "FTAG"]].copy()

# Modelling frame: goals, distance features and the Bet365 over/under 2.5 columns.
df = df[["fixture", "Date","Time_num", "HomeTeam","HomeStadium","AwayTeam","AwayStadium", "FTHG", "FTAG","Distance", "Distance_factor","B365C>2.5","B365C<2.5"]].copy()


In [None]:
# Inspect the modelling frame built in the preprocessing cell.
df

In [None]:
# density=True rescales the bars so that the histogram integrates to 1
plt.hist(df["Distance_factor"], density=True, bins=20, edgecolor='black')

# Add labels and title (the y-axis is a density, not a raw count)
plt.xlabel('Distance Factor')
plt.ylabel('Density')
plt.title('Distribution of Distance Factor')
# Show the plot
plt.show()

## Get team names

In [None]:
import pandas as pd
from typing import List

def get_all_teams(df: pd.DataFrame) -> List:
    """Return every team that appears as a home or away side, sorted by name.

    Sorting makes the result deterministic: plain ``set`` iteration order for
    strings changes between interpreter runs, which would otherwise change the
    team -> coefficient-row assignment derived from this list.

    Parameters
    ----------
    df : pd.DataFrame
        Match table with ``HomeTeam`` and ``AwayTeam`` columns. It is not modified.

    Returns
    -------
    List
        Unique team names in ascending order.
    """
    return sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))


### Prepare the parameters from the original paper

In [None]:
# Fixed model constants shared by the cells below.
coefficient = dict(
    KAPPA=-0.4561,        # default dependence parameter `k` of frank_copula
    C1=1.050,             # default Weibull shape for the home goal count (w_loglikelihood `c1`)
    C2=0.9831,            # default Weibull shape for the away goal count (w_loglikelihood `c2`)
    GAMMA=0.2958,         # added to the log-rate of the home side (BivariateWeibullModel._get_lambda)
    XI=0.002,             # time-decay rate applied per fixture of age in the model's log_likelihood
    GOAL_RANGE=range(5),  # goals per side enumerated when predicting score lines (0..4)
)

# Implement a Bivariate Weibull Model Step by Step

In [None]:
from functools import lru_cache
@lru_cache(maxsize=None)  # memoised: the recursion revisits the same (x, j, c) triples many times
def Alph_j(x: int, j: int, c: float) -> float:
    """Recursive series coefficient used by ``weibull_first_layer``.

    Base case ``x == 0``: ``gamma(c*j + 1) / gamma(j + 1)``.
    For ``x > 0`` the value is built from the ``x - 1`` coefficients with
    indices ``m = x-1 .. j-1``.

    Raises
    ------
    ValueError
        If ``x > 0`` and ``j < x``; the message is the ``(x, j)`` pair.
    """
    if x == 0:
        return gamma(c * j + 1) / gamma(j + 1)
    if j < x:
        raise ValueError(f"{x, j}")

    total = 0
    for m in range(x - 1, j):
        total += Alph_j(x - 1, m, c) * gamma(c * j - c * m + 1) / gamma(j - m + 1)
    return total

### Calculate the Weibull Count distribution

In [None]:
def weibull_first_layer(x: int, c: float, l: float, t: float = 1, j: int = 0) -> float:
    """Return the ``j``-th term of the series that ``weibull`` sums for count ``x``.

    ``c`` and ``l`` are the shape and rate arguments forwarded by ``weibull``;
    ``t`` is the length of the observation window (default 1).
    """
    sign = (-1) ** (x + j)  # the series alternates in sign
    scaled_rate = (l * t**c) ** j
    return sign * scaled_rate * Alph_j(x, j, c) / gamma(c * j + 1)


## probability of exactly x events (the cumulative version is c_weibull)
def weibull(x: int, c: float, l: float, t: float = 1) -> float:
    """Weibull count probability of observing exactly ``x`` events.

    The infinite series is truncated to the 50 terms ``j = x .. x + 49``.
    """
    total = 0
    for j in range(x, x + 50):  # 50 terms stand in for the infinite sum
        total += weibull_first_layer(x, c, l, t, j)
    return total

def c_weibull(x: int, c: float, l: float, t: float = 1) -> float:
    """Cumulative Weibull count probability: the sum of ``weibull(i, ...)`` for ``i = 0 .. x``.

    Returns 0 when ``x`` is negative, because the sum is then empty.
    """
    cumulative = 0
    for goals in range(x + 1):
        cumulative += weibull(goals, c, l, t)
    return cumulative

### Using a Frank Copula to generate a bivariate model

In [None]:
def frank_copula(u: float, v: float, k: float = coefficient['KAPPA']) -> float:
    """Frank copula value ``-1/k * log(1 + (e^{-ku} - 1)(e^{-kv} - 1) / (e^{-k} - 1))``.

    ``u`` and ``v`` are the two marginal cumulative probabilities; ``k`` is the
    dependence parameter and defaults to ``coefficient['KAPPA']`` as read when
    this function is defined.
    """
    joint_term = (np.exp(-k * u) - 1) * (np.exp(-k * v) - 1)
    normaliser = np.exp(-k) - 1
    return -1 / k * np.log(1 + joint_term / normaliser)

### Calculate The log-likelihood

In [None]:
def w_loglikelihood(
    y1: int, y2: int, l1: float, l2: float, c1=coefficient['C1'], c2=coefficient['C2']
) -> float:
    """Joint probability of the score line ``(y1, y2)`` under the copula model.

    Despite the name, no logarithm is taken here. The value is the copula
    evaluated on the rectangle between the cumulative probabilities at
    ``y - 1`` and ``y`` for each side:
    ``C(F1(y1), F2(y2)) - C(F1(y1), F2(y2-1)) - C(F1(y1-1), F2(y2)) + C(F1(y1-1), F2(y2-1))``.

    ``l1``/``c1`` are the rate and shape passed to ``c_weibull`` for the first
    count, ``l2``/``c2`` those for the second.
    """
    upper_1 = c_weibull(y1, c1, l1)
    upper_2 = c_weibull(y2, c2, l2)
    lower_1 = c_weibull(y1 - 1, c1, l1)
    lower_2 = c_weibull(y2 - 1, c2, l2)

    return (
        frank_copula(upper_1, upper_2)
        - frank_copula(upper_1, lower_2)
        - frank_copula(lower_1, upper_2)
        + frank_copula(lower_1, lower_2)
    )


## Build the Bivariate Weibull Model

In [None]:
class BivariateWeibullModel:
    """Goal model with two coefficients per team, fitted by gradient ascent.

    ``self.C`` has one row per team and two columns, ``alpha`` (column 0) and
    ``beta`` (column 1). A side's scoring rate is
    ``exp(alpha_scoring_team + beta_conceding_team [+ gamma if at home])``;
    the home rate additionally receives ``Distance_factor / DISTANCE_SCALE``.
    Score-line probabilities come from the module-level ``w_loglikelihood``.
    """

    # Divisor applied to a match's Distance_factor before it is added to the home rate.
    DISTANCE_SCALE = 20

    def __init__(self):
        # Home-advantage term; taken from the shared constants rather than fitted.
        self.gamma = coefficient['GAMMA']

    def _get_all_teams(self, df: pd.DataFrame):
        """Store the team names found in ``df`` on ``self.teams`` and return ``self``."""
        self.teams = get_all_teams(df)
        return self

    def _get_lambda(self, a: float, b: float, home: bool):
        """Return ``exp(a + b)``, or ``exp(a + b + self.gamma)`` for the home side."""
        _lambda = a + b
        if home:
            _lambda += self.gamma
        return np.exp(_lambda)

    def _home_rate(self, alpha: float, beta: float, distance_factor: float):
        """Home scoring rate including the distance term; shared by fitting and prediction."""
        return self._get_lambda(alpha, beta, home=True) + distance_factor / self.DISTANCE_SCALE

    @property
    def mapping_team(self):
        """Team name -> row index into ``self.C`` (the position in ``self.teams``)."""
        return {team: n for n, team in enumerate(self.teams)}

    @property
    def inverse_team(self):
        """Row index into ``self.C`` -> team name."""
        return {v: k for k, v in self.mapping_team.items()}

    @property
    def team_strength(self):
        """Current coefficients as a frame with columns ``team``, ``alpha``, ``beta``."""
        return (
            pd.DataFrame(self.C, columns=["alpha", "beta"])
            .reset_index()
            .rename(columns={"index": "team"})
            .assign(team=lambda frame: frame["team"].map(self.inverse_team))
        )

    def _initialise_coefficients(self):
        """Draw starting coefficients from N(0, 1) scaled by 0.1, shape (n_teams, 2).

        Uses NumPy's global random state, so seed it beforehand for reproducible fits.
        """
        self.C = 0.1 * (np.random.normal(0, 1, (len(self.teams), 2)))

    def log_likelihood(self, df: pd.DataFrame, C: np.ndarray) -> float:
        """Time-weighted log-likelihood of the matches in ``df`` under coefficients ``C``.

        Each match contributes ``weight * log(p)``, where ``p`` is the
        probability of its final score and
        ``weight = exp(-XI * (latest fixture - match fixture))``. The weight
        multiplies the log term: placing it inside the logarithm would only add
        a constant and leave the fitted coefficients unaffected by ``XI``.

        ``df`` needs the columns ``fixture``, ``HomeTeam``, ``AwayTeam``,
        ``FTHG``, ``FTAG`` and ``Distance_factor``.
        """
        team_index = self.mapping_team  # build the lookup once, not once per row
        max_fixture = df["fixture"].max()
        xi = coefficient['XI']

        log_l = 0.0
        for _, row in df.iterrows():
            i, j = team_index[row["HomeTeam"]], team_index[row["AwayTeam"]]
            l1 = self._home_rate(C[i][0], C[j][1], row["Distance_factor"])
            l2 = self._get_lambda(C[j][0], C[i][1], home=False)

            weight = np.exp(-xi * (max_fixture - row["fixture"]))
            log_l += weight * np.log(
                w_loglikelihood(y1=row["FTHG"], y2=row["FTAG"], l1=l1, l2=l2)
            )

        return log_l

    def _get_gradients(self, df: pd.DataFrame, C: np.ndarray):
        """Central-difference gradient of ``log_likelihood`` with respect to every entry of ``C``.

        Costs two full likelihood evaluations per coefficient.
        """
        eps = 1e-6
        gradients = np.zeros_like(C)

        for i in range(C.shape[0]):
            for j in range(C.shape[1]):
                C_plus = C.copy()
                C_minus = C.copy()
                C_plus[i, j] += eps
                C_minus[i, j] -= eps
                gradients[i, j] = (
                    self.log_likelihood(df, C_plus) - self.log_likelihood(df, C_minus)
                ) / (2 * eps)

        return gradients

    def fit(
        self,
        train: pd.DataFrame,
        n_iter: int = 50,
        learning_rate: float = 0.001,
        verbose: bool = True,
        test: pd.DataFrame = None,
        tol: float = 1e-3,
    ):
        """Fit the team coefficients by gradient ascent on the training log-likelihood.

        Parameters
        ----------
        train : pd.DataFrame
            Matches to fit on (see ``log_likelihood`` for the required columns).
        n_iter : int
            Maximum number of gradient steps.
        learning_rate : float
            Step size multiplied onto the gradient.
        verbose : bool
            Print the likelihood before fitting and after every step.
        test : pd.DataFrame, optional
            Held-out matches whose likelihood is tracked alongside the training one.
        tol : float
            Fitting stops once a step improves the training likelihood by less than this.

        The likelihood history is stored in ``self.train_likelihoods`` and, when
        ``test`` is given, in ``self.test_likelihoods``.
        """
        train = train.copy()
        self._get_all_teams(train)
        self._initialise_coefficients()

        train_log_likelihood = self.log_likelihood(train, self.C)
        self.train_likelihoods = [train_log_likelihood]
        if verbose:
            print(f"Starting train likelihood: {train_log_likelihood}")

        if test is not None:
            test_log_likelihood = self.log_likelihood(test, self.C)
            self.test_likelihoods = [test_log_likelihood]
            if verbose:
                print(f"Starting test likelihood: {test_log_likelihood}")

        C = self.C.copy()
        for n in range(n_iter):
            C += learning_rate * self._get_gradients(train, C)
            self.C = C.copy()

            train_log_likelihood = self.log_likelihood(train, self.C)
            self.train_likelihoods.append(train_log_likelihood)
            if test is not None:
                test_log_likelihood = self.log_likelihood(test, self.C)
                self.test_likelihoods.append(test_log_likelihood)

            if verbose:
                print(f"Step {n + 1}, current likelihood: {train_log_likelihood}")
                if test is not None:
                    print(f"Step {n + 1}, current test likelihood: {test_log_likelihood}")

            # Note: a step that lowers the likelihood also satisfies this test and ends the fit.
            if self.train_likelihoods[-1] - self.train_likelihoods[-2] < tol:
                print("Algorithm has converged, we can stop our fitting here")
                break

    def predict_all_scores(self, home: str, away: str, distance_factor: float = 0.0):
        """Probability of every score line with goals per side in ``GOAL_RANGE``.

        ``distance_factor`` is the match's ``Distance_factor``; it enters the
        home rate exactly as in ``log_likelihood`` so that predictions use the
        same rates the coefficients were fitted with. The default of 0 leaves
        the distance term out.

        Returns a frame with columns ``H`` (home goals), ``A`` (away goals), ``p``.
        """
        C = self.team_strength.set_index("team").to_dict(orient="index")

        # The rates do not depend on the score line, so compute them once.
        l1 = self._home_rate(C[home]["alpha"], C[away]["beta"], distance_factor)
        l2 = self._get_lambda(C[home]["beta"], C[away]["alpha"], home=False)

        results = [
            [i, j, w_loglikelihood(i, j, l1, l2)]
            for i in coefficient['GOAL_RANGE']
            for j in coefficient['GOAL_RANGE']
        ]

        return pd.DataFrame(results, columns=["H", "A", "p"])

    def predict_under_over_by_matches(self, home: str, away: str, distance_factor: float = 0.0):
        """Return ``(under, over)`` probabilities for the 2.5 total-goals line.

        ``under`` sums the enumerated score lines with at most two goals;
        ``over`` is its complement.
        """
        X = self.predict_all_scores(home, away, distance_factor)

        X["over"] = X["H"] + X["A"] > 2.5

        under = X[~X["over"]]["p"].sum()
        over = 1 - under

        return (under, over)

    def predict_under_and_over(
        self,
        df: pd.DataFrame,
    ) -> pd.DataFrame:
        """Under/over 2.5 probabilities for every match in ``df``.

        Uses each row's ``Distance_factor`` when that column exists. Returns a
        frame with columns ``under`` and ``over`` on the index of ``df``.
        """
        pred = df.apply(
            lambda row: self.predict_under_over_by_matches(
                row["HomeTeam"],
                row["AwayTeam"],
                row.get("Distance_factor", 0.0),
            ),
            axis=1,
        ).apply(pd.Series)

        return pred.rename(columns={0: "under", 1: "over"})

    def predict_result_by_matches(self, home: str, away: str, distance_factor: float = 0.0):
        """Home-win / draw / away-win probabilities, labelled ``"1"``, ``"X"``, ``"2"``.

        The probabilities are renormalised to sum to 1 because the enumerated
        score grid is truncated.
        """
        X = self.predict_all_scores(home, away, distance_factor)

        X["pred"] = "X"
        X.loc[X["H"] > X["A"], "pred"] = "1"
        X.loc[X["H"] < X["A"], "pred"] = "2"

        scores = X.groupby("pred")["p"].sum()
        return scores / scores.sum()

    def predict_1x2_goals(
        self,
        df: pd.DataFrame,
    ) -> pd.DataFrame:
        """1/X/2 probabilities for every match in ``df``.

        Uses each row's ``Distance_factor`` when that column exists.
        """
        return df.apply(
            lambda row: self.predict_result_by_matches(
                row["HomeTeam"],
                row["AwayTeam"],
                row.get("Distance_factor", 0.0),
            ),
            axis=1,
        )

    def get_results(self):
        """Return the coefficient matrix transposed: one column per team, rows alpha then beta."""
        return pd.DataFrame(self.C).T

## Now Let's Begin with the simpler Poisson distribution-based model
### 1.See the difference in Match Begin Time

In [None]:
from scipy.stats import poisson

In [None]:
# Load the score data and bucket matches by their kick-off time string.
# NOTE(review): kick-off times not listed below fall into none of the three buckets.
data = pd.read_csv('Data_Score.csv')

noon_kickoffs = ["12:00", "12:30", "13:00", "13:30", "14:00", "14:05", "14:15"]
afternoon_kickoffs = ["15:00", "16:30", "16:00", "17:00", "17:30", "17:45", "18:00"]
night_kickoffs = ["19:00", "19:15", "19:30", "19:45", "20:00", "20:15"]

data_noon = data[data["Time"].isin(noon_kickoffs)]

data_afternoon = data[data["Time"].isin(afternoon_kickoffs)]

data_night = data[data["Time"].isin(night_kickoffs)]

# Every match except those whose time is recorded as "00:00".
data_avg = data[data["Time"] != "00:00"]

In [None]:
# Goal counts at which every distribution is evaluated (0 to 6)
goal_counts = list(range(7))


def _poisson_goal_model(matches, goals_column):
    """Fit a Poisson to the mean of ``goals_column``.

    Returns ``(mean, frozen distribution, pmf evaluated at goal_counts)``.
    """
    mean_goals = matches[goals_column].mean()
    distribution = poisson(mean_goals)
    return mean_goals, distribution, distribution.pmf(goal_counts)


# Noon kick-offs
avg_home_goals_noon, home_goals_dist_noon, home_probs_noon = _poisson_goal_model(data_noon, 'FTHG')
avg_away_goals_noon, away_goals_dist_noon, away_probs_noon = _poisson_goal_model(data_noon, 'FTAG')

# Afternoon kick-offs
avg_home_goals_afternoon, home_goals_dist_af, home_probs_af = _poisson_goal_model(data_afternoon, 'FTHG')
avg_away_goals_afternoon, away_goals_dist_af, away_probs_af = _poisson_goal_model(data_afternoon, 'FTAG')

# Night kick-offs
avg_home_goals_night, home_goals_dist_night, home_probs_night = _poisson_goal_model(data_night, 'FTHG')
avg_away_goals_night, away_goals_dist_night, away_probs_night = _poisson_goal_model(data_night, 'FTAG')

# All matches in `data`
avg_home_goals, home_goals_dist, home_probs = _poisson_goal_model(data, 'FTHG')
avg_away_goals, away_goals_dist, away_probs = _poisson_goal_model(data, 'FTAG')


In [None]:
# Bar width and the x position of each goal count
bar_width = 0.2
positions = np.arange(len(goal_counts))

# Home side: one bar per kick-off bucket, shifted by whole bar widths (-1, 0, +1, +2)
home_series = [
    (home_probs_noon, 'Home Team at noon'),
    (home_probs_af, 'Home Team afternoon'),
    (home_probs_night, 'Home Team at night'),
    (home_probs, 'Home Team avg'),
]
for slot, (probs, label) in enumerate(home_series, start=-1):
    plt.bar(positions + slot * bar_width, probs, width=bar_width, label=label)

plt.xlabel('Number of Goals')
plt.ylabel('Probability')
plt.title('Poisson Distribution of Goals (Home Team)')
plt.xticks(positions, goal_counts)
plt.legend(loc='upper right')
plt.show()


# Away side: same layout as the home figure
away_series = [
    (away_probs_noon, 'Away Team at noon'),
    (away_probs_af, 'Away Team afternoon'),
    (away_probs_night, 'Away Team at night'),
    (away_probs, 'Away Team avg'),
]
for slot, (probs, label) in enumerate(away_series, start=-1):
    plt.bar(positions + slot * bar_width, probs, width=bar_width, label=label)

plt.xlabel('Number of Goals')
plt.ylabel('Probability')
plt.title('Distribution of Goals (Away Team)')
plt.xticks(positions, goal_counts)
plt.legend(loc='upper right')
plt.show()


### Compare Weibull distribution and Poisson distribution

In [None]:
# Weibull count probabilities for the home side at hand-picked rate and shape values
l_home = 1.50
c_home = 1.06
results_home = [[goals, weibull(goals, c_home, l_home, t=1)] for goals in goal_counts]

print(results_home)

# Split the [goals, probability] pairs into plotting vectors
x_home = [pair[0] for pair in results_home]
y_home = [pair[1] for pair in results_home]


In [None]:
# Weibull count probabilities for the away side at hand-picked rate and shape values
l_away = 1.10
c_away = 0.85
results_away = [[goals, weibull(goals, c_away, l_away, t=1)] for goals in goal_counts]

print(results_away)

# Split the [goals, probability] pairs into plotting vectors
x_away = [pair[0] for pair in results_away]
y_away = [pair[1] for pair in results_away]


In [None]:
# Bar width and the x position of each goal count
bar_width = 0.15
positions = np.arange(len(goal_counts))

# Home side: five bars per goal count, centred on the tick (offsets -2 .. +2 bar widths)
home_series = [
    (home_probs_noon, 'Home Team at noon'),
    (home_probs_af, 'Home Team afternoon'),
    (home_probs_night, 'Home Team at night'),
    (home_probs, 'Home Team Poisson'),
    (y_home, 'Home Team Weibull'),
]
for slot, (probs, label) in enumerate(home_series, start=-2):
    plt.bar(positions + slot * bar_width, probs, width=bar_width, label=label)

plt.xlabel('Number of Goals')
plt.ylabel('Probability')
# The figure shows both families, so the title names both
plt.title('Poisson vs Weibull Distribution of Goals (Home Team)')
plt.xticks(positions, goal_counts)
plt.legend(loc='upper right')
plt.show()


# Away side: same layout as the home figure
away_series = [
    (away_probs_noon, 'Away Team at noon'),
    (away_probs_af, 'Away Team afternoon'),
    (away_probs_night, 'Away Team at night'),
    (away_probs, 'Away Team Poisson'),
    (y_away, 'Away Team Weibull'),
]
for slot, (probs, label) in enumerate(away_series, start=-2):
    plt.bar(positions + slot * bar_width, probs, width=bar_width, label=label)

plt.xlabel('Number of Goals')
plt.ylabel('Probability')
plt.title('Poisson vs Weibull Distribution of Goals (Away Team)')
plt.xticks(positions, goal_counts)
plt.legend(loc='upper right')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the matches; the fixed random_state makes the split repeatable across runs.
train,test = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# The model draws its starting coefficients from NumPy's global RNG; seed it so the fit is repeatable.
np.random.seed(42)
w = BivariateWeibullModel()
w.fit(train, learning_rate=0.01, test=test)

In [None]:
# Teams ranked by fitted alpha, largest first (alpha is added to the team's own scoring rate).
w.team_strength.sort_values("alpha", ascending=False)

In [None]:
# Teams ranked by fitted beta, smallest first (beta is added to the opponent's scoring rate).
w.team_strength.sort_values("beta", ascending=True)

In [None]:
# NOTE(review): _test is not referenced again in the visible cells.
_test = test.copy()
# Append the model's "under" and "over" probability columns to each split (joined on the index).
TRAIN = train.join(w.predict_under_and_over(train))
TEST = test.join(w.predict_under_and_over(test))

In [None]:
def _add_betting_columns(frame):
    """Add outcome, pick, Kelly stake and profit columns to ``frame`` in place.

    Columns written:

    * ``over_2.5_true``  - the match actually had more than 2.5 goals
    * ``over_pred``      - the model rates "over" as more likely than "under"
    * ``over_bet_pred``  - the bookmaker prices "over" shorter than "under"
    * ``Kelly_fr``       - Kelly fraction of the bankroll staked on the model's pick
    * ``ROI``            - profit of that bet as a fraction of the bankroll

    Assumes the B365C columns hold decimal odds (stake included), so the net
    payout per unit staked is ``odds - 1``.
    """
    frame["over_2.5_true"] = frame["FTHG"] + frame["FTAG"] > 2.5
    frame["over_pred"] = frame["over"] > frame["under"]
    frame["over_bet_pred"] = frame["B365C>2.5"] < frame["B365C<2.5"]

    # Probability and net odds of the side the model backs.
    backs_over = frame["over_pred"]
    win_prob = np.where(backs_over, frame["over"], frame["under"])
    net_odds = np.where(backs_over, frame["B365C>2.5"], frame["B365C<2.5"]) - 1

    # Kelly: f = (p*b - q) / b with b the NET odds. A negative f means the bet
    # has no edge, so nothing is staked.
    kelly = (win_prob * net_odds - (1 - win_prob)) / net_odds
    frame["Kelly_fr"] = np.clip(kelly, 0, None)

    # The bet pays out when the pick matches the real result, not when it
    # merely agrees with the bookmaker's favourite.
    bet_won = frame["over_pred"] == frame["over_2.5_true"]
    frame["ROI"] = np.where(bet_won, frame["Kelly_fr"] * net_odds, -frame["Kelly_fr"])


_add_betting_columns(TRAIN)
_add_betting_columns(TEST)


In [None]:
# Keep only matches where the model's larger probability (under or over) exceeds 0.7.
TEST_Best = TEST[TEST[["under", "over"]].max(axis=1) > 0.7]
TRAIN_Best = TRAIN[TRAIN[["under", "over"]].max(axis=1) > 0.7]

In [None]:
from sklearn.metrics import precision_recall_fscore_support
# Precision, recall, F-score and support of the model's over/under pick on the training split.
precision_recall_fscore_support(TRAIN["over_2.5_true"], TRAIN["over_pred"])

In [None]:
# Same metrics on the held-out split.
precision_recall_fscore_support(TEST["over_2.5_true"], TEST["over_pred"])

In [None]:
# Inspect the held-out matches with predictions and betting columns.
TEST

In [None]:
# Inspect the training matches with predictions and betting columns.
TRAIN

In [None]:
# Training matches where the model's over/under pick matched the actual outcome.
TRAIN_right = TRAIN[TRAIN["over_2.5_true"] == TRAIN["over_pred"]]

In [None]:
# Inspect the correctly predicted training matches.
TRAIN_right

In [None]:
# Mean of the per-match ROI column on the training split, as a percentage.
TRAIN["ROI"].mean()*100

In [None]:
# Sum of ROI times Kelly fraction over the correctly predicted training matches.
# NOTE(review): ROI is already scaled by Kelly_fr where it is computed, so this
# product weights each match by its stake a second time — confirm that is intended.
(TRAIN_right["ROI"]*TRAIN_right["Kelly_fr"]).sum()

In [None]:
# Average Kelly fraction on the training split, as a percentage.
TRAIN["Kelly_fr"].mean()*100

In [None]:
# Mean of the per-match ROI column on the held-out split, as a percentage.
TEST["ROI"].mean()*100

In [None]:
# Inspect the training matches where the model's larger probability exceeds 0.7.
TRAIN_Best

In [None]:
# Mean ROI restricted to the high-confidence training matches, as a percentage.
TRAIN_Best["ROI"].mean()*100

In [None]:
# Precision, recall, F-score and support on the high-confidence training matches only.
precision_recall_fscore_support(TRAIN_Best["over_2.5_true"], TRAIN_Best["over_pred"])