In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datetime import datetime
import concurrent.futures
import warnings
warnings.filterwarnings("ignore")

In [2]:
def parse_date(date_str):
    """Parse a day-first date string written with a 4- or 2-digit year.

    The formats ``%d/%m/%Y`` and ``%d/%m/%y`` are tried in that order.

    Parameters
    ----------
    date_str
        Raw value taken from the 'Date' column.

    Returns
    -------
    Whatever ``pd.to_datetime`` returns for the first format that matches,
    or ``None`` when neither format can parse the value.
    """
    # Add more formats to this tuple as needed.
    for date_format in ('%d/%m/%Y', '%d/%m/%y'):
        try:
            return pd.to_datetime(date_str, format=date_format)
        except (ValueError, TypeError):
            # Only parsing failures are swallowed; anything else (for
            # example KeyboardInterrupt) must still propagate.
            continue
    # Handle any other cases or invalid dates here
    return None

In [3]:
# Season window: the download loop iterates over range(start, end).
start = 2022
end = 2024

# League display name -> division code used in the football-data.co.uk URLs.
dict_countries = {
    "Spanish La Liga": "SP1",
    "Spanish Segunda Division": "SP2",
    "German Bundesliga": "D1",
    "German Bundesliga 2": "D2",
    "Italian Serie A": "I1",
    "Italian Serie B": "I2",
    "English Premier League": "E0",
    "English Championship": "E1",
    "English League 1": "E2",
    "English League 2": "E3",
    "English National": "EC",
    "French Ligue 1": "F1",
    "French Ligue 2": "F2",
    "Dutch Eredivisie": "N1",
    "Belgian First Division A": "B1",
    "Portuguese Primeira Liga": "P1",
    "Turkish Super League": "T1",
    "Greek Super League": "G1",
    "Scottish Premier League": "SC0",
    "Scottish League1": "SC1",
    "Scottish League2": "SC2",
    "Scottish League3": "SC3",
}

def download_data(league, code, year):
    """Download one league-season CSV from football-data.co.uk.

    Parameters
    ----------
    league : str
        League display name; stored in the ``league_name`` column.
    code : str
        Division code used in the URL (for example ``"E0"``).
    year : int
        Year used to build the season folder (``year``/``year + 1``);
        stored in the ``season`` column.

    Returns
    -------
    pandas.DataFrame or None
        The downloaded frame with ``season`` and ``league_name`` added, or
        ``None`` when both download attempts fail.
    """
    try:
        # Season folder is the last two digits of both years, e.g. 2022 -> "2223".
        url = f"https://www.football-data.co.uk/mmz4281/{str(year)[-2:]}{str(year+1)[-2:].zfill(2)}/{code}.csv"
        df = pd.read_csv(url, encoding='windows-1252')
    except Exception:
        # Best-effort: any download/parse failure falls through to a second
        # attempt. `Exception` (not a bare except) keeps Ctrl-C working.
        try:
            # NOTE(review): this retry reads the *previous* season's folder but
            # the rows are still tagged with `year`, and sep='delimiter' is a
            # literal multi-character separator that presumably leaves each
            # line in a single column - confirm this fallback is intended.
            url = f"https://www.football-data.co.uk/mmz4281/{str(year-1)[-2:]}{str(year)[-2:].zfill(2)}/{code}.csv"
            df = pd.read_csv(url, encoding='unicode_escape', sep='delimiter')
        except Exception:
            return None
    df['season'] = year
    df['league_name'] = league  # Store the league name in a new column
    return df

dict_historical_data = {}

# One download job per (league, season); built up front so the thread pool can
# be sized to the real workload instead of an arbitrary number.
download_jobs = [
    (league, code, year)
    for league, code in dict_countries.items()
    for year in range(start, end)
]

# Download data frames in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(32, len(download_jobs)))) as executor:
    futures = [executor.submit(download_data, *job) for job in download_jobs]

    for future in concurrent.futures.as_completed(futures):
        season_df = future.result()
        # Skip failed downloads (None) and empty files, which have no first
        # row to read the league name from.
        if season_df is None or season_df.empty:
            continue
        league_name = season_df['league_name'].iloc[0]
        dict_historical_data.setdefault(league_name, []).append(season_df)

if not dict_historical_data:
    raise ValueError("No league data could be downloaded; check network access and the season range.")

# Concatenate dataframes from different leagues
eu_leagues = pd.concat(
    [season_df for frames in dict_historical_data.values() for season_df in frames],
    ignore_index=True,
)

to_keep = [
    'Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
    'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA',
    'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA',
    'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA',
    'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5',
    'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA',
]

# .copy() gives an independent frame, so later column assignments on
# all_leagues do not write into a slice of eu_leagues.
all_leagues = eu_leagues[to_keep].dropna(how='all').copy()
# index=False keeps the row index out of the file, matching the later re-save.
all_leagues.to_csv('footballdata.csv', index=False)

In [4]:
# The 'Date' strings may use inconsistent formats, so every value is routed
# through parse_date before being written back.
parsed_dates = all_leagues['Date'].apply(parse_date)
all_leagues['Date'] = parsed_dates

# Persist the table without the index column, then display it.
all_leagues.to_csv('footballdata.csv', index=False)
all_leagues

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,B365H,B365D,...,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA
0,I1,2023-08-19,17:30,Empoli,Verona,0,1,A,2.25,3.20,...,1.74,-0.25,1.97,1.93,1.96,1.94,2.00,1.97,1.95,1.90
1,I1,2023-08-19,17:30,Frosinone,Napoli,1,3,A,8.50,5.00,...,2.20,1.50,1.84,2.06,1.83,2.09,1.89,2.11,1.83,2.03
2,I1,2023-08-19,19:45,Genoa,Fiorentina,1,4,A,3.40,3.25,...,1.77,0.25,2.00,1.90,2.00,1.91,2.02,1.92,1.96,1.88
3,I1,2023-08-19,19:45,Inter,Monza,2,0,H,1.40,5.00,...,2.41,-1.25,1.83,2.07,1.82,2.07,1.86,2.10,1.82,2.03
4,I1,2023-08-20,17:30,Roma,Salernitana,2,2,D,1.53,4.00,...,1.75,-1.00,1.93,1.97,1.93,1.98,1.96,2.04,1.92,1.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8981,T1,2023-06-07,18:00,Besiktas,Konyaspor,3,3,D,1.36,5.50,...,2.66,-1.50,2.05,1.75,2.07,1.85,2.12,1.85,2.03,1.79
8982,T1,2023-06-07,18:00,Fenerbahce,Gaziantep,3,0,H,,,...,,,,,,,,,,
8983,T1,2023-06-07,18:00,Giresunspor,Antalyaspor,2,0,H,1.80,3.75,...,2.36,-0.50,1.88,1.98,1.88,1.97,1.94,2.02,1.87,1.96
8984,T1,2023-06-07,18:00,Hatayspor,Galatasaray,0,3,A,,,...,,,,,,,,,,


In [5]:
# Reload the saved matches, keeping only the identification and full-time
# result columns; rows in which all of those are missing are dropped.
result_columns = ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
saved_matches = pd.read_csv('footballdata.csv')
df = saved_matches[result_columns].dropna(how='all')

In [6]:
# Upcoming fixtures. Rows that are entirely empty are removed *in the stored
# frame* (previously the dropna result was only displayed and then discarded),
# so the prediction loops never see a fixture without team names.
fixtures_df = pd.read_csv('https://www.football-data.co.uk/fixtures.csv').dropna(how='all')
fixtures_df

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,B1,22/09/2023,19:45,Standard,Westerlo,,,,,,...,,,,,,,,,,
1,B1,23/09/2023,15:00,Charleroi,Kortrijk,,,,,,...,,,,,,,,,,
2,B1,23/09/2023,17:15,Mechelen,Oud-Heverlee Leuven,,,,,,...,,,,,,,,,,
3,B1,23/09/2023,19:45,Antwerp,RWD Molenbeek,,,,,,...,,,,,,,,,,
4,B1,24/09/2023,12:30,Genk,St Truiden,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,T1,24/09/2023,15:00,Ankaragucu,Konyaspor,,,,,,...,,,,,,,,,,
195,T1,24/09/2023,15:00,Besiktas,Kayserispor,,,,,,...,,,,,,,,,,
196,T1,24/09/2023,18:00,Alanyaspor,Fenerbahce,,,,,,...,,,,,,,,,,
197,T1,24/09/2023,18:00,Kasimpasa,Ad. Demirspor,,,,,,...,,,,,,,,,,


In [7]:
def main():
    """Train every configured model on the saved match data.

    Reads 'footballdata.csv', drops every row that contains a missing value
    and hands the resulting frame (plus the pandas module) to each trainer in
    turn, printing a banner before each group of models.
    """
    warnings.filterwarnings("ignore")
    # Load the data
    df = pd.read_csv('footballdata.csv').dropna()

    def announce(title):
        # Section banner: title line followed by a horizontal rule.
        print(title)
        print("___________________________________________________________________________")

    announce("training DC_Models")
    for model_name in ('1X_Model', 'X2_Model', '12_Model'):
        DC_Model(model_name, df, pd)

    announce("training Goals Models")
    for model_name in ('Goals_Model_35', 'Goals_Model_45'):
        Goals_model(model_name, df, pd)

    announce("training Corners_HT_BTTS_model")
    Corners_HT_BTTS_model('GG_2.5_Model', df, pd)

    announce("training TR_Model")
    TR_Model('FTR_Model', df, pd)

In [8]:
def DC_Model(model_t,dataframe,pd):
    """Train a double-chance random forest and export fixture predictions.

    Parameters
    ----------
    model_t : str
        ``'1X_Model'`` (home win or draw), ``'X2_Model'`` (away win or draw)
        or ``'12_Model'`` (no draw).
    dataframe : pandas.DataFrame
        Historical matches with at least 'Div', 'Date', 'HomeTeam',
        'AwayTeam', 'FTHG' and 'FTAG'. The frame is copied, not modified.
        Rows are expected to have non-missing goal counts.
    pd : module
        The pandas module (kept as a parameter for interface compatibility).

    Side effects
    ------------
    Prints validation and test accuracy, reads the module-level
    ``fixtures_df`` and writes ``best_games_today(<model_t>).csv``.

    Raises
    ------
    ValueError
        If ``model_t`` is not one of the supported names.
    """
    # model_t -> (target column, label of class 1, label of class 0)
    settings_by_model = {
        '1X_Model': ('HD', '1X', 'A'),
        'X2_Model': ('AD', 'X2', 'H'),
        '12_Model': ('HA', '12', 'D'),
    }
    if model_t not in settings_by_model:
        raise ValueError(
            "Unknown model_t %r; expected one of %s" % (model_t, sorted(settings_by_model)))
    Over_t, win_label, lose_label = settings_by_model[model_t]
    key_columns = ['Div', 'HomeTeam', 'AwayTeam']

    # Work on a copy: the caller hands the same frame to several trainers.
    df = dataframe.copy()

    # Create the double-chance target
    if model_t == '1X_Model':
        bet_wins = df['FTHG'] >= df['FTAG']
    elif model_t == 'X2_Model':
        bet_wins = df['FTHG'] <= df['FTAG']
    else:
        bet_wins = df['FTHG'] != df['FTAG']
    df[Over_t] = bet_wins.map({True: win_label, False: lose_label})

    # Feature engineering: how often each outcome occurred for this pairing.
    # NOTE(review): the counts are computed on the full history before the
    # train/validation/test split and include each row's own outcome, so the
    # printed accuracies are likely optimistic.
    outcome_counts = df.groupby(key_columns + [Over_t]).size().reset_index(name='count')
    outcome_counts = outcome_counts.pivot_table(index=key_columns, columns=Over_t, values='count',
                                                fill_value=0).reset_index()
    # Name every count column after the label it actually counts (in pivot
    # order) instead of assigning fixed names by position.
    label_columns = [col for col in outcome_counts.columns if col not in key_columns]
    count_columns = ['%s_count' % label for label in label_columns]
    outcome_counts = outcome_counts.rename(columns=dict(zip(label_columns, count_columns)))
    outcome_counts['avg'] = outcome_counts[count_columns].mean(axis=1)
    df = pd.merge(df, outcome_counts, on=key_columns, how='left')

    df[Over_t] = df[Over_t].map({win_label: 1, lose_label: 0})

    # Splitting data (60% train, 20% validation, 20% test)
    X = df.drop(columns=[Over_t, 'Date'])
    Y = df[Over_t]
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.4, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

    # Preprocessing pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num2', KNNImputer(), ['avg'] + count_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), key_columns)
        ])

    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_preprocessed, Y_train)

    # Validate the model and print metrics
    Y_val_pred = rf_model.predict(X_val_preprocessed)
    print("Double Chance validation/accuracy")
    print("Validation Accuracy:", accuracy_score(Y_val, Y_val_pred))
    # Test the model and print metrics
    Y_test_pred = rf_model.predict(X_test_preprocessed)
    print("Test Accuracy:", accuracy_score(Y_test, Y_test_pred))

    # Prediction function
    def predict_match(HomeTeam, AwayTeam, additional_features):
        """Return (predicted class, class probabilities) for one fixture."""
        input_data = pd.DataFrame({
            'HomeTeam': [HomeTeam],
            'AwayTeam': [AwayTeam],
            **additional_features
        })
        # 'Div' and the count features come from the historical pairing.
        input_data = pd.merge(input_data, outcome_counts, on=['HomeTeam', 'AwayTeam'], how='left')
        input_data_preprocessed = preprocessor.transform(input_data)
        prediction = rf_model.predict(input_data_preprocessed)
        probabilities = rf_model.predict_proba(input_data_preprocessed)
        return prediction[0], probabilities[0]

    predictions = []
    for _, row in fixtures_df.iterrows():
        HomeTeam_fixture = row['HomeTeam']
        AwayTeam_fixture = row['AwayTeam']
        additional_features_fixture = row.drop(labels=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam']).to_dict()
        predicted_outcome_fixture, predicted_probabilities_fixture = predict_match(
            HomeTeam_fixture, AwayTeam_fixture, additional_features_fixture)
        predictions.append(
            (HomeTeam_fixture, AwayTeam_fixture, predicted_outcome_fixture, *predicted_probabilities_fixture))

    # Save the predictions to a DataFrame. The two probability columns follow
    # the class order 0 (lose_label), 1 (win_label).
    prob_lose_column = 'Prob_%s' % lose_label
    prob_win_column = 'Prob_%s' % win_label
    predictions_df = pd.DataFrame(predictions,
                                  columns=['HomeTeam', 'AwayTeam', 'PredictedOutcome',
                                           prob_lose_column, prob_win_column])

    predictions_df['PredictedOutcome'] = predictions_df['PredictedOutcome'].map({1: win_label, 0: lose_label})

    # Assuming "HomeTeam" and "AwayTeam" are the reference columns
    columns_to_copy = ['Date', 'Time', 'Div', 'B365H', 'B365D', 'B365A']

    merged_df = predictions_df.merge(fixtures_df, on=['HomeTeam', 'AwayTeam'], how='left')

    # Update the original predictions_df with the merged columns
    predictions_df[columns_to_copy] = merged_df[columns_to_copy]

    predictions_df['Prob'] = predictions_df[[prob_lose_column, prob_win_column]].max(axis=1)

    predictions_df = predictions_df[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob','B365H', 'B365D', 'B365A']]
    # NOTE(review): 'Prob' is the larger of two class probabilities, so with
    # this threshold every fixture is kept - confirm the intended cut-off.
    thres = 0.1
    best_games = predictions_df[(predictions_df['Prob'] >= thres)]

    # Save to CSV file
    best_games.to_csv('best_games_today(' + model_t + ').csv', index=False)

In [9]:
def Corners_HT_BTTS_model(model_t,dataframe,pd):
    """Train a corners / half-time-goals / BTTS random forest and export picks.

    Parameters
    ----------
    model_t : str
        ``'Corners_Model_75' | '_85' | '_95'`` (needs 'HC' and 'AC'),
        ``'HT_Goal_model_05' | '_15' | '_25'`` (needs 'HTHG' and 'HTAG') or
        ``'GG_2.5_Model'`` (both teams score and more than two goals; needs
        'FTHG' and 'FTAG').
    dataframe : pandas.DataFrame
        Historical matches with 'Div', 'Date', 'HomeTeam', 'AwayTeam' and the
        columns required by the chosen model. The frame is copied, not
        modified.
    pd : module
        The pandas module (kept as a parameter for interface compatibility).

    Side effects
    ------------
    Prints validation and test accuracy, reads the module-level
    ``fixtures_df`` and writes ``best_games_today(<model_t>).csv``.

    Raises
    ------
    ValueError
        If ``model_t`` is not one of the supported names.
    """
    # model_t -> (target column, numeric line, model family)
    settings_by_model = {
        'Corners_Model_75': ('Over_7.5', 7, 'Corners'),
        'Corners_Model_85': ('Over_8.5', 8, 'Corners'),
        'Corners_Model_95': ('Over_9.5', 9, 'Corners'),
        'GG_2.5_Model': ('GG_2.5_Model', 0, 'BTTS'),
        'HT_Goal_model_05': ('Over_0.5', 0, 'HT_Goal'),
        'HT_Goal_model_15': ('Over_1.5', 1, 'HT_Goal'),
        'HT_Goal_model_25': ('Over_2.5', 2, 'HT_Goal'),
    }
    if model_t not in settings_by_model:
        raise ValueError(
            "Unknown model_t %r; expected one of %s" % (model_t, sorted(settings_by_model)))
    Over_t, value_t, modeltype = settings_by_model[model_t]
    key_columns = ['Div', 'HomeTeam', 'AwayTeam']

    # Work on a copy: the caller hands the same frame to several trainers.
    df = dataframe.copy()

    # Create the target variable
    if modeltype == 'Corners':
        is_positive = df['HC'] + df['AC'] > value_t
        positive_label, negative_label = 'O', 'U'
    elif modeltype == 'BTTS':
        # Both teams scored (product > 0) and the total exceeds value_t + 2.
        is_positive = (df['FTHG'] * df['FTAG'] > value_t) & (df['FTHG'] + df['FTAG'] > value_t + 2)
        positive_label, negative_label = 'GG_25', 'NG_25'
    else:
        is_positive = df['HTHG'] + df['HTAG'] > value_t
        positive_label, negative_label = 'O', 'U'
    df[Over_t] = is_positive.map({True: positive_label, False: negative_label})

    # Feature engineering: how often each outcome occurred for this pairing.
    # NOTE(review): the counts are computed on the full history before the
    # train/validation/test split and include each row's own outcome, so the
    # printed accuracies are likely optimistic.
    outcome_counts = df.groupby(key_columns + [Over_t]).size().reset_index(name='count')
    outcome_counts = outcome_counts.pivot_table(index=key_columns, columns=Over_t, values='count',
                                                fill_value=0).reset_index()
    # Name every count column after the label it actually counts (in pivot
    # order) instead of assigning fixed names by position.
    label_columns = [col for col in outcome_counts.columns if col not in key_columns]
    count_columns = ['%s_count' % label for label in label_columns]
    outcome_counts = outcome_counts.rename(columns=dict(zip(label_columns, count_columns)))
    outcome_counts['avg'] = outcome_counts[count_columns].mean(axis=1)
    df = pd.merge(df, outcome_counts, on=key_columns, how='left')

    df[Over_t] = df[Over_t].map({positive_label: 1, negative_label: 0})

    # Splitting data (60% train, 20% validation, 20% test)
    X = df.drop(columns=[Over_t, 'Date'])
    Y = df[Over_t]
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.4, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

    # Preprocessing pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num2', KNNImputer(), ['avg'] + count_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), key_columns)
        ])

    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_preprocessed, Y_train)

    # Validate the model and print metrics
    Y_val_pred = rf_model.predict(X_val_preprocessed)
    print("Corners validation/accuracy")
    print("Validation Accuracy:", accuracy_score(Y_val, Y_val_pred))
    # Test the model and print metrics
    Y_test_pred = rf_model.predict(X_test_preprocessed)
    print("Test Accuracy:", accuracy_score(Y_test, Y_test_pred))

    # Prediction function
    def predict_match(HomeTeam, AwayTeam, additional_features):
        """Return (predicted class, class probabilities) for one fixture."""
        input_data = pd.DataFrame({
            'HomeTeam': [HomeTeam],
            'AwayTeam': [AwayTeam],
            **additional_features
        })
        # 'Div' and the count features come from the historical pairing.
        input_data = pd.merge(input_data, outcome_counts, on=['HomeTeam', 'AwayTeam'], how='left')
        input_data_preprocessed = preprocessor.transform(input_data)
        prediction = rf_model.predict(input_data_preprocessed)
        probabilities = rf_model.predict_proba(input_data_preprocessed)
        return prediction[0], probabilities[0]

    predictions = []
    for _, row in fixtures_df.iterrows():
        HomeTeam_fixture = row['HomeTeam']
        AwayTeam_fixture = row['AwayTeam']
        additional_features_fixture = row.drop(labels=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam']).to_dict()
        predicted_outcome_fixture, predicted_probabilities_fixture = predict_match(
            HomeTeam_fixture, AwayTeam_fixture, additional_features_fixture)
        predictions.append(
            (HomeTeam_fixture, AwayTeam_fixture, predicted_outcome_fixture, *predicted_probabilities_fixture))

    # Save the predictions to a DataFrame. The two probability columns follow
    # the class order 0 (negative_label), 1 (positive_label).
    prob_negative_column = 'Prob_%s' % negative_label
    prob_positive_column = 'Prob_%s' % positive_label
    predictions_df = pd.DataFrame(predictions,
                                  columns=['HomeTeam', 'AwayTeam', 'PredictedOutcome',
                                           prob_negative_column, prob_positive_column])

    predictions_df['PredictedOutcome'] = predictions_df['PredictedOutcome'].map(
        {1: positive_label, 0: negative_label})

    # Assuming "HomeTeam" and "AwayTeam" are the reference columns
    columns_to_copy = ['Date', 'Time', 'Div', 'B365<2.5', 'B365>2.5']

    merged_df = predictions_df.merge(fixtures_df, on=['HomeTeam', 'AwayTeam'], how='left')

    # Update the original predictions_df with the merged columns
    predictions_df[columns_to_copy] = merged_df[columns_to_copy]

    predictions_df['Prob'] = predictions_df[[prob_negative_column, prob_positive_column]].max(axis=1)

    # NOTE(review): the odds always come from the B365 over/under 2.5 goals
    # columns, whatever market this model predicts; for the BTTS model the
    # outcome is never 'U', so 'B365>2.5' is always used - confirm intent.
    predictions_df['Odds'] = predictions_df.apply(
        lambda row: row['B365<2.5'] if row['PredictedOutcome'] == 'U' else row['B365>2.5'], axis=1)
    predictions_df = predictions_df[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Odds', 'Prob']]
    # Keep only fixtures whose most likely class has probability >= 70%.
    thres = 0.7
    best_games = predictions_df[(predictions_df['Prob'] >= thres)]

    # Save to CSV file
    best_games.to_csv('best_games_today(' + model_t + ').csv', index=False)

In [10]:
def Goals_model(model_t,dataframe,pd):
    """Train an over/under total-goals random forest and export confident picks.

    Parameters
    ----------
    model_t : str
        ``'Goals_Model_05'``, ``'_15'``, ``'_25'``, ``'_35'`` or ``'_45'``;
        the suffix selects the goal line (0.5 ... 4.5).
    dataframe : pandas.DataFrame
        Historical matches with 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG'
        and 'FTAG'. The frame is copied, not modified.
    pd : module
        The pandas module (kept as a parameter for interface compatibility).

    Side effects
    ------------
    Prints validation and test accuracy, reads the module-level
    ``fixtures_df`` and writes ``best_games_today(<model_t>).csv``.

    Raises
    ------
    ValueError
        If ``model_t`` is not one of the supported names.
    """
    # model_t -> (target column, number of goals that must be exceeded)
    settings_by_model = {
        'Goals_Model_05': ('Over_0.5', 0),
        'Goals_Model_15': ('Over_1.5', 1),
        'Goals_Model_25': ('Over_2.5', 2),
        'Goals_Model_35': ('Over_3.5', 3),
        'Goals_Model_45': ('Over_4.5', 4),
    }
    if model_t not in settings_by_model:
        raise ValueError(
            "Unknown model_t %r; expected one of %s" % (model_t, sorted(settings_by_model)))
    Over_t, value_t = settings_by_model[model_t]
    key_columns = ['Div', 'HomeTeam', 'AwayTeam']

    # Work on a copy: the caller hands the same frame to several trainers.
    df = dataframe.copy()

    # Create the Over target variable ('O' = over the line, 'U' = under)
    is_over = df['FTHG'] + df['FTAG'] > value_t
    df[Over_t] = is_over.map({True: 'O', False: 'U'})

    # Feature engineering: how often each outcome occurred for this pairing.
    # NOTE(review): the counts are computed on the full history before the
    # train/validation/test split and include each row's own outcome, so the
    # printed accuracies are likely optimistic.
    outcome_counts = df.groupby(key_columns + [Over_t]).size().reset_index(name='count')
    outcome_counts = outcome_counts.pivot_table(index=key_columns, columns=Over_t, values='count',
                                                fill_value=0).reset_index()
    # Name every count column after the label it actually counts (in pivot
    # order) instead of assigning fixed names by position.
    label_columns = [col for col in outcome_counts.columns if col not in key_columns]
    count_columns = ['%s_count' % label for label in label_columns]
    outcome_counts = outcome_counts.rename(columns=dict(zip(label_columns, count_columns)))
    outcome_counts['avg'] = outcome_counts[count_columns].mean(axis=1)
    df = pd.merge(df, outcome_counts, on=key_columns, how='left')
    df[Over_t] = df[Over_t].map({'O': 1, 'U': 0})

    # Splitting data (60% train, 20% validation, 20% test)
    X = df.drop(columns=[Over_t, 'Date'])
    Y = df[Over_t]
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.4, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

    # Preprocessing pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num2', KNNImputer(), ['avg'] + count_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), key_columns)
        ])

    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_preprocessed, Y_train)

    # Validate the model and print metrics
    Y_val_pred = rf_model.predict(X_val_preprocessed)
    print("Goals validation/accuracy")
    print("Validation Accuracy:", accuracy_score(Y_val, Y_val_pred))

    # Test the model and print metrics
    Y_test_pred = rf_model.predict(X_test_preprocessed)
    print("Test Accuracy:", accuracy_score(Y_test, Y_test_pred))

    # Prediction function
    def predict_match(HomeTeam, AwayTeam, additional_features):
        """Return (predicted class, class probabilities) for one fixture."""
        input_data = pd.DataFrame({
            'HomeTeam': [HomeTeam],
            'AwayTeam': [AwayTeam],
            **additional_features
        })
        # 'Div' and the count features come from the historical pairing.
        input_data = pd.merge(input_data, outcome_counts, on=['HomeTeam', 'AwayTeam'], how='left')
        input_data_preprocessed = preprocessor.transform(input_data)
        prediction = rf_model.predict(input_data_preprocessed)
        probabilities = rf_model.predict_proba(input_data_preprocessed)
        return prediction[0], probabilities[0]

    predictions = []
    for _, row in fixtures_df.iterrows():
        HomeTeam_fixture = row['HomeTeam']
        AwayTeam_fixture = row['AwayTeam']
        additional_features_fixture = row.drop(labels=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam']).to_dict()
        predicted_outcome_fixture, predicted_probabilities_fixture = predict_match(
            HomeTeam_fixture, AwayTeam_fixture, additional_features_fixture)
        predictions.append(
            (HomeTeam_fixture, AwayTeam_fixture, predicted_outcome_fixture, *predicted_probabilities_fixture))

    # Save the predictions to a DataFrame. The two probability columns follow
    # the class order 0 ('U'), 1 ('O').
    predictions_df = pd.DataFrame(predictions,
                                  columns=['HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob_U', 'Prob_O'])

    predictions_df['PredictedOutcome'] = predictions_df['PredictedOutcome'].map({1: 'O', 0: 'U'})

    # Assuming "HomeTeam" and "AwayTeam" are the reference columns
    columns_to_copy = ['Date', 'Time', 'Div', 'B365<2.5', 'B365>2.5']

    merged_df = predictions_df.merge(fixtures_df, on=['HomeTeam', 'AwayTeam'], how='left')

    # Update the original predictions_df with the merged columns
    predictions_df[columns_to_copy] = merged_df[columns_to_copy]

    predictions_df['Prob'] = predictions_df[['Prob_U', 'Prob_O']].max(axis=1)

    # NOTE(review): the odds always come from the B365 over/under 2.5 columns,
    # even when this model predicts a different goal line - confirm intent.
    predictions_df['Odds'] = predictions_df.apply(
        lambda row: row['B365<2.5'] if row['PredictedOutcome'] == 'U' else row['B365>2.5'], axis=1)
    predictions_df = predictions_df[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Odds', 'Prob']]
    # Keep only fixtures whose most likely class has probability >= 90%.
    thres = 0.9
    best_games = predictions_df[(predictions_df['Prob'] >= thres)]

    # Save to CSV file
    best_games.to_csv('best_games_today(' + model_t + ').csv', index=False)


In [11]:
def TR_Model(model_t,dataframe,pd):
    """Train a three-way (home/draw/away) random forest and export confident picks.

    Parameters
    ----------
    model_t : str
        ``'FTR_Model'`` uses the stored 'FTR' column; ``'HTR_Model'`` derives
        the result from 'HTHG'/'HTAG'; ``'CKR_Model'`` derives it from
        'HC'/'AC'. The derived variants need those columns in the CSV.
    dataframe : pandas.DataFrame
        Not used: the data is always reloaded from 'footballdata.csv'. The
        parameter is kept so existing callers keep working.
    pd : module
        The pandas module (kept as a parameter for interface compatibility).

    Side effects
    ------------
    Prints validation and test accuracy, reads the module-level
    ``fixtures_df``, calls the module-level ``predict_match`` and writes
    ``best_games_today(<model_t>).csv``.

    Raises
    ------
    ValueError
        If ``model_t`` is not one of the supported names.
    """
    # model_t -> (derived target column, home-side column, away-side column)
    derived_targets = {
        'HTR_Model': ('HTR', 'HTHG', 'HTAG'),
        'CKR_Model': ('CKR', 'HC', 'AC'),
    }
    if model_t == 'FTR_Model':
        sTR = 'FTR'
    elif model_t in derived_targets:
        sTR = derived_targets[model_t][0]
    else:
        raise ValueError(
            "Unknown model_t %r; expected one of %s"
            % (model_t, ['FTR_Model'] + sorted(derived_targets)))
    key_columns = ['Div', 'HomeTeam', 'AwayTeam']

    # Load the data
    df = pd.read_csv('footballdata.csv')
    if model_t in derived_targets:
        _, home_column, away_column = derived_targets[model_t]
        home_values, away_values = df[home_column], df[away_column]
        derived_result = (pd.Series('D', index=df.index)
                          .mask(home_values > away_values, 'H')
                          .mask(home_values < away_values, 'A'))
        # Leave the result missing when either input is missing, so the
        # dropna below removes the row instead of it counting as a draw.
        df[sTR] = derived_result.where(home_values.notna() & away_values.notna())

    # Preprocessing and feature engineering
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df[df['Date'] >= '2022-01-01']
    df = df.dropna(subset=[sTR])

    # NOTE(review): the counts are computed on the full history before the
    # train/validation/test split and include each row's own outcome, so the
    # printed accuracies are likely optimistic.
    outcome_counts = df.groupby(key_columns + [sTR]).size().reset_index(name='count')
    outcome_counts = outcome_counts.pivot_table(index=key_columns, columns=sTR, values='count',
                                                fill_value=0).reset_index()
    # Name every count column after the label it counts (A_count, D_count,
    # H_count when all three results occur) rather than by position.
    label_columns = [col for col in outcome_counts.columns if col not in key_columns]
    count_columns = ['%s_count' % label for label in label_columns]
    outcome_counts = outcome_counts.rename(columns=dict(zip(label_columns, count_columns)))
    outcome_counts['avg'] = outcome_counts[count_columns].mean(axis=1)
    df = pd.merge(df, outcome_counts, on=key_columns, how='left')
    target_mapping = {'H': 1, 'D': 0, 'A': 2}
    df[sTR] = df[sTR].map(target_mapping)

    # Splitting data (60% train, 20% validation, 20% test), stratified on the result
    X = df.drop(columns=[sTR, 'Date'])
    Y = df[sTR]
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.4, random_state=42, stratify=Y)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42, stratify=Y_temp)

    # Preprocessing pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num2', KNNImputer(), ['avg'] + count_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), key_columns)
        ])

    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_preprocessed, Y_train)

    # Validate the model and print metrics
    print("Winner validation/accuracy")
    Y_val_pred = rf_model.predict(X_val_preprocessed)
    print("Validation Accuracy:", accuracy_score(Y_val, Y_val_pred))

    # Test the model and print metrics
    Y_test_pred = rf_model.predict(X_test_preprocessed)
    print("Test Accuracy:", accuracy_score(Y_test, Y_test_pred))

    predictions = []
    for _, row in fixtures_df.iterrows():
        HomeTeam_fixture = row['HomeTeam']
        AwayTeam_fixture = row['AwayTeam']
        additional_features_fixture = row.drop(labels=['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam']).to_dict()
        predicted_outcome_fixture, predicted_probabilities_fixture = predict_match(
            HomeTeam_fixture, AwayTeam_fixture, additional_features_fixture,
            outcome_counts, preprocessor, rf_model)
        # Probabilities are indexed by encoded class: 1 = H, 0 = D, 2 = A.
        predictions.append(
            (HomeTeam_fixture, AwayTeam_fixture, predicted_outcome_fixture,
             predicted_probabilities_fixture[1],
             predicted_probabilities_fixture[0],
             predicted_probabilities_fixture[2]))

    # Save the predictions to a DataFrame
    predictions_df = pd.DataFrame(
        predictions,
        columns=['HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob_H', 'Prob_D', 'Prob_A'])

    # Assuming "HomeTeam" and "AwayTeam" are the reference columns
    columns_to_copy = ['Date', 'Time', 'Div','B365H', 'B365D', 'B365A']

    merged_df = predictions_df.merge(fixtures_df, on=['HomeTeam', 'AwayTeam'], how='left')

    # Update the original predictions_df with the merged columns
    predictions_df[columns_to_copy] = merged_df[columns_to_copy]

    # Bookmaker odds for the predicted result.
    predictions_df['Odds'] = predictions_df.apply(lambda row: row['B365H'] if row['PredictedOutcome'] == 'H' else (
        row['B365A'] if row['PredictedOutcome'] == 'A' else row['B365D']), axis=1)
    predictions_df['Prob'] = predictions_df[['Prob_H', 'Prob_D', 'Prob_A']].max(axis=1)
    predictions_df = predictions_df[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Odds', 'Prob']]

    # Keep only fixtures whose most likely result has probability >= 90%.
    upper_thres = 0.9
    best_games2 = predictions_df[(predictions_df['Prob'] >= upper_thres)]
    best_games2.to_csv('best_games_today(' + model_t + ').csv', index=False)

In [12]:
# Prediction function
def predict_match(HomeTeam, AwayTeam, additional_features,outcome_counts,preprocessor,rf_model):
    """Predict the outcome of a single fixture.

    A one-row frame is assembled from the two team names plus every entry of
    ``additional_features`` (a mapping of extra column names to values), then
    left-joined with ``outcome_counts`` on ``HomeTeam``/``AwayTeam``. The row is
    passed through ``preprocessor.transform`` and the result is handed to
    ``rf_model.predict`` and ``rf_model.predict_proba``.

    Returns:
        tuple: ``(label, probabilities)`` where ``label`` is ``'H'``, ``'D'`` or
        ``'A'`` (model classes 1, 0 and 2 respectively) and ``probabilities`` is
        the first row returned by ``rf_model.predict_proba``.
    """
    columns = {'HomeTeam': [HomeTeam], 'AwayTeam': [AwayTeam]}
    columns.update(additional_features)

    fixture_row = pd.DataFrame(columns).merge(
        outcome_counts, on=['HomeTeam', 'AwayTeam'], how='left'
    )
    features = preprocessor.transform(fixture_row)

    predicted_classes = rf_model.predict(features)
    class_probabilities = rf_model.predict_proba(features)

    # Numeric class -> result letter used throughout the notebook
    label_for_class = {1: 'H', 0: 'D', 2: 'A'}
    return label_for_class[predicted_classes[0]], class_probabilities[0]

In [13]:
# List of column names with numeric data types
numeric_columns = all_leagues.select_dtypes(include=['number']).columns

# Convert non-numeric values in numeric columns to NaN
all_leagues[numeric_columns] = all_leagues[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Replace missing values with 0 and cast every numeric column to integer.
# NOTE(review): this cast truncates any fractional part, so decimal-valued
# numeric columns (e.g. bookmaker odds) lose their decimals here - confirm
# this is intended before the frame is used for training.
all_leagues[numeric_columns] = all_leagues[numeric_columns].fillna(0).astype(int)

# Find and display non-numeric values in the 'B365H' column.
# The mask is computed on 'B365H' itself so that it matches the printed label.
non_numeric_mask = pd.to_numeric(all_leagues['B365H'], errors='coerce').isna()
non_numeric_values = all_leagues.loc[non_numeric_mask, 'B365H']
print("Non-numeric values in 'B365H' column:")
print(non_numeric_values)

Non-numeric values in 'B365H' column:
Series([], Name: B365H, dtype: int32)


In [14]:
# Entry point of the notebook: call main() (defined elsewhere in this notebook)
# and print a completion marker once it returns.
if __name__ == "__main__":
    main()
    print("Finished")

training DC_Models
___________________________________________________________________________
Double Chance validation/accuracy
Validation Accuracy: 0.9274711168164314
Test Accuracy: 0.9377406931964056
Double Chance validation/accuracy
Validation Accuracy: 0.9139922978177151
Test Accuracy: 0.9274711168164314
Double Chance validation/accuracy
Validation Accuracy: 0.926829268292683
Test Accuracy: 0.9332477535301669
training Goals Models
___________________________________________________________________________
Goals validation/accuracy
Validation Accuracy: 0.923620025673941
Test Accuracy: 0.9165596919127086
Goals validation/accuracy
Validation Accuracy: 0.9550706033376123
Test Accuracy: 0.9409499358151476
training Corners_HT_BTTS_model
___________________________________________________________________________
Corners validation/accuracy
Validation Accuracy: 0.9030808729139923
Test Accuracy: 0.9120667522464698
training TR_Model
__________________________________________________________

In [15]:
import os
import pandas as pd

# Read all CSV files in the directory containing "best_games" in their filenames
directory_path = "."  # Change this to the path of the directory containing the CSV files
best_games_dfs = []

# Columns kept in the final frame
new_columns = ["Div", "Date", "Time", "HomeTeam", "AwayTeam", "PredictedOutcome", "Odds", "Prob", "Tip"]

# List all files in the directory
files = os.listdir(directory_path)

# Keep files whose name contains "best_games" and ends with ".csv".
# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the combined frame reproducible between runs.
best_games_files = sorted(file for file in files if "best_games" in file and file.endswith(".csv"))

# Read each CSV file, extract the tip from the filename, and store it in a list of dataframes
for file in best_games_files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path)

    # The tip is the text between "(" and ")" in the filename; names that do
    # not contain exactly one "(" are labelled "Unknown_Tip".
    filename_parts = file.split("(")
    if len(filename_parts) == 2:
        tip = filename_parts[1].split(")")[0]
    else:
        tip = "Unknown_Tip"

    # Tag every row with the tip taken from its source file
    df["Tip"] = tip

    best_games_dfs.append(df)

# Concatenate all dataframes into a single dataframe.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no matching file exists.
if best_games_dfs:
    combined_best_games_df = pd.concat(best_games_dfs, ignore_index=True)
else:
    print(f"No best_games CSV files found in {directory_path!r}")
    combined_best_games_df = pd.DataFrame(columns=new_columns)

# Create a new DataFrame with selected columns
new_df = combined_best_games_df[new_columns]

In [40]:
from datetime import datetime, timedelta

# Get today's date as a 'dd/mm/YYYY' string - the same format that
# filter_by_date parses and compares against the 'Date' column.
today_date = datetime.today().strftime('%d/%m/%Y')

# Define a function to filter DataFrame based on date
def filter_by_date(df, target_date, days_offset=0):
    """Return the rows of ``df`` whose 'Date' equals a given day.

    Args:
        df: frame with a 'Date' column holding 'dd/mm/YYYY' strings.
        target_date: reference day as a 'dd/mm/YYYY' string.
        days_offset: number of days added to ``target_date`` before matching
            (negative values look backwards). Defaults to 0.

    Returns:
        The subset of ``df`` whose 'Date' string equals the shifted day.
    """
    shifted_day = datetime.strptime(target_date, '%d/%m/%Y') + timedelta(days=days_offset)
    wanted = shifted_day.strftime('%d/%m/%Y')
    is_wanted_day = df['Date'] == wanted
    return df[is_wanted_day]

# Rows dated today
today_df = filter_by_date(new_df, today_date)

# Parse today's date once; the neighbouring days are derived from it
_today_dt = datetime.strptime(today_date, '%d/%m/%Y')

# Two days ago
n_date = (_today_dt - timedelta(days=2)).strftime('%d/%m/%Y')
n_df = filter_by_date(new_df, n_date)

# Yesterday
yesterday_date = (_today_dt - timedelta(days=1)).strftime('%d/%m/%Y')
yesterday_df = filter_by_date(new_df, yesterday_date)

# Tomorrow
tomorrow_date = (_today_dt + timedelta(days=1)).strftime('%d/%m/%Y')
tomorrow_df = filter_by_date(new_df, tomorrow_date)

In [41]:
# Work on a copy of today's tips so today_df itself is left untouched
filtered_df = today_df.copy()

# Division codes removed from the shortlist
divs = ['E1', 'E2', 'E3', 'EC', 'SC1', 'SC2', 'SC3', 'F2', 'D2', 'I2','SP2']
filtered_df = filtered_df[~filtered_df['Div'].isin(divs)]

# Attach the bookmaker 1/X/2 odds from fixtures_df.
# NOTE(review): fixtures_df must already exist at notebook level; no cell in
# this section creates it - confirm where it is loaded.
# The join uses both teams (the same keys used when predictions are first
# merged with fixtures) so that a home team listed in more than one fixture
# cannot duplicate rows or pick up another match's odds.
filtered_df = filtered_df.merge(fixtures_df[['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A']],
                                on=['HomeTeam', 'AwayTeam'],
                                how='left',
                                suffixes=('', '_fixtures'))

In [42]:
# Calculate odds for 1X, X2, and 12.
# Each double-chance price is the reciprocal of the sum of the reciprocals of
# the two single-outcome prices it covers, rounded to two decimals.
_double_chance_legs = {
    'Odds_1X': ('B365H', 'B365D'),
    'Odds_X2': ('B365D', 'B365A'),
    'Odds_12': ('B365H', 'B365A'),
}
for _target, (_first_leg, _second_leg) in _double_chance_legs.items():
    filtered_df[_target] = (1 / (1 / filtered_df[_first_leg] + 1 / filtered_df[_second_leg])).round(2)

In [43]:
import numpy as np

# Keep only the tips produced by the three double-chance models.
# .copy() makes dc_df an independent frame so the column assignment below does
# not write into a slice of filtered_df.
dc_cols = ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Tip',
           'B365H', 'B365D', 'B365A', 'Odds_1X', 'Odds_X2', 'Odds_12']
dc_df = filtered_df.loc[filtered_df['Tip'].isin(['1X_Model', 'X2_Model', '12_Model']), dc_cols].copy()

# Price column that corresponds to each predicted outcome
outcome_to_odds = {
    '1X': 'Odds_1X',
    'X2': 'Odds_X2',
    '12': 'Odds_12',
    'H': 'B365H',
    'A': 'B365A',
    'D': 'B365D',
}

# Create a new column 'Odds' holding the price of the predicted outcome.
# Unmatched outcomes become NaN (not None) so the column stays numeric and the
# threshold comparisons in the following cells cannot raise on a None value.
dc_df['Odds'] = np.select(
    [dc_df['PredictedOutcome'] == outcome for outcome in outcome_to_odds],
    [dc_df[column] for column in outcome_to_odds.values()],
    default=np.nan,
)

# Select specific columns
dc_df = dc_df[['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Tip', 'Odds', 'B365H', 'B365D', 'B365A']]

In [44]:
# 1X tips priced from 1.01 to 1.6 where the home-win price is at most 2.5
dc1x_low = dc_df.loc[
    (dc_df['PredictedOutcome'] == '1X')
    & dc_df['Odds'].between(1.01, 1.6)
    & (dc_df['B365H'] <= 2.5),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Odds', 'B365H'],
]
dc1x_low.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Odds,B365H
3,25/09/2023,19:00,Panathinaikos,AEK,1X,0.97,1.34,2.3


In [45]:
# X2 tips priced from 1.05 to 1.6 where the away-win price is at most 2.5
dcx2_low = dc_df.loc[
    (dc_df['PredictedOutcome'] == 'X2')
    & dc_df['Odds'].between(1.05, 1.6)
    & (dc_df['B365A'] <= 2.5),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Odds', 'B365A'],
]
dcx2_low.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Odds,B365A


In [46]:
# 12 tips priced from 1.10 to 1.4; the draw price is shown for reference
dc12_low = dc_df.loc[
    (dc_df['PredictedOutcome'] == '12') & dc_df['Odds'].between(1.10, 1.4),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Odds', 'B365D'],
]
dc12_low.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Odds,B365D
2,25/09/2023,18:00,Hatayspor,Trabzonspor,12,1.0,1.32,3.4
1,25/09/2023,20:15,Sp Lisbon,Rio Ave,12,0.985,1.11,6.0


In [47]:
# Away-win predictions from the double-chance models: price at most 1.8, Prob at least 0.9
dca = dc_df.loc[
    (dc_df['PredictedOutcome'] == 'A')
    & (dc_df['Odds'] <= 1.8)
    & (dc_df['Prob'] >= 0.9),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Odds'],
]
dca.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Odds


In [48]:
# Home-win predictions from the double-chance models: price at most 1.8, Prob at least 0.9
dch = dc_df.loc[
    (dc_df['PredictedOutcome'] == 'H')
    & (dc_df['Odds'] <= 1.8)
    & (dc_df['Prob'] >= 0.9),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Odds'],
]
dch.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Odds
18,25/09/2023,20:15,Sp Lisbon,Rio Ave,H,1.0,1.2


In [49]:
# Tips whose Tip label is 'Goals_Model_35', ten highest Prob shown
O35 = filtered_df.loc[
    filtered_df['Tip'].eq('Goals_Model_35'),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Tip'],
]
O35.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
12,25/09/2023,20:15,Sp Lisbon,Rio Ave,U,1.0,Goals_Model_35
11,25/09/2023,19:00,Panathinaikos,AEK,U,0.99,Goals_Model_35
13,25/09/2023,18:00,Hatayspor,Trabzonspor,U,0.94,Goals_Model_35


In [50]:
# Tips whose Tip label is 'Goals_Model_45', ten highest Prob shown
O45 = filtered_df.loc[
    filtered_df['Tip'].eq('Goals_Model_45'),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Tip'],
]
O45.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
14,25/09/2023,19:00,Panathinaikos,AEK,U,1.0,Goals_Model_45
15,25/09/2023,20:15,Sp Lisbon,Rio Ave,U,0.99,Goals_Model_45
16,25/09/2023,18:00,Hatayspor,Trabzonspor,U,0.98,Goals_Model_45


In [51]:
# htr = filtered_df[(filtered_df.Tip=='HTR_Model')]
# htr.nlargest(10, 'Prob')

In [52]:
# FTR_Model tips that are not draws and are priced at 1.5 or less
ftr_HA_low = filtered_df.loc[
    filtered_df['Tip'].eq('FTR_Model')
    & filtered_df['PredictedOutcome'].ne('D')
    & filtered_df['Odds'].le(1.5)
]
ftr_HA_low.nlargest(n=15, columns='Prob').loc[
    :, ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Odds', 'Prob']
]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Odds,Prob
6,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,H,1.2,0.96


In [53]:
# FTR_Model tips that are not draws, priced above 1.5 and up to 1.8
ftr_HA_mid = filtered_df.loc[
    filtered_df['Tip'].eq('FTR_Model')
    & filtered_df['PredictedOutcome'].ne('D')
    & filtered_df['Odds'].gt(1.5)
    & filtered_df['Odds'].le(1.8)
]
ftr_HA_mid.nlargest(n=10, columns='Prob').loc[
    :, ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Odds', 'Prob']
]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Odds,Prob


In [54]:
# FTR_Model tips that are not draws, priced above 1.8 and up to 2.3
ftr_HA_high = filtered_df.loc[
    filtered_df['Tip'].eq('FTR_Model')
    & filtered_df['PredictedOutcome'].ne('D')
    & filtered_df['Odds'].gt(1.8)
    & filtered_df['Odds'].le(2.3)
]
ftr_HA_high.nlargest(n=20, columns='Prob').loc[
    :, ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Odds', 'Prob']
]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Odds,Prob


In [55]:
# GG_2.5_Model tips whose predicted outcome is 'GG_25', ten highest Prob shown
GG25 = filtered_df.loc[
    filtered_df['Tip'].eq('GG_2.5_Model') & filtered_df['PredictedOutcome'].eq('GG_25'),
    ['Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Tip'],
]
GG25.nlargest(n=10, columns='Prob')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
10,25/09/2023,18:00,Hatayspor,Trabzonspor,GG_25,0.99,GG_2.5_Model


In [56]:
import pandas as pd

# Number of highest-Prob rows kept for every home team; change to taste
n = 3


def _largest_prob_rows(team_rows):
    """Return the ``n`` rows of one HomeTeam group with the highest 'Prob'."""
    return team_rows.nlargest(n, 'Prob')


# Apply the selection group by group, then discard the group index
top_prob_per_home_team = (
    filtered_df
    .groupby('HomeTeam')
    .apply(_largest_prob_rows)
    .reset_index(drop=True)
)

# Show the selected tips
top_prob_per_home_team[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Tip']]

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
0,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,12,1.0,12_Model
1,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,1X,1.0,1X_Model
2,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,H,0.996667,X2_Model
3,G1,25/09/2023,19:00,Panathinaikos,AEK,U,1.0,Goals_Model_45
4,G1,25/09/2023,19:00,Panathinaikos,AEK,U,0.99,Goals_Model_35
5,G1,25/09/2023,19:00,Panathinaikos,AEK,1X,0.97,1X_Model
6,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,1X,1.0,1X_Model
7,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,NG_25,1.0,GG_2.5_Model
8,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,U,1.0,Goals_Model_35


In [57]:
import pandas as pd
cols = ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'PredictedOutcome', 'Prob', 'Tip']

# Sort by HomeTeam (ascending) and Prob (descending) so that, inside every
# home team, the highest-probability tip comes first
df_sorted = filtered_df.sort_values(['HomeTeam', 'Prob'], ascending=[True, False])

# 0-based position of each row inside its HomeTeam group (0 = highest Prob).
# Whole rows are selected by this rank instead of GroupBy.first(), because
# first() returns the first NON-NULL value of each column and can therefore
# combine values from different rows of a group when a column holds NaN.
rank_in_team = df_sorted.groupby('HomeTeam').cumcount()

# Create a DataFrame for the top Prob for each HomeTeam Group
top_prob_per_home_team = df_sorted[rank_in_team == 0].reset_index(drop=True)

# Create a DataFrame for the second-highest Prob for each HomeTeam Group
second_highest_prob_per_home_team = df_sorted[rank_in_team == 1].reset_index(drop=True)

# Create a DataFrame for the third-highest Prob for each HomeTeam Group
third_highest_prob_per_home_team = df_sorted[rank_in_team == 2].reset_index(drop=True)



In [58]:
# Show the top-ranked tip per home team, restricted to Prob above 0.99
print("Top 1 DataFrame:")
top1 = top_prob_per_home_team.loc[:, cols]
top1.loc[top1['Prob'] > 0.99]

Top 1 DataFrame:


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
0,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,12,1.0,12_Model
1,G1,25/09/2023,19:00,Panathinaikos,AEK,U,1.0,Goals_Model_45
2,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,1X,1.0,1X_Model


In [59]:
# Create a DataFrame for the second-highest Prob for each HomeTeam Group
second_highest_prob_per_home_team = df_sorted.groupby('HomeTeam').nth(1).reset_index()

# Remove rows whose (HomeTeam, PredictedOutcome) pair already appears in
# top_prob_per_home_team. The match is made on the key pair rather than on row
# position: comparing the two frames' columns with == requires identically
# labelled Series and raises as soon as some home team has only one tip.
_key_cols = ['HomeTeam', 'PredictedOutcome']
_top_keys = pd.MultiIndex.from_frame(top_prob_per_home_team[_key_cols])
_second_keys = pd.MultiIndex.from_frame(second_highest_prob_per_home_team[_key_cols])
condition = _second_keys.isin(_top_keys)

second_highest_prob_per_home_team = second_highest_prob_per_home_team[~condition]

# Printing the resulting DataFrames
print("Top 2 DataFrame:")
top2= second_highest_prob_per_home_team[cols]
top2.nlargest(10,'Prob')

Top 2 DataFrame:


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
0,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,1X,1.0,1X_Model
2,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,NG_25,1.0,GG_2.5_Model


In [60]:
# Create a DataFrame for the third-highest Prob for each HomeTeam Group
third_highest_prob_per_home_team = df_sorted.groupby('HomeTeam').nth(2).reset_index()

# Remove rows whose (HomeTeam, PredictedOutcome) pair already appears in
# top_prob_per_home_team. The match is made on the key pair rather than on row
# position: comparing the two frames' columns with == requires identically
# labelled Series and raises as soon as some home team has fewer than three tips.
_key_cols = ['HomeTeam', 'PredictedOutcome']
_top_keys = pd.MultiIndex.from_frame(top_prob_per_home_team[_key_cols])
_third_keys = pd.MultiIndex.from_frame(third_highest_prob_per_home_team[_key_cols])
condition = _third_keys.isin(_top_keys)

third_highest_prob_per_home_team = third_highest_prob_per_home_team[~condition]

# Printing the resulting DataFrames
print("Top 3 DataFrame:")
top3= third_highest_prob_per_home_team[cols]
top3.nlargest(10,'Prob')

Top 3 DataFrame:


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
2,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,U,1.0,Goals_Model_35
0,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,H,0.996667,X2_Model
1,G1,25/09/2023,19:00,Panathinaikos,AEK,1X,0.97,1X_Model


In [61]:
# Fourth row of every HomeTeam group in df_sorted, i.e. the fourth-highest
# Prob per home team. Unlike the second- and third-rank cells, no
# de-duplication against top_prob_per_home_team is applied at this rank.
fouth_highest_prob_per_home_team = (
    df_sorted
    .groupby('HomeTeam')
    .nth(3)
    .reset_index()
)

# Show the ten highest-Prob rows of this rank
print("Top 4 DataFrame:")
top4 = fouth_highest_prob_per_home_team.loc[:, cols]
top4.nlargest(n=10, columns='Prob')

Top 4 DataFrame:


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
2,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,H,1.0,X2_Model
0,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,GG_25,0.99,GG_2.5_Model
1,G1,25/09/2023,19:00,Panathinaikos,AEK,X2,0.905833,X2_Model


In [62]:
# Fifth row of every HomeTeam group in df_sorted, i.e. the fifth-highest
# Prob per home team
fifth_highest_prob_per_home_team = (
    df_sorted
    .groupby('HomeTeam')
    .nth(4)
    .reset_index()
)

# Show the ten highest-Prob rows of this rank
print("Top 5 DataFrame:")
top5 = fifth_highest_prob_per_home_team.loc[:, cols]
top5.nlargest(n=10, columns='Prob')

Top 5 DataFrame:


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Prob,Tip
2,P1,25/09/2023,20:15,Sp Lisbon,Rio Ave,U,0.99,Goals_Model_45
0,T1,25/09/2023,18:00,Hatayspor,Trabzonspor,H,0.98,FTR_Model
1,G1,25/09/2023,19:00,Panathinaikos,AEK,D,0.854,12_Model


In [63]:
# Display the combined best-games frame built from the CSV files (all dates and
# tips, before the date and division filters are applied).
new_df

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,PredictedOutcome,Odds,Prob,Tip
0,B1,22/09/2023,19:45,Standard,Westerlo,12,,0.960000,12_Model
1,B1,23/09/2023,15:00,Charleroi,Kortrijk,D,,1.000000,12_Model
2,B1,23/09/2023,17:15,Mechelen,Oud-Heverlee Leuven,12,,1.000000,12_Model
3,B1,23/09/2023,19:45,Antwerp,RWD Molenbeek,12,,0.980000,12_Model
4,B1,24/09/2023,12:30,Genk,St Truiden,12,,1.000000,12_Model
...,...,...,...,...,...,...,...,...,...
1764,T1,24/09/2023,15:00,Ankaragucu,Konyaspor,X2,,1.000000,X2_Model
1765,T1,24/09/2023,15:00,Besiktas,Kayserispor,H,,1.000000,X2_Model
1766,T1,24/09/2023,18:00,Alanyaspor,Fenerbahce,X2,,1.000000,X2_Model
1767,T1,24/09/2023,18:00,Kasimpasa,Ad. Demirspor,X2,,1.000000,X2_Model
