In [10]:
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [58]:
import pandas as pd
import numpy as np

data = pd.read_csv(r'C:\Users\Theopan gerard\OneDrive\Documents\Kecerdasan Ai\data matches\SP1 (14).csv')

data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

data = data.sort_values(by='Date')

def calculate_home_performance(team):
    home_games = data[data['HomeTeam'] == team]
    home_wins = (home_games['FTR'] == 'H').sum()
    home_draws = (home_games['FTR'] == 'D').sum()
    home_losses = (home_games['FTR'] == 'A').sum()
    home_games_played = home_games.shape[0]
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    numeric_sums = home_games[numeric_columns].sum()

    result = {
        'Team': team,
        'home_wins': home_wins,
        'home_draws': home_draws,
        'home_losses': home_losses,
        'home_games_played': home_games_played
    }
    # Add numeric column sums
    for col in numeric_columns:
        result[f'home_{col}'] = numeric_sums[col]

    return result

# Function to calculate away performance for each team
def calculate_away_performance(team):
    away_games = data[data['AwayTeam'] == team]
    away_wins = (away_games['FTR'] == 'A').sum()
    away_draws = (away_games['FTR'] == 'D').sum()
    away_losses = (away_games['FTR'] == 'H').sum()
    away_games_played = away_games.shape[0]
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    numeric_sums = away_games[numeric_columns].sum()

    result = {
        'Team': team,
        'away_wins': away_wins,
        'away_draws': away_draws,
        'away_losses': away_losses,
        'away_games_played': away_games_played
    }
    # Add numeric column sums
    for col in numeric_columns:
        result[f'away_{col}'] = numeric_sums[col]

    return result

# Calculate home performance for all teams
home_teams = data['HomeTeam'].unique()
home_performance = [calculate_home_performance(team) for team in home_teams]
home_full_performance = pd.DataFrame(home_performance)

# Calculate away performance for all teams
away_teams = data['AwayTeam'].unique()
away_performance = [calculate_away_performance(team) for team in away_teams]
away_full_performance = pd.DataFrame(away_performance)

# Merge home and away performance
full_performance = pd.merge(home_full_performance, away_full_performance, on='Team', how='outer')

# Calculate additional metrics
full_performance['home_win_ratio'] = full_performance['home_wins'] / full_performance['home_games_played']
full_performance['away_win_ratio'] = full_performance['away_wins'] / full_performance['away_games_played']
full_performance['home_goals_avg'] = full_performance['home_FTHG'] / full_performance['home_games_played']
full_performance['away_goals_avg'] = full_performance['away_FTAG'] / full_performance['away_games_played']
full_performance['home_concede_avg'] = full_performance['home_FTAG'] / full_performance['home_games_played']
full_performance['away_concede_avg'] = full_performance['away_FTHG'] / full_performance['away_games_played']

# Function to get last 5 games results
def get_last_5_games(team, results_column='FTR', home_column='HomeTeam', away_column='AwayTeam', n=5):
    team_home = data[data[home_column] == team][['Date', results_column]]
    team_home['Result'] = team_home[results_column].replace({'H': 'W', 'A': 'L', 'D': 'D'})
    team_away = data[data[away_column] == team][['Date', results_column]]
    team_away['Result'] = team_away[results_column].replace({'H': 'L', 'A': 'W', 'D': 'D'})
    all_results = pd.concat([team_home, team_away]).sort_values(by='Date')
    return ''.join(all_results['Result'].tail(n)) if len(all_results) >= n else ''.join(all_results['Result'])

# Add 'last_5_games' column to the performance data
full_performance['last_5_games'] = full_performance['Team'].apply(
    lambda team: get_last_5_games(team)
)

full_performance

Unnamed: 0,Team,home_wins,home_draws,home_losses,home_games_played,home_FTHG,home_FTAG,home_HTHG,home_HTAG,home_HS,...,away_AvgCAHA,away_BFECAHH,away_BFECAHA,home_win_ratio,away_win_ratio,home_goals_avg,away_goals_avg,home_concede_avg,away_concede_avg,last_5_games
0,Alaves,3,3,2,8,9.0,9.0,3.0,5.0,86.0,...,19.69,19.69,20.24,0.375,0.1,1.125,1.2,1.125,2.1,LDDDD
1,Ath Bilbao,6,3,1,10,16.0,7.0,10.0,1.0,111.0,...,17.15,18.25,17.71,0.6,0.444444,1.6,1.444444,0.7,1.111111,WWWDW
2,Ath Madrid,7,2,0,9,19.0,6.0,4.0,4.0,136.0,...,17.25,17.99,17.88,0.777778,0.555556,2.111111,1.555556,0.666667,0.666667,WWWWW
3,Barcelona,5,0,3,8,20.0,8.0,12.0,2.0,153.0,...,20.85,22.09,21.78,0.625,0.636364,2.5,2.818182,1.0,1.272727,LWDLL
4,Betis,4,4,1,9,13.0,9.0,5.0,3.0,154.0,...,16.85,18.37,17.56,0.444444,0.222222,1.444444,0.888889,1.0,1.444444,LLDWD
5,Celta,6,2,2,10,17.0,9.0,8.0,5.0,109.0,...,15.37,16.06,15.88,0.6,0.125,1.7,1.25,0.9,2.375,DLWLW
6,Espanol,4,2,3,9,12.0,11.0,6.0,5.0,95.0,...,17.82,17.46,18.43,0.444444,0.0,1.333333,0.444444,1.222222,2.111111,WLDDL
7,Getafe,3,5,2,10,8.0,5.0,3.0,2.0,133.0,...,15.75,15.58,16.38,0.3,0.0,0.8,0.375,0.5,1.25,WLWLL
8,Girona,5,1,3,9,18.0,13.0,10.0,7.0,112.0,...,17.56,17.71,18.29,0.555556,0.222222,2.0,0.888889,1.444444,1.333333,WDLLW
9,Las Palmas,3,3,3,9,12.0,12.0,5.0,6.0,114.0,...,17.5,17.75,18.19,0.333333,0.333333,1.333333,1.222222,1.333333,1.666667,LWWDW


In [59]:
from sklearn.preprocessing import MinMaxScaler
import joblib
# Function to convert last_5_games results to points
def convert_last_5_to_points(results):
    points_map = {'W': 2, 'D': 1, 'L': 0}
    return sum(points_map[result] for result in results)

# Replace last_5_games with total points
full_performance['last_5_games'] = full_performance['last_5_games'].apply(
    lambda results: convert_last_5_to_points(results)
)

# Normalize numeric columns using MinMaxScaler
numeric_columns = full_performance.select_dtypes(include=['float64', 'int64']).columns

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Scale the numeric columns
full_performance[numeric_columns] = scaler.fit_transform(full_performance[numeric_columns])
joblib.dump(scaler,'minmax.joblib')
# Display the updated DataFrame
full_performance

Unnamed: 0,Team,home_wins,home_draws,home_losses,home_games_played,home_FTHG,home_FTAG,home_HTHG,home_HTAG,home_HS,...,away_AvgCAHA,away_BFECAHH,away_BFECAHA,home_win_ratio,away_win_ratio,home_goals_avg,away_goals_avg,home_concede_avg,away_concede_avg,last_5_games
0,Alaves,0.166667,0.6,0.4,0.0,0.166667,0.333333,0.1,0.571429,0.180723,...,0.788321,0.631336,0.738983,0.229167,0.157143,0.229167,0.337674,0.45,0.709524,0.25
1,Ath Bilbao,0.666667,0.6,0.2,1.0,0.555556,0.166667,0.8,0.0,0.481928,...,0.324818,0.410138,0.310169,0.566667,0.698413,0.466667,0.437726,0.144,0.285714,0.875
2,Ath Madrid,0.833333,0.4,0.0,0.5,0.722222,0.083333,0.2,0.428571,0.783133,...,0.343066,0.3702,0.338983,0.833333,0.873016,0.722222,0.483204,0.12,0.095238,1.0
3,Barcelona,0.5,0.0,0.6,0.0,0.777778,0.25,1.0,0.142857,0.987952,...,1.0,1.0,1.0,0.604167,1.0,0.916667,1.0,0.36,0.354978,0.125
4,Betis,0.333333,0.8,0.2,0.5,0.388889,0.333333,0.3,0.285714,1.0,...,0.270073,0.428571,0.284746,0.333333,0.349206,0.388889,0.210336,0.36,0.428571,0.25
5,Celta,0.666667,0.4,0.4,1.0,0.611111,0.333333,0.6,0.571429,0.457831,...,0.0,0.073733,0.0,0.566667,0.196429,0.516667,0.35814,0.288,0.827381,0.375
6,Espanol,0.333333,0.4,0.6,0.5,0.333333,0.5,0.4,0.571429,0.289157,...,0.44708,0.288786,0.432203,0.333333,0.0,0.333333,0.028424,0.52,0.714286,0.25
7,Getafe,0.166667,1.0,0.4,1.0,0.111111,0.0,0.1,0.142857,0.746988,...,0.069343,0.0,0.084746,0.116667,0.0,0.066667,0.0,0.0,0.345238,0.25
8,Girona,0.5,0.2,0.6,0.5,0.666667,0.666667,0.8,0.857143,0.493976,...,0.399635,0.327189,0.408475,0.5,0.349206,0.666667,0.210336,0.68,0.380952,0.375
9,Las Palmas,0.166667,0.6,0.6,0.5,0.333333,0.583333,0.3,0.714286,0.518072,...,0.388686,0.333333,0.391525,0.166667,0.52381,0.333333,0.34677,0.6,0.52381,0.625


In [60]:
full_performance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Columns: 240 entries, Team to last_5_games
dtypes: float64(239), object(1)
memory usage: 37.6+ KB


In [61]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from datetime import datetime
import numpy as np

jadwal_path = r'C:\Users\Theopan gerard\OneDrive\Documents\Kecerdasan Ai\la_liga_matches_updated.csv'

jadwal_pertandingan_df = pd.read_csv(jadwal_path)

today = datetime.now().date()
jadwal_pertandingan_df['date'] = pd.to_datetime(jadwal_pertandingan_df['date']).dt.date
jadwal_akan_datang_df = jadwal_pertandingan_df[jadwal_pertandingan_df['date'] > today]
jadwal_sudah_terjadi_df = jadwal_pertandingan_df[jadwal_pertandingan_df['date'] <= today]

home_feature = [
    'home_wins', 'home_draws', 'home_losses', 'home_FTHG', 'home_FTAG', 'home_HTHG',
    'home_HTAG', 'home_HS', 'home_win_ratio', 'home_goals_avg', 'home_concede_avg','last_5_games'
]
away_feature = [
    'away_wins', 'away_draws', 'away_losses', 'away_FTHG', 'away_FTAG', 'away_HTHG',
    'away_HTAG', 'away_HS', 'away_win_ratio', 'away_goals_avg', 'away_concede_avg','last_5_games'
]

feature_weights = {
    'home_wins': 2.0,
    'home_draws': 1.0,
    'home_losses': -2.0,
    'home_FTHG': 1.8,
    'home_FTAG': -1.5,
    'home_HTHG': 1.5,
    'home_HTAG': -1.2,
    'home_HS': 0.9,
    'home_win_ratio': 2.5,
    'home_goals_avg': 2.0,
    'home_concede_avg': -2.0,

    'away_wins': 1.8,
    'away_draws': 0.9,
    'away_losses': -1.8,
    'away_FTHG': 1.6,
    'away_FTAG': -1.4,
    'away_HTHG': 1.3,
    'away_HTAG': -1.1,
    'away_HS': 0.8,
    'away_win_ratio': 2.0,
    'away_goals_avg': 1.7,
    'away_concede_avg': -1.8
}

In [62]:
home_data = full_performance[home_feature + ['Team']].rename(columns=lambda x: x if x == 'Team' else f"{x}_home")
away_data = full_performance[away_feature + ['Team']].rename(columns=lambda x: x if x == 'Team' else f"{x}_away")

data_histori_merge = jadwal_sudah_terjadi_df.merge(home_data, left_on='Home Team', right_on='Team', how='left') \
                                              .merge(away_data, left_on='Away Team', right_on='Team', how='left', suffixes=('_home', '_away'))

# def get_match_result(row):
#     if row['home_FTHG_home'] > row['away_FTAG_away']:
#         return 'Home Win'
#     elif row['home_FTHG_home'] < row['away_FTAG_away']:
#         return 'Away Win'
#     else:
#         return 'Draw'

data.rename(columns={'HomeTeam': 'Home Team', 'AwayTeam': 'Away Team'}, inplace=True)
data_histori_merge['Match Result'] = data['FTR'].map({'H': 'Home Win', 'D': 'Draw', 'A': 'Away Win'})

# for feature, weight in feature_weights.items():
#     if f"{feature}_home" in data_histori_merge.columns:
#         data_histori_merge[f"{feature}_home"] = data_histori_merge[f"{feature}_home"] * weight
#     if f"{feature}_away" in data_histori_merge.columns:
#         data_histori_merge[f"{feature}_away"] = data_histori_merge[f"{feature}_away"] * weight

# Mengonversi kolom 'FTR' ke 'Match Result'
# data['Match Result'] = data['FTR'].map({'H': 'Home Win', 'D': 'Draw', 'A': 'Away Win'})

# # Menyesuaikan nama kolom untuk merge
# data.rename(columns={'HomeTeam': 'Home Team', 'AwayTeam': 'Away Team'}, inplace=True)

# # Gabungkan dengan data_histori_merge berdasarkan 'Home Team' dan 'Away Team'
# data_histori_merge = data[['Home Team', 'Away Team', 'Match Result']]


data_histori_merge.head(10)

Unnamed: 0,round,date,time,Home Team,Away Team,home_wins_home,home_draws_home,home_losses_home,home_FTHG_home,home_FTAG_home,...,away_FTAG_away,away_HTHG_away,away_HTAG_away,away_HS_away,away_win_ratio_away,away_goals_avg_away,away_concede_avg_away,last_5_games_away,Team_away,Match Result
0,Matchday 1,2024-08-15,19:00,Ath Bilbao,Getafe,0.666667,0.6,0.2,0.555556,0.166667,...,0.0,0.444444,0.0,0.131579,0.0,0.0,0.345238,0.25,Getafe,Draw
1,Matchday 1,2024-08-15,21:30,Betis,Girona,0.333333,0.8,0.2,0.388889,0.333333,...,0.178571,0.222222,0.363636,0.276316,0.349206,0.210336,0.380952,0.375,Girona,Draw
2,Matchday 1,2024-08-16,19:00,Celta,Alaves,0.666667,0.4,0.4,0.611111,0.333333,...,0.321429,0.333333,0.545455,0.342105,0.157143,0.337674,0.709524,0.25,Alaves,Home Win
3,Matchday 1,2024-08-16,21:30,Las Palmas,Sevilla,0.166667,0.6,0.6,0.333333,0.583333,...,0.321429,0.777778,0.545455,0.5,0.174603,0.392248,0.714286,0.375,Sevilla,Draw
4,Matchday 1,2024-08-17,19:00,Osasuna,Leganes,0.5,0.6,0.4,0.666667,0.75,...,0.214286,0.0,0.454545,0.802632,0.174603,0.255814,0.428571,0.125,Leganes,Draw
5,Matchday 1,2024-08-17,21:30,Valencia,Barcelona,0.0,0.6,0.6,0.333333,0.5,...,1.0,0.444444,1.0,0.381579,1.0,1.0,0.354978,0.125,Barcelona,Away Win
6,Matchday 1,2024-08-18,19:00,Sociedad,Vallecano,0.166667,0.4,0.8,0.166667,0.333333,...,0.178571,0.111111,0.272727,0.578947,0.471429,0.173953,0.152381,0.375,Vallecano,Away Win
7,Matchday 1,2024-08-18,21:30,Mallorca,Real Madrid,0.333333,0.6,0.6,0.166667,0.5,...,0.5,0.0,0.545455,0.092105,0.698413,0.619638,0.238095,0.625,Real Madrid,Draw
8,Matchday 1,2024-08-19,19:00,Valladolid,Espanol,0.0,0.6,0.8,0.0,0.583333,...,0.035714,1.0,0.0,0.828947,0.0,0.028424,0.714286,0.25,Espanol,Home Win
9,Matchday 1,2024-08-19,21:30,Villarreal,Ath Madrid,0.166667,0.8,0.4,0.666667,1.0,...,0.392857,0.222222,0.545455,0.460526,0.873016,0.483204,0.095238,1.0,Ath Madrid,Draw


In [63]:
features = [f'{feature}_home' for feature in home_feature] + [f'{feature}_away' for feature in away_feature]
X_train = data_histori_merge[features]
y_train = data_histori_merge['Match Result']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

model = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=42,class_weight='balanced')
model.fit(X_train, y_train_encoded)

In [90]:
features

['home_wins_home',
 'home_draws_home',
 'home_losses_home',
 'home_FTHG_home',
 'home_FTAG_home',
 'home_HTHG_home',
 'home_HTAG_home',
 'home_HS_home',
 'home_win_ratio_home',
 'home_goals_avg_home',
 'home_concede_avg_home',
 'last_5_games_home',
 'away_wins_away',
 'away_draws_away',
 'away_losses_away',
 'away_FTHG_away',
 'away_FTAG_away',
 'away_HTHG_away',
 'away_HTAG_away',
 'away_HS_away',
 'away_win_ratio_away',
 'away_goals_avg_away',
 'away_concede_avg_away',
 'last_5_games_away']

In [87]:
data_yang_akan_datang_merge = jadwal_akan_datang_df.merge(home_data, left_on='Home Team', right_on='Team', how='left') \
                                            .merge(away_data, left_on='Away Team', right_on='Team', how='left', suffixes=('_home', '_away'))


# for feature, weight in feature_weights.items():
#     if f"{feature}_home" in data_yang_akan_datang_merge.columns:
#         data_yang_akan_datang_merge[f"{feature}_home"] = data_yang_akan_datang_merge[f"{feature}_home"] * weight
#     if f"{feature}_away" in data_yang_akan_datang_merge.columns:
#         data_yang_akan_datang_merge[f"{feature}_away"] = data_yang_akan_datang_merge[f"{feature}_away"] * weight

data_yang_akan_datang_merge

Unnamed: 0,round,date,time,Home Team,Away Team,home_wins_home,home_draws_home,home_losses_home,home_FTHG_home,home_FTAG_home,...,away_FTHG_away,away_FTAG_away,away_HTHG_away,away_HTAG_away,away_HS_away,away_win_ratio_away,away_goals_avg_away,away_concede_avg_away,last_5_games_away,Team_away
0,Matchday 19,2025-01-11,,Espanol,Leganes,0.333333,0.4,0.6,0.333333,0.500000,...,0.428571,0.214286,0.000000,0.454545,0.802632,0.174603,0.255814,0.428571,0.125,Leganes
1,Matchday 19,2025-01-12,,Ath Bilbao,Real Madrid,0.666667,0.6,0.2,0.555556,0.166667,...,0.238095,0.500000,0.000000,0.545455,0.092105,0.698413,0.619638,0.238095,0.625,Real Madrid
2,Matchday 19,2025-01-12,,Ath Madrid,Osasuna,0.833333,0.4,0.0,0.722222,0.083333,...,0.428571,0.071429,0.111111,0.272727,0.276316,0.196429,0.102326,0.505952,0.250,Osasuna
3,Matchday 19,2025-01-12,,Alaves,Girona,0.166667,0.6,0.4,0.166667,0.333333,...,0.380952,0.178571,0.222222,0.363636,0.276316,0.349206,0.210336,0.380952,0.375,Girona
4,Matchday 19,2025-01-12,,Sevilla,Valencia,0.500000,0.2,0.6,0.111111,0.250000,...,0.523810,0.035714,0.555556,0.272727,0.052632,0.000000,0.028424,0.523810,0.000,Valencia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Matchday 38,2025-05-25,,Girona,Ath Madrid,0.500000,0.2,0.6,0.666667,0.666667,...,0.095238,0.392857,0.222222,0.545455,0.460526,0.873016,0.483204,0.095238,1.000,Ath Madrid
196,Matchday 38,2025-05-25,,Villarreal,Sevilla,0.166667,0.8,0.4,0.666667,1.000000,...,0.714286,0.321429,0.777778,0.545455,0.500000,0.174603,0.392248,0.714286,0.375,Sevilla
197,Matchday 38,2025-05-25,,Vallecano,Mallorca,0.000000,0.6,0.6,0.333333,0.666667,...,0.285714,0.250000,0.000000,0.181818,0.315789,0.873016,0.301292,0.285714,0.500,Mallorca
198,Matchday 38,2025-05-25,,Real Madrid,Sociedad,1.000000,0.0,0.2,1.000000,0.333333,...,0.000000,0.142857,0.111111,0.272727,0.250000,0.698413,0.164858,0.000000,0.375,Sociedad


In [65]:
X_upcoming = scaler.transform(data_yang_akan_datang_merge[features])

minimal_homewin = 0.15
minimal_draw = 0.25
minimal_awaywin = 0.15

prediksi = model.predict_proba(X_upcoming)

prediksi = np.array(prediksi)

# prediksi

prediksi[:, 0] = np.maximum(prediksi[:, 0], minimal_homewin)  # Home Win
prediksi[:, 1] = np.maximum(prediksi[:, 1], minimal_draw)  # Draw
prediksi[:, 2] = np.maximum(prediksi[:, 2], minimal_awaywin)  # Away Win

probability_sum = prediksi.sum(axis=1).reshape(-1, 1)
prediksi = prediksi / probability_sum

results = pd.DataFrame(prediksi, columns=['Away Win', 'Draw', 'Home Win'])
results['round'] = data_yang_akan_datang_merge['round']
results['Home Team'] = data_yang_akan_datang_merge['Home Team']
results['Away Team'] = data_yang_akan_datang_merge['Away Team']
results['date'] = data_yang_akan_datang_merge['date']

results['Home Win'] = (results['Home Win'] * 100).round(2).astype(str) + '%'
results['Draw'] = (results['Draw'] * 100).round(2).astype(str) + '%'
results['Away Win'] = (results['Away Win'] * 100).round(2).astype(str) + '%'
results = results[['round', 'date', 'Home Team', 'Away Team', 'Home Win', 'Draw', 'Away Win']]
results.head(10)

Unnamed: 0,round,date,Home Team,Away Team,Home Win,Draw,Away Win
0,Matchday 19,2025-01-11,Espanol,Leganes,29.95%,40.96%,29.09%
1,Matchday 19,2025-01-12,Ath Bilbao,Real Madrid,24.54%,29.56%,45.91%
2,Matchday 19,2025-01-12,Ath Madrid,Osasuna,54.53%,30.87%,14.61%
3,Matchday 19,2025-01-12,Alaves,Girona,31.1%,50.77%,18.13%
4,Matchday 19,2025-01-12,Sevilla,Valencia,40.14%,24.4%,35.47%
5,Matchday 19,2025-01-12,Vallecano,Celta,57.97%,23.38%,18.65%
6,Matchday 19,2025-01-12,Mallorca,Barcelona,16.5%,23.08%,60.42%
7,Matchday 19,2025-01-12,Sociedad,Villarreal,22.88%,33.11%,44.01%
8,Matchday 19,2025-01-12,Valladolid,Betis,25.56%,50.03%,24.41%
9,Matchday 19,2025-01-12,Las Palmas,Getafe,33.03%,34.11%,32.86%


In [None]:
prediksi_acc=model.predict(X_upcoming)
acc=accuracy_score(y)

In [11]:
hasil= results.loc[(results['Home Team']=='Real Madrid')]
hasil

Unnamed: 0,round,date,Home Team,Away Team,Home Win,Draw,Away Win
18,Matchday 20,2025-01-19,Real Madrid,Las Palmas,75.95%,14.58%,9.47%
47,Matchday 23,2025-02-09,Real Madrid,Ath Madrid,30.61%,14.26%,55.13%
66,Matchday 25,2025-02-23,Real Madrid,Girona,65.0%,14.42%,20.58%
87,Matchday 27,2025-03-09,Real Madrid,Vallecano,49.6%,21.0%,29.4%
107,Matchday 29,2025-03-30,Real Madrid,Leganes,64.22%,13.76%,22.02%
116,Matchday 30,2025-04-06,Real Madrid,Valencia,75.89%,13.3%,10.82%
136,Matchday 32,2025-04-20,Real Madrid,Ath Bilbao,68.38%,13.79%,17.83%
154,Matchday 34,2025-05-04,Real Madrid,Celta,72.52%,13.3%,14.18%
176,Matchday 36,2025-05-14,Real Madrid,Mallorca,50.89%,13.35%,35.77%
198,Matchday 38,2025-05-25,Real Madrid,Sociedad,54.2%,17.0%,28.8%


In [9]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample

# Konfigurasi Random Forest Manual
n_estimators = 500
max_depth = 5
random_state = 42

np.random.seed(random_state)

# 1. Data Preprocessing
features = [f'{feature}_home' for feature in home_feature] + [f'{feature}_away' for feature in away_feature]
X_train = data_histori_merge[features]
y_train = data_histori_merge['Match Result']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# 2. Membuat Pohon Keputusan Secara Manual
trees = []
for i in range(n_estimators):
    X_resampled, y_resampled = resample(X_train, y_train_encoded, random_state=np.random.randint(10000))
    tree = DecisionTreeClassifier(max_depth=None, class_weight='balanced', random_state=np.random.randint(10000))
    tree.fit(X_resampled, y_resampled)
    trees.append(tree)

# 3. Prediksi Data Mendatang
X_upcoming = scaler.transform(data_yang_akan_datang_merge[features])

# Prediksi Probabilitas dari Setiap Pohon
predictions = np.zeros((X_upcoming.shape[0], len(np.unique(y_train_encoded))))

for tree in trees:
    predictions += tree.predict_proba(X_upcoming)

# Rata-rata Probabilitas dari Semua Pohon
predictions /= n_estimators

# 4. Penyesuaian Probabilitas
minimal_homewin = 0.10
minimal_draw = 0.15
minimal_awaywin = 0.10


predictions[:, 0] = np.maximum(predictions[:, 0], minimal_homewin)  # Home Win
predictions[:, 1] = np.maximum(predictions[:, 1], minimal_draw)     # Draw
predictions[:, 2] = np.maximum(predictions[:, 2], minimal_awaywin)  # Away Win

# Normalisasi agar total probabilitas = 1
predictions /= predictions.sum(axis=1).reshape(-1, 1)

# 5. Membuat DataFrame Hasil
results = pd.DataFrame(predictions, columns=['Away Win', 'Draw', 'Home Win'])
results['round'] = data_yang_akan_datang_merge['round']
results['Home Team'] = data_yang_akan_datang_merge['Home Team']
results['Away Team'] = data_yang_akan_datang_merge['Away Team']
results['date'] = data_yang_akan_datang_merge['date']

results['Home Win'] = (results['Home Win'] * 100).round(2).astype(str) + '%'
results['Draw'] = (results['Draw'] * 100).round(2).astype(str) + '%'
results['Away Win'] = (results['Away Win'] * 100).round(2).astype(str) + '%'
results = results[['round', 'date', 'Home Team', 'Away Team', 'Home Win', 'Draw', 'Away Win']]

# Menampilkan 10 Hasil Teratas
results


Unnamed: 0,round,date,Home Team,Away Team,Home Win,Draw,Away Win
0,Matchday 19,2025-01-11,Espanol,Leganes,58.0%,21.8%,20.2%
1,Matchday 19,2025-01-12,Ath Bilbao,Real Madrid,26.8%,26.2%,47.0%
2,Matchday 19,2025-01-12,Ath Madrid,Osasuna,67.13%,22.91%,9.96%
3,Matchday 19,2025-01-12,Alaves,Girona,38.39%,52.25%,9.36%
4,Matchday 19,2025-01-12,Sevilla,Valencia,57.06%,14.31%,28.63%
...,...,...,...,...,...,...,...
195,Matchday 38,2025-05-25,Girona,Ath Madrid,13.4%,19.8%,66.8%
196,Matchday 38,2025-05-25,Villarreal,Sevilla,42.75%,47.83%,9.42%
197,Matchday 38,2025-05-25,Vallecano,Mallorca,35.71%,14.1%,50.19%
198,Matchday 38,2025-05-25,Real Madrid,Sociedad,54.2%,17.0%,28.8%


In [88]:
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Entropy function
def entropy(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

# Node class for decision tree
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None

# DecisionTree class
class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None, class_weight=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.class_weight = class_weight
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold
        return split_idx, split_thresh

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _information_gain(self, y, X_column, split_thresh):
        parent_entropy = self._weighted_entropy(y)
        left_idxs, right_idxs = self._split(X_column, split_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        e_l, e_r = self._weighted_entropy(y[left_idxs]), self._weighted_entropy(y[right_idxs])
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        return parent_entropy - child_entropy

    def _weighted_entropy(self, y):
        hist = np.bincount(y)
        if self.class_weight == 'balanced':
            weights = 1. / (hist + 1e-10)
            weighted_hist = hist * weights
            ps = weighted_hist / np.sum(weighted_hist)
        else:
            ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

# Bootstrap sample for RandomForest
def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]

# RandomForest class
class RandomForest:
    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, n_feats=None, random_state=None, class_weight=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.random_state = random_state
        self.class_weight = class_weight
        self.trees = []
        if self.random_state:
            np.random.seed(self.random_state)

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                n_feats=self.n_feats,
                class_weight=self.class_weight
            )
            X_samp, y_samp = bootstrap_sample(X, y)
            tree.fit(X_samp, y_samp)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [self._most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)

    def predict_proba(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        probs = np.mean(np.eye(len(np.unique(tree_preds)))[tree_preds], axis=0)
        return probs

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common


In [17]:
import joblib

In [89]:
# # Implementasi pipeline menggunakan RandomForest manual

features = [f'{feature}_home' for feature in home_feature] + [f'{feature}_away' for feature in away_feature]
X_train = data_histori_merge[features]
y_train = data_histori_merge['Match Result']

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

joblib.dump(scaler,'standard_scal_rendra.joblib')
# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
joblib.dump(label_encoder,'label_rendra.joblib')
# Train a Random Forest classifier
# random_forest = RandomForest(n_trees=200, max_depth=10, random_state=42,class_weight='balanced')
random_forest = RandomForest(n_trees=20)
random_forest.fit(X_train_scaled, y_train_encoded)
joblib.dump(random_forest,'model_rendra.joblib')
# Prediction probabilities for upcoming matches
X_upcoming = scaler.transform(data_yang_akan_datang_merge[features])
prediksi = random_forest.predict_proba(X_upcoming)
prediksi = np.array(prediksi)

# Perubahan Threshold
minimal_homewin = 0.25
minimal_draw = 0.3
minimal_awaywin = 0.2

prediksi[:, 0] = np.maximum(prediksi[:, 0], minimal_homewin)  # Home Win
prediksi[:, 1] = np.maximum(prediksi[:, 1], minimal_draw)  # Draw
prediksi[:, 2] = np.maximum(prediksi[:, 2], minimal_awaywin)  # Away Win

probability_sum = prediksi.sum(axis=1).reshape(-1, 1)
prediksi = prediksi / probability_sum

results = pd.DataFrame(prediksi, columns=['Away Win', 'Draw', 'Home Win'])
results['round'] = data_yang_akan_datang_merge['round']
results['Home Team'] = data_yang_akan_datang_merge['Home Team']
results['Away Team'] = data_yang_akan_datang_merge['Away Team']
results['date'] = data_yang_akan_datang_merge['date']

results['Home Win'] = (results['Home Win'] * 100).round(2).astype(str) + '%'
results['Draw'] = (results['Draw'] * 100).round(2).astype(str) + '%'
results['Away Win'] = (results['Away Win'] * 100).round(2).astype(str) + '%'
results = results[['round', 'date', 'Home Team', 'Away Team', 'Home Win', 'Draw', 'Away Win']]
results.head(10)

Unnamed: 0,round,date,Home Team,Away Team,Home Win,Draw,Away Win
0,Matchday 19,2025-01-11,Espanol,Leganes,47.83%,30.43%,21.74%
1,Matchday 19,2025-01-12,Ath Bilbao,Real Madrid,27.27%,27.27%,45.45%
2,Matchday 19,2025-01-12,Ath Madrid,Osasuna,59.26%,22.22%,18.52%
3,Matchday 19,2025-01-12,Alaves,Girona,28.0%,52.0%,20.0%
4,Matchday 19,2025-01-12,Sevilla,Valencia,56.0%,24.0%,20.0%
5,Matchday 19,2025-01-12,Vallecano,Celta,54.17%,25.0%,20.83%
6,Matchday 19,2025-01-12,Mallorca,Barcelona,15.38%,23.08%,61.54%
7,Matchday 19,2025-01-12,Sociedad,Villarreal,30.0%,30.0%,40.0%
8,Matchday 19,2025-01-12,Valladolid,Betis,35.0%,40.0%,25.0%
9,Matchday 19,2025-01-12,Las Palmas,Getafe,23.81%,28.57%,47.62%


In [85]:
data_yang_akan_datang_merge[features]

Unnamed: 0,home_wins_home,home_draws_home,home_losses_home,home_FTHG_home,home_FTAG_home,home_HTHG_home,home_HTAG_home,home_HS_home,home_win_ratio_home,home_goals_avg_home,...,away_losses_away,away_FTHG_away,away_FTAG_away,away_HTHG_away,away_HTAG_away,away_HS_away,away_win_ratio_away,away_goals_avg_away,away_concede_avg_away,last_5_games_away
0,0.333333,0.4,0.6,0.333333,0.500000,0.4,0.571429,0.289157,0.333333,0.333333,...,0.285714,0.428571,0.214286,0.000000,0.454545,0.802632,0.174603,0.255814,0.428571,0.125
1,0.666667,0.6,0.2,0.555556,0.166667,0.8,0.000000,0.481928,0.566667,0.466667,...,0.000000,0.238095,0.500000,0.000000,0.545455,0.092105,0.698413,0.619638,0.238095,0.625
2,0.833333,0.4,0.0,0.722222,0.083333,0.2,0.428571,0.783133,0.833333,0.722222,...,0.285714,0.428571,0.071429,0.111111,0.272727,0.276316,0.196429,0.102326,0.505952,0.250
3,0.166667,0.6,0.4,0.166667,0.333333,0.1,0.571429,0.180723,0.229167,0.229167,...,0.428571,0.380952,0.178571,0.222222,0.363636,0.276316,0.349206,0.210336,0.380952,0.375
4,0.500000,0.2,0.6,0.111111,0.250000,0.2,0.285714,0.542169,0.500000,0.111111,...,0.714286,0.523810,0.035714,0.555556,0.272727,0.052632,0.000000,0.028424,0.523810,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.500000,0.2,0.6,0.666667,0.666667,0.8,0.857143,0.493976,0.500000,0.666667,...,0.000000,0.095238,0.392857,0.222222,0.545455,0.460526,0.873016,0.483204,0.095238,1.000
196,0.166667,0.8,0.4,0.666667,1.000000,0.7,1.000000,0.674699,0.166667,0.666667,...,0.571429,0.714286,0.321429,0.777778,0.545455,0.500000,0.174603,0.392248,0.714286,0.375
197,0.000000,0.6,0.6,0.333333,0.666667,0.4,0.428571,0.674699,0.041667,0.416667,...,0.428571,0.285714,0.250000,0.000000,0.181818,0.315789,0.873016,0.301292,0.285714,0.500
198,1.000000,0.0,0.2,1.000000,0.333333,0.8,0.000000,0.891566,1.000000,1.000000,...,0.285714,0.000000,0.142857,0.111111,0.272727,0.250000,0.698413,0.164858,0.000000,0.375


In [28]:
model=joblib.load(r'C:\Users\Theopan gerard\OneDrive\Documents\Kecerdasan Ai\Integrasi\model_rendra.joblib')
X_upcoming = scaler.transform(data_yang_akan_datang_merge[features])
prediksi = model.predict_proba(X_upcoming)
prediksi = np.array(prediksi)

# Perubahan Threshold
minimal_homewin = 0.25
minimal_draw = 0.3
minimal_awaywin = 0.2

prediksi[:, 0] = np.maximum(prediksi[:, 0], minimal_homewin)  # Home Win
prediksi[:, 1] = np.maximum(prediksi[:, 1], minimal_draw)  # Draw
prediksi[:, 2] = np.maximum(prediksi[:, 2], minimal_awaywin)  # Away Win

probability_sum = prediksi.sum(axis=1).reshape(-1, 1)
prediksi = prediksi / probability_sum

results = pd.DataFrame(prediksi, columns=['Away Win', 'Draw', 'Home Win'])
results['round'] = data_yang_akan_datang_merge['round']
results['Home Team'] = data_yang_akan_datang_merge['Home Team']
results['Away Team'] = data_yang_akan_datang_merge['Away Team']
results['date'] = data_yang_akan_datang_merge['date']

results['Home Win'] = (results['Home Win'] * 100).round(2).astype(str) + '%'
results['Draw'] = (results['Draw'] * 100).round(2).astype(str) + '%'
results['Away Win'] = (results['Away Win'] * 100).round(2).astype(str) + '%'
results = results[['round', 'date', 'Home Team', 'Away Team', 'Home Win', 'Draw', 'Away Win']]
results.head(10)

Unnamed: 0,round,date,Home Team,Away Team,Home Win,Draw,Away Win
0,Matchday 19,2025-01-11,Espanol,Leganes,45.45%,31.82%,22.73%
1,Matchday 19,2025-01-12,Ath Bilbao,Real Madrid,19.05%,33.33%,47.62%
2,Matchday 19,2025-01-12,Ath Madrid,Osasuna,57.69%,23.08%,19.23%
3,Matchday 19,2025-01-12,Alaves,Girona,34.78%,43.48%,21.74%
4,Matchday 19,2025-01-12,Sevilla,Valencia,50.0%,25.0%,25.0%
5,Matchday 19,2025-01-12,Vallecano,Celta,62.07%,20.69%,17.24%
6,Matchday 19,2025-01-12,Mallorca,Barcelona,14.81%,22.22%,62.96%
7,Matchday 19,2025-01-12,Sociedad,Villarreal,30.0%,35.0%,35.0%
8,Matchday 19,2025-01-12,Valladolid,Betis,45.83%,33.33%,20.83%
9,Matchday 19,2025-01-12,Las Palmas,Getafe,20.0%,35.0%,45.0%


In [14]:
hasil= results.loc[(results['Home Team']=='Real Madrid')]
hasil

Unnamed: 0,round,date,Home Team,Away Team,Home Win,Draw,Away Win
18,Matchday 20,2025-01-19,Real Madrid,Las Palmas,59.26%,22.22%,18.52%
47,Matchday 23,2025-02-09,Real Madrid,Ath Madrid,20.0%,24.0%,56.0%
66,Matchday 25,2025-02-23,Real Madrid,Girona,60.71%,21.43%,17.86%
87,Matchday 27,2025-03-09,Real Madrid,Vallecano,45.0%,30.0%,25.0%
107,Matchday 29,2025-03-30,Real Madrid,Leganes,36.0%,24.0%,40.0%
116,Matchday 30,2025-04-06,Real Madrid,Valencia,63.33%,20.0%,16.67%
136,Matchday 32,2025-04-20,Real Madrid,Ath Bilbao,62.07%,20.69%,17.24%
154,Matchday 34,2025-05-04,Real Madrid,Celta,59.26%,22.22%,18.52%
176,Matchday 36,2025-05-14,Real Madrid,Mallorca,50.0%,25.0%,25.0%
198,Matchday 38,2025-05-25,Real Madrid,Sociedad,56.0%,24.0%,20.0%


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Assuming you have the same features
features = [f'{feature}_home' for feature in home_feature] + [f'{feature}_away' for feature in away_feature]
X_train = data_histori_merge[features]
data_histori_merge
y_home_train = data_histori_merge['home_FTHG_home']  # Target for home team score
y_away_train = data_histori_merge['home_FTAG_home']  # Target for away team score

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train a Random Forest Regressor for home team score prediction
random_forest_home = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
random_forest_home.fit(X_train_scaled, y_home_train)

# Train a Random Forest Regressor for away team score prediction
random_forest_away = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
random_forest_away.fit(X_train_scaled, y_away_train)

# Prediction for upcoming matches
X_upcoming = scaler.transform(data_yang_akan_datang_merge[features])
home_score_pred = random_forest_home.predict(X_upcoming)
away_score_pred = random_forest_away.predict(X_upcoming)

# Format the results
results = pd.DataFrame({
    'round': data_yang_akan_datang_merge['round'],
    'date': data_yang_akan_datang_merge['date'],
    'Home Team': data_yang_akan_datang_merge['Home Team'],
    'Away Team': data_yang_akan_datang_merge['Away Team'],
    'Predicted Home Score': home_score_pred.round(2),
    'Predicted Away Score': away_score_pred.round(2)
})

results
# # Show the first 10 predictions
# print(results.head(10))


Unnamed: 0,round,date,Home Team,Away Team,Predicted Home Score,Predicted Away Score
0,Matchday 19,2025-01-11,Espanol,Leganes,0.33,0.50
1,Matchday 19,2025-01-12,Ath Bilbao,Real Madrid,0.56,0.17
2,Matchday 19,2025-01-12,Ath Madrid,Osasuna,0.72,0.08
3,Matchday 19,2025-01-12,Alaves,Girona,0.17,0.33
4,Matchday 19,2025-01-12,Sevilla,Valencia,0.11,0.25
...,...,...,...,...,...,...
195,Matchday 38,2025-05-25,Girona,Ath Madrid,0.67,0.67
196,Matchday 38,2025-05-25,Villarreal,Sevilla,0.67,1.00
197,Matchday 38,2025-05-25,Vallecano,Mallorca,0.33,0.67
198,Matchday 38,2025-05-25,Real Madrid,Sociedad,1.00,0.33


In [None]:
# Prediksi untuk data pelatihan
y_train_pred = random_forest.predict(X_train_scaled)

# Menghitung akurasi
accuracy = np.mean(y_train_pred == y_train_encoded)
print(f'Akurasi model: {accuracy * 100:.2f}%')

# Prediksi untuk data uji (X_test_scaled dan y_test_encoded)
X_test_scaled = scaler.transform(X_test)  # Pastikan X_test sudah distandarisasi
y_test_pred = random_forest.predict(X_test_scaled)

# Menghitung akurasi
accuracy = np.mean(y_test_pred == y_test_encoded)
print(f'Akurasi model: {accuracy * 100:.2f}%')



In [None]:
# import numpy as np
# from collections import Counter

# # Helper function: Calculate Gini impurity
# def gini_impurity(y):
#     class_counts = Counter(y)
#     total_samples = len(y)
#     impurity = 1.0
#     for count in class_counts.values():
#         prob = count / total_samples
#         impurity -= prob ** 2
#     return impurity

# # Helper function: Create a decision tree
# def decision_tree(X, y, max_depth=100, min_samples_split=0):
#     # Base cases: if only one class is left or no data left or max depth reached
#     if len(set(y)) == 1 or len(X) <= min_samples_split or max_depth == 0:
#         return Counter(y).most_common(1)[0][0]  # Return the most common class as the leaf node

#     n_features = X.shape[1]
#     best_gini = float('inf')
#     best_split = None
#     best_left_y, best_right_y = None, None

#     # Try every feature and every possible split point
#     for feature_idx in range(n_features):
#         possible_values = set(X[:, feature_idx])
#         for value in possible_values:
#             left_mask = X[:, feature_idx] <= value
#             right_mask = ~left_mask

#             left_y = y[left_mask]
#             right_y = y[right_mask]

#             if len(left_y) == 0 or len(right_y) == 0:
#                 continue

#             gini_left = gini_impurity(left_y)
#             gini_right = gini_impurity(right_y)
#             gini = (len(left_y) / len(y)) * gini_left + (len(right_y) / len(y)) * gini_right

#             if gini < best_gini:
#                 best_gini = gini
#                 best_split = (feature_idx, value)
#                 best_left_y = left_y
#                 best_right_y = right_y

#     if best_split is None:
#         return Counter(y).most_common(1)[0][0]  # Return the most common class as the leaf node

#     left_tree = decision_tree(X[best_left_y], best_left_y, max_depth-1, min_samples_split)
#     right_tree = decision_tree(X[best_right_y], best_right_y, max_depth-1, min_samples_split)

#     return (best_split, left_tree, right_tree)

# # Helper function: Predict with a decision tree
# def predict_decision_tree(X, tree):
#     # If tree is a leaf (class label), return it
#     if not isinstance(tree, tuple):
#         return tree  # Return the class label at the leaf node

#     # Otherwise, the tree is an internal node
#     feature_idx, threshold = tree[0]
#     left_tree = tree[1]
#     right_tree = tree[2]

#     if X[feature_idx] <= threshold:
#         return predict_decision_tree(X, left_tree)
#     else:
#         return predict_decision_tree(X, right_tree)

# # Implement Random Forest from scratch
# class RandomForest:
#     def __init__(self, n_estimators=300, max_depth=15, min_samples_split=10):
#         self.n_estimators = n_estimators
#         self.max_depth = max_depth
#         self.min_samples_split = min_samples_split
#         self.trees = []

#     def fit(self, X, y):
#         for _ in range(self.n_estimators):
#             # Bootstrap sample (sampling with replacement)
#             sample_idx = np.random.choice(len(X), size=len(X), replace=True)
#             X_sample, y_sample = X[sample_idx], y[sample_idx]

#             # Train a decision tree
#             tree = decision_tree(X_sample, y_sample, self.max_depth, self.min_samples_split)
#             self.trees.append(tree)

#     def predict(self, X):
#         predictions = [self._predict_single(x) for x in X]
#         return np.array(predictions)

#     def predict_proba(self, X):
#         proba = []
#         # Get the number of unique classes in the target variable
#         n_classes = len(set([predict_decision_tree(X[0], tree) for tree in self.trees]))

#         for x in X:
#             # For each sample, get the vote from each tree and calculate probabilities
#             tree_preds = [predict_decision_tree(x, tree) for tree in self.trees]
#             # Count the votes for each class (0, 1, 2...)
#             vote_count = Counter(tree_preds)

#             # Create a list of probabilities for each class (ensure all classes are included)
#             prob = [vote_count.get(i, 0) / len(self.trees) for i in range(n_classes)]

#             proba.append(prob)

#         return np.array(proba)

#     def _predict_single(self, x):
#         tree_preds = [predict_decision_tree(x, tree) for tree in self.trees]
#         return Counter(tree_preds).most_common(1)[0][0]




In [None]:
# features = [f'{feature}_home' for feature in home_feature] + [f'{feature}_away' for feature in away_feature]
# X_train = data_histori_merge[features]
# y_train = data_histori_merge['Match Result']

# # Scaling the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)

# # Encode labels
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)

# # Train a Random Forest classifier
# random_forest = RandomForest(n_estimators=100, max_depth=5)
# random_forest.fit(X_train_scaled, y_train_encoded)

# X_upcoming = scaler.transform(data_yang_akan_datang_merge[features])

# minimal_homewin = 0.15
# minimal_draw = 0.25
# minimal_awaywin = 0.15

# # Use the RandomForest to get class probabilities
# prediksi = random_forest.predict_proba(X_upcoming)

# # # Apply minimal thresholds to the probabilities
# prediksi[:, 0] = np.maximum(prediksi[:, 0], minimal_homewin)  # Home Win
# prediksi[:, 1] = np.maximum(prediksi[:, 1], minimal_draw)  # Draw
# prediksi[:, 2] = np.maximum(prediksi[:, 2], minimal_awaywin)  # Away Win

# # Normalize probabilities so they sum to 1
# probability_sum = prediksi.sum(axis=1).reshape(-1, 1)
# prediksi = prediksi / probability_sum

# # Create the result DataFrame
# results = pd.DataFrame(prediksi, columns=['Away Win', 'Draw', 'Home Win'])
# results['round'] = data_yang_akan_datang_merge['round']
# results['Home Team'] = data_yang_akan_datang_merge['Home Team']
# results['Away Team'] = data_yang_akan_datang_merge['Away Team']
# results['date'] = data_yang_akan_datang_merge['date']

# # Convert probabilities to percentages
# results['Home Win'] = (results['Home Win'] * 100).round(2).astype(str) + '%'
# results['Draw'] = (results['Draw'] * 100).round(2).astype(str) + '%'
# results['Away Win'] = (results['Away Win'] * 100).round(2).astype(str) + '%'

# # Reorganize columns
# results = results[['round', 'date', 'Home Team', 'Away Team', 'Home Win', 'Draw', 'Away Win']]

# # Show the top 10 rows
# results.head(10)


In [None]:
# Asumsikan X_test dan y_test adalah data uji yang sudah ada
# Menggunakan model manual yang sudah dilatih (random_forest)

# Prediksi untuk data uji
prediksi_test = random_forest.predict(X_test)

# Menghitung akurasi
akurasi = np.mean(prediksi_test == y_test)

# Menampilkan hasil akurasi
print(f"Akurasi model manual: {akurasi * 100:.2f}%")


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from datetime import datetime
import numpy as np

jadwal_path = '/content/la_liga_matches.csv'
jadwal_pertandingan_df = pd.read_csv(jadwal_path)

today = datetime.now().date()
jadwal_pertandingan_df['date'] = pd.to_datetime(jadwal_pertandingan_df['date']).dt.date
jadwal_akan_datang_df = jadwal_pertandingan_df[jadwal_pertandingan_df['date'] > today]
jadwal_sudah_terjadi_df = jadwal_pertandingan_df[jadwal_pertandingan_df['date'] <= today]

home_features = [
    'home_wins', 'home_draws', 'home_losses', 'home_FTHG', 'home_FTAG', 'home_HTHG',
    'home_HTAG', 'home_HS', 'home_win_ratio', 'home_goals_avg', 'home_concede_avg'
]
away_features = [
    'away_wins', 'away_draws', 'away_losses', 'away_FTHG', 'away_FTAG', 'away_HTHG',
    'away_HTAG', 'away_HS', 'away_win_ratio', 'away_goals_avg', 'away_concede_avg'
]

home_data = full_performance[home_features + ['Team']].rename(columns=lambda x: x if x == 'Team' else f"{x}_home")
away_data = full_performance[away_features + ['Team']].rename(columns=lambda x: x if x == 'Team' else f"{x}_away")

historis_merged_data = jadwal_sudah_terjadi_df.merge(home_data, left_on='Home Team', right_on='Team', how='left') \
                                              .merge(away_data, left_on='Away Team', right_on='Team', how='left', suffixes=('_home', '_away'))
historis_merged_data = historis_merged_data.drop(columns=['Team_home', 'Team_away'])

def get_match_result(row):
    if row['home_FTHG_home'] > row['away_FTAG_away']:
        return 'Home Win'
    elif row['home_FTHG_home'] < row['away_FTAG_away']:
        return 'Away Win'
    else:
        return 'Draw'

historis_merged_data['Match Result'] = historis_merged_data.apply(get_match_result, axis=1)

feature_weights = {
    'home_wins': 1.2, 'home_draws': 0.8, 'home_losses': -1.2, 'home_FTHG': 1.1, 'home_FTAG': -1.1,
    'home_HTHG': 1.0, 'home_HTAG': -1.0, 'home_HS': 0.7, 'home_win_ratio': 1.5,
    'home_goals_avg': 1.4, 'home_concede_avg': -1.3,
    'away_wins': 1.2, 'away_draws': 0.8, 'away_losses': -1.2, 'away_FTHG': 1.1, 'away_FTAG': -1.1,
    'away_HTHG': 1.0, 'away_HTAG': -1.0, 'away_HS': 0.7, 'away_win_ratio': 1.5,
    'away_goals_avg': 1.4, 'away_concede_avg': -1.3,
}

for feature, weight in feature_weights.items():
    if f"{feature}_home" in historis_merged_data.columns:
        historis_merged_data[f"{feature}_home"] = historis_merged_data[f"{feature}_home"] * weight
    if f"{feature}_away" in historis_merged_data.columns:
        historis_merged_data[f"{feature}_away"] = historis_merged_data[f"{feature}_away"] * weight

features = [f'{feature}_home' for feature in home_features] + [f'{feature}_away' for feature in away_features]
X = historis_merged_data[features]
y = historis_merged_data['Match Result']

scaler = StandardScaler()
X = scaler.fit_transform(X)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
accuracies = []

print("Hasil K-Fold Cross-Validation:")

for train_index, test_index in kf.split(X, y_encoded):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42,
        class_weight='balanced'
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"Fold {fold} - Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, labels=np.unique(y_test)))
    fold += 1

print("\nRata-rata Akurasi:", np.mean(accuracies))


Hasil K-Fold Cross-Validation:
Fold 1 - Accuracy: 0.92
              precision    recall  f1-score   support

    Away Win       0.90      0.90      0.90        10
        Draw       0.00      0.00      0.00         1
    Home Win       0.96      0.96      0.96        25

    accuracy                           0.92        36
   macro avg       0.62      0.62      0.62        36
weighted avg       0.92      0.92      0.92        36

Fold 2 - Accuracy: 0.97
              precision    recall  f1-score   support

    Away Win       0.91      1.00      0.95        10
        Draw       0.00      0.00      0.00         1
    Home Win       1.00      1.00      1.00        25

    accuracy                           0.97        36
   macro avg       0.64      0.67      0.65        36
weighted avg       0.95      0.97      0.96        36

Fold 3 - Accuracy: 0.97
              precision    recall  f1-score   support

    Away Win       1.00      0.90      0.95        10
        Draw       1.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 4 - Accuracy: 0.83
              precision    recall  f1-score   support

    Away Win       0.69      0.82      0.75        11
        Draw       0.00      0.00      0.00         1
    Home Win       0.95      0.88      0.91        24

    accuracy                           0.83        36
   macro avg       0.55      0.56      0.55        36
weighted avg       0.85      0.83      0.84        36

Fold 5 - Accuracy: 0.92
              precision    recall  f1-score   support

    Away Win       0.90      0.90      0.90        10
        Draw       0.00      0.00      0.00         2
    Home Win       0.92      1.00      0.96        24

    accuracy                           0.92        36
   macro avg       0.61      0.63      0.62        36
weighted avg       0.87      0.92      0.89        36


Rata-rata Akurasi: 0.9222222222222223


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
