In [12]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler

## Read Train data

**Team Data And Feature Engineering**

In [8]:
train_home_team_statistics_df = pd.read_csv(r'Train_Data\train_home_team_statistics_df.csv', index_col=0)
train_away_team_statistics_df = pd.read_csv(r'Train_Data\train_away_team_statistics_df.csv', index_col=0)

# Add a shooting accuracy feature:
train_home_team_statistics_df['Shooting_Accuracy'] = train_home_team_statistics_df['TEAM_SHOTS_ON_TARGET_season_sum'] / train_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum']
train_away_team_statistics_df['Shooting_Accuracy'] = train_away_team_statistics_df['TEAM_SHOTS_ON_TARGET_season_sum'] / train_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum']

# Passing Accuracy:
train_home_team_statistics_df['Passing_Accuracy'] = train_home_team_statistics_df['TEAM_SUCCESSFUL_PASSES_season_sum'] / train_home_team_statistics_df['TEAM_PASSES_season_sum']
train_away_team_statistics_df['Passing_Accuracy'] = train_away_team_statistics_df['TEAM_SUCCESSFUL_PASSES_season_sum'] / train_away_team_statistics_df['TEAM_PASSES_season_sum']

# Save Per Game Feature:
train_home_team_statistics_df['Save_Per_Game'] = train_home_team_statistics_df['TEAM_SAVES_season_sum'] / \
    (train_home_team_statistics_df['TEAM_GAME_WON_season_sum'] + train_home_team_statistics_df['TEAM_GAME_LOST_season_sum'] + train_home_team_statistics_df['TEAM_GAME_DRAW_season_sum'])
train_away_team_statistics_df['Save_Per_Game'] = train_away_team_statistics_df['TEAM_SAVES_season_sum'] / \
      (train_away_team_statistics_df['TEAM_GAME_WON_season_sum'] + train_away_team_statistics_df['TEAM_GAME_LOST_season_sum'] + train_away_team_statistics_df['TEAM_GAME_DRAW_season_sum'])

train_home_team_statistics_df['Attacking_Intensity'] = train_home_team_statistics_df['TEAM_SHOTS_INSIDEBOX_season_sum'] + train_home_team_statistics_df['TEAM_SHOTS_OUTSIDEBOX_season_sum']
train_away_team_statistics_df['Attacking_Intensity'] = train_away_team_statistics_df['TEAM_SHOTS_INSIDEBOX_season_sum'] + train_away_team_statistics_df['TEAM_SHOTS_OUTSIDEBOX_season_sum']

# Calculate Recent Performance
train_home_team_statistics_df['RECENT_PERFORMANCE'] = train_home_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + train_home_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] / (train_home_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + train_home_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] + train_home_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum'])
train_away_team_statistics_df['RECENT_PERFORMANCE'] = train_away_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + train_away_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] / (train_away_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + train_away_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] + train_away_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum'])

# Calculate Team Form
train_home_team_statistics_df['TEAM_FORM'] = (train_home_team_statistics_df['TEAM_GAME_WON_season_sum'] - train_home_team_statistics_df['TEAM_GAME_LOST_season_sum']) / train_home_team_statistics_df['TEAM_GAME_WON_season_sum']
train_away_team_statistics_df['TEAM_FORM'] = (train_away_team_statistics_df['TEAM_GAME_WON_season_sum'] - train_away_team_statistics_df['TEAM_GAME_LOST_season_sum']) / train_away_team_statistics_df['TEAM_GAME_WON_season_sum']

# Defensive Strength
train_home_team_statistics_df['DEFENSIVE_STRENGTH'] = train_home_team_statistics_df['TEAM_SAVES_season_sum'] + train_home_team_statistics_df['TEAM_YELLOWCARDS_season_sum'] + train_home_team_statistics_df['TEAM_REDCARDS_season_sum']
train_away_team_statistics_df['DEFENSIVE_STRENGTH'] = train_away_team_statistics_df['TEAM_SAVES_season_sum'] + train_away_team_statistics_df['TEAM_YELLOWCARDS_season_sum'] + train_away_team_statistics_df['TEAM_REDCARDS_season_sum']

# Offensive Strength
train_home_team_statistics_df['OFFENSIVE_STRENGTH'] = train_home_team_statistics_df['TEAM_CORNERS_season_sum'] + train_home_team_statistics_df['TEAM_PENALTIES_season_sum'] + train_home_team_statistics_df['TEAM_SUBSTITUTIONS_season_sum']
train_home_team_statistics_df['OFFENSIVE_STRENGTH'] = train_home_team_statistics_df['TEAM_CORNERS_season_sum'] + train_home_team_statistics_df['TEAM_PENALTIES_season_sum'] + train_home_team_statistics_df['TEAM_SUBSTITUTIONS_season_sum']


# Recent Performance Trend
train_home_team_statistics_df['SHOTS_TREND'] = train_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - train_home_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']
train_away_team_statistics_df['SHOTS_TREND'] = train_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - train_home_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']

# Standardized Metrics (hypothetical example)
train_home_team_statistics_df['STANDARDIZED_SHOTS'] = (train_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - train_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].mean()) / train_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].std()
train_away_team_statistics_df['STANDARDIZED_SHOTS'] = (train_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - train_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].mean()) / train_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].std()

# Relative Metrics
train_home_team_statistics_df['SHOTS_DIFF'] = train_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - train_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum']


# Win/Loss Ratio
train_home_team_statistics_df['WIN_LOSS_RATIO'] = train_home_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] / train_home_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum']
train_away_team_statistics_df['WIN_LOSS_RATIO'] = train_away_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] / train_away_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum']


# Shots Efficiency
train_home_team_statistics_df['SHOTS_EFFICIENCY_LAST_5'] = train_home_team_statistics_df['TEAM_SHOTS_ON_TARGET_5_last_match_sum'] / train_away_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']
train_away_team_statistics_df['SHOTS_EFFICIENCY_LAST_5'] = train_away_team_statistics_df['TEAM_SHOTS_ON_TARGET_5_last_match_sum'] / train_away_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']

# Passing Accuracy in Last 5 Matches
train_home_team_statistics_df['PASS_ACCURACY_LAST_5'] = train_home_team_statistics_df['TEAM_SUCCESSFUL_PASSES_5_last_match_sum'] / train_home_team_statistics_df['TEAM_PASSES_5_last_match_sum']
train_away_team_statistics_df['PASS_ACCURACY_LAST_5'] = train_away_team_statistics_df['TEAM_SUCCESSFUL_PASSES_5_last_match_sum'] / train_away_team_statistics_df['TEAM_PASSES_5_last_match_sum']

# Defensive Strength in Last 5 Matches
train_home_team_statistics_df['DEFENSIVE_STRENGTH_LAST_5'] = train_home_team_statistics_df['TEAM_SAVES_5_last_match_sum'] + train_home_team_statistics_df['TEAM_YELLOWCARDS_5_last_match_sum'] + train_home_team_statistics_df['TEAM_REDCARDS_5_last_match_sum']
train_away_team_statistics_df['DEFENSIVE_STRENGTH_LAST_5'] = train_away_team_statistics_df['TEAM_SAVES_5_last_match_sum'] + train_away_team_statistics_df['TEAM_YELLOWCARDS_5_last_match_sum'] + train_away_team_statistics_df['TEAM_REDCARDS_5_last_match_sum']

# Offensive Strength in Last 5 Matches
train_home_team_statistics_df['OFFENSIVE_STRENGTH_LAST_5'] = train_home_team_statistics_df['TEAM_CORNERS_5_last_match_sum'] + train_home_team_statistics_df['TEAM_PENALTIES_5_last_match_sum'] + train_home_team_statistics_df['TEAM_SUBSTITUTIONS_5_last_match_sum']
train_away_team_statistics_df['OFFENSIVE_STRENGTH_LAST_5'] = train_away_team_statistics_df['TEAM_CORNERS_5_last_match_sum'] + train_away_team_statistics_df['TEAM_PENALTIES_5_last_match_sum'] + train_away_team_statistics_df['TEAM_SUBSTITUTIONS_5_last_match_sum']


train_home_team = train_home_team_statistics_df.iloc[:,2:]
train_away_team = train_away_team_statistics_df.iloc[:,2:]


**Player Data And Feature Engineering**

In [9]:
train_home_player_statistics_df = pd.read_csv(r'Train_Data\train_home_player_statistics_df.csv', index_col=0)
train_away_player_statistics_df = pd.read_csv(r'Train_Data\train_away_player_statistics_df.csv', index_col=0)

train_home_player = train_home_player_statistics_df.iloc[:,4:]
train_away_player = train_away_player_statistics_df.iloc[:,4:] 


In [10]:
def create_feature_player(X_train):
    # Player Form
    X_train['PLAYER_FORM_LAST_5'] = X_train['PLAYER_GOALS_5_last_match_sum'] + X_train['PLAYER_ASSISTS_5_last_match_sum']

    # Defensive Contribution
    X_train['PLAYER_DEFENSIVE_CONTRIBUTION_LAST_5'] = (X_train['PLAYER_INTERCEPTIONS_5_last_match_sum'] +
                                                    X_train['PLAYER_TACKLES_5_last_match_sum'] +
                                                    X_train['PLAYER_CLEARANCES_5_last_match_sum'])

    # Attacking Threat
    X_train['PLAYER_ATTACKING_THREAT_LAST_5'] = (X_train['PLAYER_SHOTS_ON_TARGET_5_last_match_sum'] +
                                                X_train['PLAYER_KEY_PASSES_5_last_match_sum'])

    # Discipline
    X_train['PLAYER_DISCIPLINE_LAST_5'] = (X_train['PLAYER_YELLOWCARDS_5_last_match_sum'] +
                                        X_train['PLAYER_REDCARDS_5_last_match_sum'])
    return X_train

train_home_player = create_feature_player(train_home_player.replace({np.nan: 0}))
train_away_player = create_feature_player(train_away_player.replace({np.nan: 0}))

col = ['PLAYER_FORM_LAST_5', 'PLAYER_DEFENSIVE_CONTRIBUTION_LAST_5', 'PLAYER_ATTACKING_THREAT_LAST_5', 'PLAYER_DISCIPLINE_LAST_5']

def add_feature_player(df, target_df):
    for c in df.columns:
        # First, calculate the total goals for each team
        moyenne = df.groupby(df.index)[c].mean()
        
        var = df.groupby(df.index)[c].std()

        target_df[c] = moyenne
        # target_df[f'{c}_std'] = var
    return target_df


train_home_team = add_feature_player(train_home_player.replace({np.nan: 0}), train_home_team)
train_away_team = add_feature_player(train_away_player.replace({np.nan: 0}), train_away_team)


train_home_team.columns = 'HOME_' + train_home_team.columns
train_away_team.columns = 'AWAY_' + train_away_team.columns


In [96]:

train_scores = pd.read_csv(r'Y_train_1rknArQ.csv', index_col=0)


train_data =  pd.concat([train_home_team,train_away_team],join='inner',axis=1)
train_scores = train_scores.loc[train_data.index]

train_data = train_data.replace({np.inf:np.nan,-np.inf:np.nan})

'''scaler = MinMaxScaler()

train_data = scaler.fit_transform(train_data)
'''

'scaler = MinMaxScaler()\n\ntrain_data = scaler.fit_transform(train_data)\n'

In [93]:
train_data = pd.DataFrame(train_data)

In [28]:
labels = []
encoding = {'HOME_WINS': 1,
            'DRAW': 0,
            'AWAY_WINS': -1}
for idx, row in train_scores.iterrows():
    if row['HOME_WINS'] == 1:
        labels.append(1)
    elif row['DRAW'] == 1:
        labels.append(0)
    else:
        labels.append(2)

labels = pd.DataFrame(labels)

# Random Forest Classifier: 

In [103]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Assuming train_data is your training data and labels are your class labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data, labels, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
scaler.fit(X_train.replace({np.nan: 0}))
X_train = scaler.transform(X_train.replace({np.nan: 0}))
X_test = scaler.transform(X_test.replace({np.nan: 0}))

# Initialize the SVC classifier
svm_classifier = SVC(kernel='rbf', C=0.245)  # You can also use other kernels like 'rbf', 'poly', etc.

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = svm_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.49939049167005284


In [109]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, labels, train_size=0.8, random_state=42)
#X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, train_size=0.8, random_state=42)

scaler = MinMaxScaler()
scaler.fit(X_train.replace({np.nan: 0}))
X_train = scaler.transform(X_train.replace({np.nan: 0}))
X_test = scaler.transform(X_test.replace({np.nan: 0}))

In [125]:
clf = RandomForestClassifier(n_estimators=73, max_depth=8, criterion="gini", min_samples_split=26, random_state=42)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
np.round(accuracy_score(prediction, y_test),4)

0.4982

In [127]:
clf = RandomForestClassifier(n_estimators=79, max_depth=9, criterion="gini", min_samples_split=26, random_state=42)
clf.fit(X_train.replace({np.nan:0}), y_train)
prediction = clf.predict(X_test.replace({np.nan: 0}))
np.round(accuracy_score(prediction, y_test),4)

0.4953

In [129]:
from sklearn.model_selection import cross_val_score

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=79, max_depth=9, criterion="gini", min_samples_split=26, random_state=42)

# Train-test split 
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, labels, train_size=0.8, random_state=42)
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, train_size=0.8, random_state=42)
# Perform cross-validation
cv_scores = cross_val_score(clf, train_data.replace({np.nan: 0}), labels, cv=10)  # 5-fold cross-validation

# Fit the model
clf.fit(X_train.replace({np.nan: 0}), y_train)

# Make predictions on the test set
prediction = clf.predict(X_test.replace({np.nan: 0}))

# Calculate accuracy
test_accuracy = accuracy_score(prediction, y_test)

# Print cross-validation scores and test accuracy
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

print("Test accuracy:", np.round(test_accuracy, 4))


Cross-validation scores: [0.49634444 0.48822096 0.48822096 0.48780488 0.48617886 0.49837398
 0.4902439  0.49593496 0.51138211 0.48292683]
Mean CV accuracy: 0.49256318810141797
Test accuracy: 0.4953


In [61]:
from sklearn.model_selection import cross_val_score

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=75, max_depth=7, criterion="gini", min_samples_split=26, random_state=42)
# Train-test split 
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, labels, train_size=0.8, random_state=42)
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, train_size=0.8, random_state=42)
# Perform cross-validation
cv_scores = cross_val_score(clf, train_data.replace({np.nan: 0}), labels, cv=10)  # 5-fold cross-validation

# Fit the model
clf.fit(X_train.replace({np.nan: 0}), y_train)

# Make predictions on the test set
prediction = clf.predict(X_test.replace({np.nan: 0}))

# Calculate accuracy
test_accuracy = accuracy_score(prediction, y_test)

# Print cross-validation scores and test accuracy
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

print("Test accuracy:", np.round(test_accuracy, 4))


Cross-validation scores: [0.49147035 0.47603574 0.49553209 0.49268293 0.48780488 0.49105691
 0.49674797 0.49512195 0.51788618 0.48861789]
Mean CV accuracy: 0.49329568795281775
Test accuracy: 0.4965


In [162]:
from sklearn.model_selection import cross_val_score

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=79, max_depth=10, criterion="gini", min_samples_split=25, random_state=42)

# Train-test split 
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, labels, train_size=0.8, random_state=42)
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, train_size=0.8, random_state=42)
# Perform cross-validation
cv_scores = cross_val_score(clf, train_data.replace({np.nan: 0}), labels, cv=4)  # 5-fold cross-validation

# Fit the model
clf.fit(X_train.replace({np.nan: 0}), y_train)

# Make predictions on the test set
prediction = clf.predict(X_test.replace({np.nan: 0}))

# Calculate accuracy
test_accuracy = accuracy_score(prediction, y_test)

# Print cross-validation scores and test accuracy
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))
print("Test accuracy:", np.round(test_accuracy, 4))

Cross-validation scores: [0.48342003 0.48992198 0.49642393 0.50081301]
Mean CV accuracy: 0.49264473447725377
Test accuracy: 0.497


In [113]:
from sklearn.model_selection import cross_val_score

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=60, max_depth=8, criterion="gini", min_samples_split=18, random_state=42)

# Train-test split 
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, labels, train_size=0.8, random_state=42)
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, train_size=0.8, random_state=42)
# Perform cross-validation
cv_scores = cross_val_score(clf, train_data.replace({np.nan: 0}), labels, cv=4)  # 5-fold cross-validation

# Fit the model
clf.fit(X_train.replace({np.nan: 0}), y_train)

# Make predictions on the test set
prediction = clf.predict(X_test.replace({np.nan: 0}))

# Calculate accuracy
test_accuracy = accuracy_score(prediction, y_test)

# Print cross-validation scores and test accuracy
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))
print("Test accuracy:", np.round(test_accuracy, 4))


Cross-validation scores: [0.48244473 0.48862159 0.49479844 0.49138211]
Mean CV accuracy: 0.4893117183122417
Test accuracy: 0.4978


In [132]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming you have your data loaded into X and y

# Step 1: Split the data into training, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(train_data, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Step 2: Train RF and XGBoost Models Independently

rf_model = RandomForestClassifier(n_estimators=79, max_depth=11, criterion="gini", min_samples_split=25, random_state=42)
xgb_model = xgb.XGBClassifier(n_estimators=15, eta=0, booster='gbtree',
                         tree_method='hist', max_depth=4, learning_rate=0.09, objective='multi:softprob',
                            num_class=3, eval_metric='mlogloss', random_state=42)

rf_model.fit(X_train.replace({np.nan: 0}), y_train)
xgb_model.fit(X_train.replace({np.nan: 0}), y_train)

# Step 3: Make Predictions on the Validation Set
rf_pred_val = rf_model.predict(X_val.replace({np.nan: 0}))
xgb_pred_val = xgb_model.predict(X_val.replace({np.nan: 0}))

# Step 4: Stacking/Blending
stacked_features_val = np.column_stack((rf_pred_val, xgb_pred_val))

# Step 5: Train Meta-Model
meta_model = RandomForestClassifier(n_estimators=8, max_depth=2, min_samples_split=3, random_state=42)
meta_model.fit(stacked_features_val, y_val)

# Step 6: Make Final Predictions on the Test Set
rf_pred_test = rf_model.predict(X_test.replace({np.nan: 0}))
xgb_pred_test = xgb_model.predict(X_test.replace({np.nan: 0}))
stacked_features_test = np.column_stack((rf_pred_test, xgb_pred_test))
final_predictions = meta_model.predict(stacked_features_test)

# Evaluate the final predictions
accuracy = accuracy_score(final_predictions, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.497765136123527


In [111]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, labels, train_size=0.8, random_state=42)

prediction = clf.predict(X_test.replace({np.nan: 0}))
np.round(accuracy_score(prediction, y_test),4)

0.4978

In [153]:
# Instantiate XGBClassifier
clf = xgb.XGBClassifier(n_estimators=20, eta=0, booster='gbtree',
                         tree_method='hist', max_depth=4, learning_rate=0.09, objective='multi:softprob',
                            num_class=3, eval_metric='mlogloss', random_state=42)
# Fit the model
clf.fit(X_train.replace({np.nan: 0}), y_train)

# Make predictions on the test set
prediction = clf.predict(X_test.replace({np.nan: 0}))

# Calculate accuracy
test_accuracy = accuracy_score(prediction, y_test)

print(np.round(test_accuracy, 4))

0.4961


In [49]:
# Instantiate XGBClassifier
clf = xgb.XGBClassifier(n_estimators=20, eta=0, booster='gbtree',
                         tree_method='hist', max_depth=4, learning_rate=0.09, objective='multi:softprob',
                            num_class=3, eval_metric='mlogloss', random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(clf, train_data.replace({np.nan: 0}), labels, cv=10)  # 5-fold cross-validation

# Fit the model
clf.fit(X_train.replace({np.nan: 0}), y_train)

# Make predictions on the test set
prediction = clf.predict(X_test.replace({np.nan: 0}))

# Calculate accuracy
test_accuracy = accuracy_score(prediction, y_test)

# Print cross-validation scores and test accuracy
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))
print("Test accuracy:", np.round(test_accuracy, 4))


Cross-validation scores: [0.49878148 0.47684809 0.47928513 0.48861789 0.47804878 0.48780488
 0.49268293 0.48536585 0.50406504 0.48130081]
Mean CV accuracy: 0.4872800882354884
Test accuracy: 0.4961


In [186]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data, labels_encoded, test_size=0.2, random_state=42)

# Get the number of features (columns) in the training data
n_features = X_train.shape[1]

# Define the model
model = Sequential()
model.add(Dense(15, activation='relu', input_shape=(n_features,)))
model.add(Dense(3, activation='softmax'))  # Output layer with 3 neurons for 3 classes

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=100)

# Make predictions on the test set
predictions = model.predict(X_test)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [94]:
predictions = pd.DataFrame(np.zeros((len(prediction), 3)), columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'])
encoding = {1:'HOME_WINS' ,
             0: 'DRAW',
            2: 'AWAY_WINS'}
for i, p in enumerate(prediction):
    predictions.loc[i, encoding[p]] = 1
predictions = predictions.astype(int)

In [62]:
test_home_team_statistics_df = pd.read_csv(r'Test_Data\test_home_team_statistics_df.csv', index_col=0)
test_away_team_statistics_df = pd.read_csv(r'Test_Data\test_away_team_statistics_df.csv', index_col=0)

# Add a shooting accuracy feature:
test_home_team_statistics_df['Shooting_Accuracy'] = test_home_team_statistics_df['TEAM_SHOTS_ON_TARGET_season_sum'] / test_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum']
test_away_team_statistics_df['Shooting_Accuracy'] = test_away_team_statistics_df['TEAM_SHOTS_ON_TARGET_season_sum'] / test_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum']

# Passing Accuracy:
test_home_team_statistics_df['Passing_Accuracy'] = test_home_team_statistics_df['TEAM_SUCCESSFUL_PASSES_season_sum'] / test_home_team_statistics_df['TEAM_PASSES_season_sum']
test_away_team_statistics_df['Passing_Accuracy'] = test_away_team_statistics_df['TEAM_SUCCESSFUL_PASSES_season_sum'] / test_away_team_statistics_df['TEAM_PASSES_season_sum']

# Save Per Game Feature:
test_home_team_statistics_df['Save_Per_Game'] = test_home_team_statistics_df['TEAM_SAVES_season_sum'] / \
    (test_home_team_statistics_df['TEAM_GAME_WON_season_sum'] + test_home_team_statistics_df['TEAM_GAME_LOST_season_sum'] + test_home_team_statistics_df['TEAM_GAME_DRAW_season_sum'])
test_away_team_statistics_df['Save_Per_Game'] = test_away_team_statistics_df['TEAM_SAVES_season_sum'] / \
      (test_away_team_statistics_df['TEAM_GAME_WON_season_sum'] + test_away_team_statistics_df['TEAM_GAME_LOST_season_sum'] + test_away_team_statistics_df['TEAM_GAME_DRAW_season_sum'])

test_home_team_statistics_df['Attacking_Intensity'] = test_home_team_statistics_df['TEAM_SHOTS_INSIDEBOX_season_sum'] + test_home_team_statistics_df['TEAM_SHOTS_OUTSIDEBOX_season_sum']
test_away_team_statistics_df['Attacking_Intensity'] = test_away_team_statistics_df['TEAM_SHOTS_INSIDEBOX_season_sum'] + test_away_team_statistics_df['TEAM_SHOTS_OUTSIDEBOX_season_sum']

# Calculate Recent Performance
test_home_team_statistics_df['RECENT_PERFORMANCE'] = test_home_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + test_home_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] / (test_home_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + test_home_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] + test_home_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum'])
test_away_team_statistics_df['RECENT_PERFORMANCE'] = test_away_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + test_away_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] / (test_away_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] + test_away_team_statistics_df['TEAM_GAME_DRAW_5_last_match_sum'] + test_away_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum'])

# Calculate Team Form
test_home_team_statistics_df['TEAM_FORM'] = (test_home_team_statistics_df['TEAM_GAME_WON_season_sum'] - test_home_team_statistics_df['TEAM_GAME_LOST_season_sum']) / test_home_team_statistics_df['TEAM_GAME_WON_season_sum']
test_away_team_statistics_df['TEAM_FORM'] = (test_away_team_statistics_df['TEAM_GAME_WON_season_sum'] - test_away_team_statistics_df['TEAM_GAME_LOST_season_sum']) / test_away_team_statistics_df['TEAM_GAME_WON_season_sum']

# Defensive Strength
test_home_team_statistics_df['DEFENSIVE_STRENGTH'] = test_home_team_statistics_df['TEAM_SAVES_season_sum'] + test_home_team_statistics_df['TEAM_YELLOWCARDS_season_sum'] + test_home_team_statistics_df['TEAM_REDCARDS_season_sum']
test_away_team_statistics_df['DEFENSIVE_STRENGTH'] = test_away_team_statistics_df['TEAM_SAVES_season_sum'] + test_away_team_statistics_df['TEAM_YELLOWCARDS_season_sum'] + test_away_team_statistics_df['TEAM_REDCARDS_season_sum']

# Offensive Strength
test_home_team_statistics_df['OFFENSIVE_STRENGTH'] = test_home_team_statistics_df['TEAM_CORNERS_season_sum'] + test_home_team_statistics_df['TEAM_PENALTIES_season_sum'] + test_home_team_statistics_df['TEAM_SUBSTITUTIONS_season_sum']
test_home_team_statistics_df['OFFENSIVE_STRENGTH'] = test_home_team_statistics_df['TEAM_CORNERS_season_sum'] + test_home_team_statistics_df['TEAM_PENALTIES_season_sum'] + test_home_team_statistics_df['TEAM_SUBSTITUTIONS_season_sum']


# Recent Performance Trend
test_home_team_statistics_df['SHOTS_TREND'] = test_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - test_home_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']
test_away_team_statistics_df['SHOTS_TREND'] = test_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - test_home_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']

# Standardized Metrics (hypothetical example)
test_home_team_statistics_df['STANDARDIZED_SHOTS'] = (test_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - test_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].mean()) / test_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].std()
test_away_team_statistics_df['STANDARDIZED_SHOTS'] = (test_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - test_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].mean()) / test_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'].std()

# Relative Metrics
test_home_team_statistics_df['SHOTS_DIFF'] = test_home_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum'] - test_away_team_statistics_df['TEAM_SHOTS_TOTAL_season_sum']


# Win/Loss Ratio
test_home_team_statistics_df['WIN_LOSS_RATIO'] = test_home_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] / test_home_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum']
test_away_team_statistics_df['WIN_LOSS_RATIO'] = test_away_team_statistics_df['TEAM_GAME_WON_5_last_match_sum'] / test_away_team_statistics_df['TEAM_GAME_LOST_5_last_match_sum']


# Shots Efficiency
test_home_team_statistics_df['SHOTS_EFFICIENCY_LAST_5'] = test_home_team_statistics_df['TEAM_SHOTS_ON_TARGET_5_last_match_sum'] / test_away_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']
test_away_team_statistics_df['SHOTS_EFFICIENCY_LAST_5'] = test_away_team_statistics_df['TEAM_SHOTS_ON_TARGET_5_last_match_sum'] / test_away_team_statistics_df['TEAM_SHOTS_TOTAL_5_last_match_sum']

# Passing Accuracy in Last 5 Matches
test_home_team_statistics_df['PASS_ACCURACY_LAST_5'] = test_home_team_statistics_df['TEAM_SUCCESSFUL_PASSES_5_last_match_sum'] / test_home_team_statistics_df['TEAM_PASSES_5_last_match_sum']
test_away_team_statistics_df['PASS_ACCURACY_LAST_5'] = test_away_team_statistics_df['TEAM_SUCCESSFUL_PASSES_5_last_match_sum'] / test_away_team_statistics_df['TEAM_PASSES_5_last_match_sum']

# Defensive Strength in Last 5 Matches
test_home_team_statistics_df['DEFENSIVE_STRENGTH_LAST_5'] = test_home_team_statistics_df['TEAM_SAVES_5_last_match_sum'] + test_home_team_statistics_df['TEAM_YELLOWCARDS_5_last_match_sum'] + test_home_team_statistics_df['TEAM_REDCARDS_5_last_match_sum']
test_away_team_statistics_df['DEFENSIVE_STRENGTH_LAST_5'] = test_away_team_statistics_df['TEAM_SAVES_5_last_match_sum'] + test_away_team_statistics_df['TEAM_YELLOWCARDS_5_last_match_sum'] + test_away_team_statistics_df['TEAM_REDCARDS_5_last_match_sum']

# Offensive Strength in Last 5 Matches
test_home_team_statistics_df['OFFENSIVE_STRENGTH_LAST_5'] = test_home_team_statistics_df['TEAM_CORNERS_5_last_match_sum'] + test_home_team_statistics_df['TEAM_PENALTIES_5_last_match_sum'] + test_home_team_statistics_df['TEAM_SUBSTITUTIONS_5_last_match_sum']
test_away_team_statistics_df['OFFENSIVE_STRENGTH_LAST_5'] = test_away_team_statistics_df['TEAM_CORNERS_5_last_match_sum'] + test_away_team_statistics_df['TEAM_PENALTIES_5_last_match_sum'] + test_away_team_statistics_df['TEAM_SUBSTITUTIONS_5_last_match_sum']

In [63]:
test_home_team = test_home_team_statistics_df
test_away_team = test_away_team_statistics_df

test_home_player_statistics_df = pd.read_csv(r'Test_Data\test_home_player_statistics_df.csv', index_col=0)
test_away_player_statistics_df = pd.read_csv(r'Test_Data\test_away_player_statistics_df.csv', index_col=0)

test_home_player = test_home_player_statistics_df.iloc[:,1:]
test_away_player = test_away_player_statistics_df.iloc[:,1:] 

def create_feature_player(X_train):
    # Player Form
    X_train['PLAYER_FORM_LAST_5'] = X_train['PLAYER_GOALS_5_last_match_sum'] + X_train['PLAYER_ASSISTS_5_last_match_sum']

    # Defensive Contribution
    X_train['PLAYER_DEFENSIVE_CONTRIBUTION_LAST_5'] = (X_train['PLAYER_INTERCEPTIONS_5_last_match_sum'] +
                                                    X_train['PLAYER_TACKLES_5_last_match_sum'] +
                                                    X_train['PLAYER_CLEARANCES_5_last_match_sum'])

    # Attacking Threat
    X_train['PLAYER_ATTACKING_THREAT_LAST_5'] = (X_train['PLAYER_SHOTS_ON_TARGET_5_last_match_sum'] +
                                                X_train['PLAYER_KEY_PASSES_5_last_match_sum'])

    # Discipline
    X_train['PLAYER_DISCIPLINE_LAST_5'] = (X_train['PLAYER_YELLOWCARDS_5_last_match_sum'] +
                                        X_train['PLAYER_REDCARDS_5_last_match_sum'])
    return X_train

test_home_player = create_feature_player(test_home_player.replace({np.nan: 0}))
test_away_player = create_feature_player(test_away_player.replace({np.nan: 0}))

col = ['PLAYER_FORM_LAST_5', 'PLAYER_DEFENSIVE_CONTRIBUTION_LAST_5', 'PLAYER_ATTACKING_THREAT_LAST_5', 'PLAYER_DISCIPLINE_LAST_5']

def add_feature_player(df, target_df):
    for c in df.columns:
        # First, calculate the total goals for each team
        moyenne = df.groupby(df.index)[c].mean()

        var = df.groupby(df.index)[c].std()

        target_df[c] = moyenne
        # target_df[f'{c}_std'] = var

    return target_df


test_home_team = add_feature_player(test_home_player.replace({np.nan: 0}), test_home_team)
test_away_team = add_feature_player(test_away_player.replace({np.nan: 0}), test_away_team)


test_home_team.columns = 'HOME_' + test_home_team.columns
test_away_team.columns = 'AWAY_' + test_away_team.columns


In [70]:
test_data =  pd.concat([test_home_team,test_away_team], join='inner', axis=1)

test_data = test_data.replace({np.inf:np.nan,-np.inf:np.nan})

ind = test_data.index

test_data = pd.DataFrame(scaler.transform(test_data))

In [104]:
# Initialize the SVC classifier
svm_classifier = SVC(kernel='rbf', C=0.245) # You can also use other kernels like 'rbf', 'poly', etc.

scaler = MinMaxScaler()
scaler.fit(train_data.replace({np.nan: 0}))
train_data_scaled = scaler.transform(train_data.replace({np.nan: 0}))
test_data_sclaed = scaler.transform(test_data.replace({np.nan: 0}))

# Train the classifier on the training data
svm_classifier.fit(train_data_scaled, labels)

# Make predictions on the testing data
y_pred = svm_classifier.predict(test_data_sclaed)

In [83]:
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [106]:
predictions = pd.DataFrame(np.zeros((len(y_pred), 3)), columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'])
encoding = {1:'HOME_WINS' ,
             0: 'DRAW',
            2: 'AWAY_WINS'}
for i, p in enumerate(y_pred):
    predictions.loc[i, encoding[p]] = 1
predictions = predictions.astype(int)

In [137]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming you have your data loaded into X and y

# Step 1: Split the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data, labels, test_size=0.2, random_state=42)

# Step 2: Train RF and XGBoost Models Independently

rf_model = RandomForestClassifier(n_estimators=79, max_depth=11, criterion="gini", min_samples_split=25, random_state=42)
xgb_model = xgb.XGBClassifier(n_estimators=15, eta=0, booster='gbtree',
                         tree_method='hist', max_depth=4, learning_rate=0.09, objective='multi:softprob',
                            num_class=3, eval_metric='mlogloss', random_state=42)

rf_model.fit(X_train.replace({np.nan: 0}), y_train)
xgb_model.fit(X_train.replace({np.nan: 0}), y_train)

# Step 3: Make Predictions on the Validation Set
rf_pred_val = rf_model.predict(X_test.replace({np.nan: 0}))
xgb_pred_val = xgb_model.predict(X_test.replace({np.nan: 0}))

# Step 4: Stacking/Blending
stacked_features_val = np.column_stack((rf_pred_val, xgb_pred_val))

# Step 5: Train Meta-Model
meta_model = RandomForestClassifier(n_estimators=8, max_depth=2, min_samples_split=3, random_state=42)
meta_model.fit(stacked_features_val, y_val)

# Step 6: Make Final Predictions on the Test Set
rf_pred_test = rf_model.predict(test_data.replace({np.nan: 0}))
xgb_pred_test = xgb_model.predict(test_data.replace({np.nan: 0}))
stacked_features_test = np.column_stack((rf_pred_test, xgb_pred_test))
final_predictions = meta_model.predict(stacked_features_test)


**Predicitons Using RandomForest**

In [126]:
scaler = MinMaxScaler()
scaler.fit(train_data.replace({np.nan: 0}))
train_data_scaled = scaler.transform(train_data.replace({np.nan: 0}))
test_data_scaled = scaler.transform(test_data.replace({np.nan: 0}))

clf = RandomForestClassifier(n_estimators=73, max_depth=8, criterion="gini", min_samples_split=26, random_state=42)

clf.fit(train_data_scaled, labels)
prediction = clf.predict(test_data_scaled)
predictions = pd.DataFrame(np.zeros((len(prediction), 3)), columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'])
encoding = {1:'HOME_WINS' ,
             0: 'DRAW',
            2: 'AWAY_WINS'}
for i, p in enumerate(prediction):
    predictions.loc[i, encoding[p]] = 1
predictions = predictions.astype(int)

In [135]:
train_data

Unnamed: 0_level_0,HOME_TEAM_SHOTS_TOTAL_season_sum,HOME_TEAM_SHOTS_INSIDEBOX_season_sum,HOME_TEAM_SHOTS_OFF_TARGET_season_sum,HOME_TEAM_SHOTS_ON_TARGET_season_sum,HOME_TEAM_SHOTS_OUTSIDEBOX_season_sum,HOME_TEAM_PASSES_season_sum,HOME_TEAM_SUCCESSFUL_PASSES_season_sum,HOME_TEAM_SAVES_season_sum,HOME_TEAM_CORNERS_season_sum,HOME_TEAM_FOULS_season_sum,...,AWAY_PLAYER_TOTAL_DUELS_5_last_match_std,AWAY_PLAYER_YELLOWCARDS_5_last_match_std,AWAY_PLAYER_PUNCHES_5_last_match_std,AWAY_PLAYER_LONG_BALLS_5_last_match_std,AWAY_PLAYER_LONG_BALLS_WON_5_last_match_std,AWAY_PLAYER_SHOTS_OFF_TARGET_5_last_match_std,AWAY_PLAYER_FORM_LAST_5,AWAY_PLAYER_DEFENSIVE_CONTRIBUTION_LAST_5,AWAY_PLAYER_ATTACKING_THREAT_LAST_5,AWAY_PLAYER_DISCIPLINE_LAST_5
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,2.0,5.0,2.0,1.0,2.0,2.0,5.0,3.0,6.0,...,30.666667,37.333333,0.0,0.0,0.0,0.0,11.388889,52.611111,19.777778,27.777778
1,6.0,8.0,3.0,6.0,5.0,8.0,7.0,10.0,6.0,8.0,...,32.333333,29.166667,0.0,0.0,0.0,0.0,6.111111,63.722222,16.222222,18.388889
2,4.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0,2.0,7.0,...,32.086957,35.913043,0.0,0.0,0.0,0.0,10.695652,50.304348,24.565217,19.130435
3,7.0,5.0,5.0,6.0,6.0,9.0,9.0,2.0,2.0,0.0,...,27.000000,18.277778,0.0,0.0,0.0,0.0,14.611111,47.777778,20.222222,23.944444
4,3.0,3.0,2.0,3.0,4.0,4.0,3.0,4.0,4.0,7.0,...,22.666667,21.388889,0.0,0.0,0.0,0.0,19.666667,41.833333,37.722222,9.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,4.0,2.0,3.0,4.0,7.0,4.0,4.0,3.0,2.0,2.0,...,22.833333,31.555556,0.0,0.0,0.0,0.0,16.111111,54.277778,37.777778,25.722222
12299,4.0,2.0,3.0,1.0,5.0,1.0,1.0,9.0,1.0,10.0,...,24.833333,26.055556,0.0,0.0,0.0,0.0,5.833333,65.444444,16.388889,19.444444
12300,4.0,3.0,5.0,3.0,5.0,1.0,1.0,6.0,1.0,8.0,...,17.944444,20.611111,0.0,0.0,0.0,0.0,12.500000,57.555556,30.722222,15.277778
12301,2.0,,1.0,1.0,,,,0.0,4.0,2.0,...,0.000000,25.277778,0.0,0.0,0.0,0.0,26.222222,0.000000,0.000000,11.111111


In [132]:
pd.DataFrame(test_data_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,912,913,914,915,916,917,918,919,920,921
0,0.03,0.06,0.05,0.01,0.03,0.09,0.08,0.10,0.06,0.07,...,0.009251,0.004871,0.010569,0.000000,0.000000,0.000000,0.001311,0.004342,0.007205,0.008676
1,0.03,0.02,0.04,0.02,0.06,0.01,0.01,0.02,0.04,0.10,...,0.012480,0.003400,0.028655,5.666667,4.944444,7.388889,0.006585,0.004566,0.008244,0.005609
2,0.07,0.10,0.08,0.08,0.01,0.04,0.05,0.00,0.10,0.04,...,0.008392,0.003400,0.014562,14.277778,15.888889,7.222222,0.005448,0.006294,0.006625,0.005609
3,0.05,0.07,0.05,0.09,0.05,0.07,0.06,0.03,0.05,0.08,...,0.008255,0.007912,0.000000,0.000000,0.000000,0.000000,0.004969,0.005387,0.006459,0.013962
4,0.05,0.04,0.06,0.03,0.05,0.00,0.00,0.02,0.02,0.09,...,0.011380,0.004287,0.016911,0.000000,0.000000,0.000000,0.002155,0.006323,0.008226,0.007284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25363,0.05,0.05,0.05,0.05,0.07,0.06,0.06,0.04,0.06,0.08,...,0.012972,0.004112,0.000000,0.000000,0.000000,0.000000,0.007822,0.006121,0.009146,0.007236
25364,0.05,0.05,0.02,0.08,0.03,0.07,0.07,0.02,0.02,0.02,...,0.006968,0.006437,0.000000,0.000000,0.000000,0.000000,0.007418,0.004627,0.009174,0.006176
25365,0.10,0.10,0.10,0.08,0.08,0.05,0.05,0.02,0.06,0.00,...,0.006997,0.004250,0.000000,15.611111,14.333333,12.166667,0.005468,0.006575,0.006846,0.008498
25366,0.04,0.03,0.05,0.03,0.07,0.07,0.07,0.00,0.04,0.00,...,0.009403,0.001658,0.000000,0.000000,0.000000,0.000000,0.003289,0.006836,0.005910,0.002277


In [127]:
predictions.index = ind
submission = predictions.reset_index()
submission.to_csv('benchmark_submissionRF.csv', index=False)

In [129]:
benchmark1 = pd.read_csv('benchmark_submissionRandomForest_Latest_(4).csv', index_col=0)
benchmark2 = pd.read_csv('benchmark_submission_RandomForest.csv', index_col=0)

benchmark4 = pd.read_csv('benchmark_submission_XGBoost.csv', index_col=0)

#accuracy_score(benchmark1, predictions)

print(accuracy_score(benchmark1, predictions))
print(accuracy_score(benchmark2, predictions))

print(accuracy_score(benchmark4, predictions))

0.7351781772311574
0.7338379060233365
0.7142857142857143
