In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [3]:
# Read the training table from a CSV file addressed relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer="Training_final_v1.0.csv")

In [4]:
# Separate the label from the predictors: `y` is the 'result' column,
# `X` is every remaining column of `df` (`df` itself is left unmodified).
y = df.loc[:, 'result']
X = df.drop('result', axis=1)

In [5]:
# Columns fed to the model, grouped by the preprocessing each group receives.
categorical_features = ['season_x', 'team_x', 'opp_team_name']
numerical_features = [
    'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
    'goals_conceded', 'goals_scored', 'ict_index', 'influence', 'minutes',
    'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards', 'saves',
    'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points',
    'transfers_balance', 'transfers_in', 'transfers_out', 'value',
    'yellow_cards', 'GW',
]
# NOTE(review): 'team_a_score' / 'team_h_score' look like final-score columns;
# if 'result' is derived from them this is target leakage — confirm.

# Define preprocessing steps for categorical and numerical features.
# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

# Combine preprocessing steps for all features. Only the columns listed above
# are routed to a transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

# Create a pipeline with preprocessing and the Random Forest Classifier.
# The forest is seeded so the fitted model, and therefore the accuracy printed
# below, is the same on every Restart & Run All.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data into training and testing sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using the pipeline; the preprocessing is fitted on the
# training split only.
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out split.
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.875


In [6]:
# training with SVM
from sklearn.svm import SVC


In [7]:
# Rebind `pipeline` to a support-vector classifier (library-default
# settings) behind the same `preprocessor` object used above.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC()),
])


In [8]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X=X_train, y=y_train)


In [9]:
# Score the fitted pipeline on the held-out split and report it.
accuracy = pipeline.score(X=X_test, y=y_test)
print("Accuracy:", accuracy)


Accuracy: 0.8702830188679245


In [10]:
# Training model with KNN
from sklearn.neighbors import KNeighborsClassifier


In [11]:
# Rebind `pipeline` once more: same `preprocessor` object, now followed by a
# k-nearest-neighbours classifier with library-default settings.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
])


In [12]:
# Fit the current pipeline (preprocessing + classifier) on the training rows.
pipeline.fit(X=X_train, y=y_train)


In [13]:
# Measure the current pipeline on the test rows and print the score.
accuracy = pipeline.score(X=X_test, y=y_test)
print("Accuracy:", accuracy)


Accuracy: 0.8160377358490566


# Training Model Using Random Forest

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [15]:
# Build a placeholder 2D input of shape (num_samples, 11 * 8) and give it a
# 3D shape of (matches, time steps, features per step).

import numpy as np

# Placeholder input: an all-zero array. Substitute the real data here.
num_samples = 3920
num_time_steps = 11
num_features = 8
X_flat = np.zeros((num_samples, num_time_steps * num_features))

# Reshape the flat array to (3920, 11, 8); no values are changed.
num_matches = len(X_flat)
X_reshaped = np.reshape(X_flat, (num_matches, num_time_steps, num_features))

In [16]:
# Collapse every axis of X_reshaped after the first into one, giving a 2D
# array with one row per match.
X_original = np.reshape(X_reshaped, (num_matches, -1))

In [17]:
# Flatten X_reshaped to 2D: one row per match, all remaining axes merged
# into the second dimension.
X_original = X_reshaped.reshape((num_matches, -1))

In [18]:
# Placeholder binary labels (0 or 1), one per match — replace with the real
# targets. They are drawn from a seeded generator so the labels, and every
# metric computed from them, are identical on each run.
# NOTE: this rebinds `y`, which earlier in the notebook held df['result'].
y = np.random.default_rng(42).integers(2, size=num_matches)

# Split data into train and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_original, y, test_size=0.2, random_state=42)

In [19]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_original,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
# Overwrite every NaN entry with 0, in place, in both splits.
np.putmask(X_train, np.isnan(X_train), 0)
np.putmask(X_test, np.isnan(X_test), 0)

In [21]:
# Seeded 100-tree random forest, fitted on the training split.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X=X_train, y=y_train)

In [22]:
# Class predictions of the fitted forest for the held-out rows.
y_pred_rf = rf_classifier.predict(X=X_test)

In [23]:
# Fraction of held-out labels the forest predicted correctly.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Test Accuracy:", accuracy_rf)

Random Forest Test Accuracy: 0.49744897959183676


# Training Using SVM Model

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Re-split directly from the flat array: 20% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_flat,
    y,
    test_size=0.2,
    random_state=42,
)

# Zero out any NaN entries in both splits (in place).
np.putmask(X_train, np.isnan(X_train), 0)
np.putmask(X_test, np.isnan(X_test), 0)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear-kernel SVM trained on the scaled training features.
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = svm_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.49744897959183676
