#import data

In [121]:
import pandas as pd

In [122]:
df=pd.read_csv('/content/dfs.csv')

In [123]:
df = df.iloc[:356]

In [124]:
# Check for NaN values in the entire DataFrame
nan_in_df = df.isna().sum().sum()

print(f'Total NaN values in DataFrame: {nan_in_df}')

# If you want to check for NaN values in each column
nan_in_each_column = df.isna().sum()

print(f'NaN values in each column:\n{nan_in_each_column}')


Total NaN values in DataFrame: 0
NaN values in each column:
Country             0
Decision            0
Team 1              0
Team 2              0
Winner              0
Ground              0
team composition    0
weather_status      0
strike_rate         0
Margin_Wickets      0
Margin_Runs         0
dtype: int64


#Encoding


In [125]:
from sklearn.preprocessing import LabelEncoder

# Assuming df is your DataFrame

# Create a LabelEncoder
le = LabelEncoder()

# Fit the LabelEncoder on all unique teams
teams = pd.concat([df['Team 1'], df['Team 2'], df['Winner'], df['Country']]).unique()
le.fit(teams)

# Transform the 'Team 1', 'Team 2', and 'Winner' columns
df['Team 1'] = le.transform(df['Team 1'])
df['Team 2'] = le.transform(df['Team 2'])
df['Winner'] = le.transform(df['Winner'])
df['Country'] = le.transform(df['Country'])

# Encode the other columns as before
df['Decision'] = le.fit_transform(df['Decision'])
df['Ground'] = le.fit_transform(df['Ground'])


#team composition process

In [126]:
df['team composition'] = df['team composition'].str.strip('[]').str.split(', ').apply(lambda x: [int(i) for i in x])

In [127]:
df['strike_rate'] = df['strike_rate'].str.strip('[]').str.split(', ').apply(lambda x: [float(i) if i != '-' else None for i in x])


In [103]:
df.to_csv('/content/dfs.csv',index=False)

## test split

In [128]:
import numpy as np

# Convert lists to a 2D array-like
# y = np.array(df[['team composition','strike_rate']].to_list())
y = df['strike_rate'].tolist()

In [129]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
X = df.drop(['team composition','strike_rate'], axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Model Testing

In [24]:
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.ensemble import RandomForestRegressor


# # Convert categorical columns to category codes
# for col in ['Team 1', 'Team 2', 'Winner', 'Ground']:
#     df[col] = df[col].astype('category').cat.codes



# # Train the model
# model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
# model.fit(X_train, y_train)

# # Make predictions
# predictions = model.predict(X_test)

In [None]:
# import numpy as np

# # Round predictions to the nearest integer
# rounded_predictions = np.round(predictions).astype(int)

# print(rounded_predictions)

In [None]:
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Calculate MAE, MSE, and R^2
# mae = mean_absolute_error(y_test, predictions)
# mse = mean_squared_error(y_test, predictions)
# r2 = r2_score(y_test, predictions)

# print(f'Mean Absolute Error: {mae}')
# print(f'Mean Squared Error: {mse}')
# print(f'R^2 Score: {r2}')

Mean Absolute Error: 1.505159136002886
Mean Squared Error: 4.186002696649925
R^2 Score: -0.3362177079115329


In [None]:
from joblib import dump

# Save the model to a file
dump(model, 'model_RF.joblib')

['model_RF.joblib']

## Random Forest Regressor with Grid Search


In [130]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Define the parameter grid
param_grid = {
    'n_estimators': [10, 50],  # Number of trees in the forest
    'max_depth': [2,4],  # Maximum depth of the tree
    'min_samples_split': [2, 5]  # Minimum number of samples required to split a node

}

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=2, n_jobs=-1)

# Fit the GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_


In [131]:
# Initialize the RandomForestRegressor with the best parameters
best_rf = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                                max_depth=best_params['max_depth'],
                                min_samples_split=best_params['min_samples_split'],
                                random_state=42)

# Fit the model to the training data
best_rf.fit(X_train, y_train)


In [132]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make predictions on the test set
predictions = best_rf.predict(X_test)
y_test = np.nan_to_num(y_test)
predictions = np.nan_to_num(predictions)
# Calculate the metrics
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Mean Absolute Error: 1.410682234684678
# Mean Squared Error: 3.462551391667395
# R^2 Score: -0.06239412438597833

# Mean Absolute Error: 1.3959417059199792
# Mean Squared Error: 3.367518680120904
# R^2 Score: -0.03717025289564917

# Mean Absolute Error: 1.4079923701372685
# Mean Squared Error: 3.362731761761384
# R^2 Score: -0.02507022296826499

Mean Absolute Error: 13.517417012597138
Mean Squared Error: 419.53721155216175
R^2 Score: -0.006431997190859942


In [134]:
from joblib import dump

# Save the model to a file
dump(best_rf, 'model_RF with GridSearch - strike rates.joblib')

['model_RF with GridSearch - strike rates.joblib']

##Gradient Boost Regressor

In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=42)

# Make it a multi-output regressor
mor = MultiOutputRegressor(gbr)

# Fit the model to the training data
mor.fit(X_train, y_train)

# Make predictions on the test set
predictions = mor.predict(X_test)

# Calculate the metrics
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Absolute Error: 1.5078911212045698
Mean Squared Error: 4.292472283325814
R^2 Score: -0.3515037396559127


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Define the parameter grid
param_grid = {
    'estimator__n_estimators': [10, 20, 50,100], # Number of trees in the forest
    'estimator__max_depth': [None, 10, 20, 40],  # Maximum depth of the tree
    'estimator__min_samples_split': [2, 5, 10,20]  # Minimum number of samples required to split a node
}

# Initialize the GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=42)

# Make it a multi-output regressor
mor = MultiOutputRegressor(gbr)

# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=mor, param_grid=param_grid, cv=3, n_jobs=-1)

# Fit the GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Train the model with the best parameters
best_mor = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=best_params['estimator__n_estimators'],
                                                          max_depth=best_params['estimator__max_depth'],
                                                          min_samples_split=best_params['estimator__min_samples_split'],
                                                          random_state=42))

best_mor.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make predictions on the test set
predictions = best_mor.predict(X_test)

# Calculate the metrics
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')
# Mean Absolute Error: 1.4418162825241767
# Mean Squared Error: 3.8191932734546765
# R^2 Score: -0.1955422362193618

Mean Absolute Error: 1.440126542640117
Mean Squared Error: 3.695990152253219
R^2 Score: -0.16719128993072838


In [None]:
from joblib import dump

# Save the model to a file
dump(best_mor, 'model_GBR with GridSearch.joblib')


['model_GBR with GridSearch.joblib']

##Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Convert categorical columns to category codes
for col in ['Team 1', 'Team 2', 'Winner', 'Ground']:
    df[col] = df[col].astype('category').cat.codes


# Initialize the DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=42)

# Fit the model to the training data
dtr.fit(X_train, y_train)

# Make predictions on the test set
predictions = dtr.predict(X_test)

# Calculate the metrics
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Absolute Error: 1.7398989898989898
Mean Squared Error: 6.9103535353535355
R^2 Score: -1.1167287976510079


In [None]:
from sklearn.model_selection import cross_val_score

# Convert categorical columns to category codes
for col in ['Team 1', 'Team 2', 'Winner', 'Ground']:
    df[col] = df[col].astype('category').cat.codes

# Split the data into features (X) and target (y)
X = df.drop('team composition', axis=1)

# Initialize the DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=42)

# Perform 5-fold cross validation
scores = cross_val_score(dtr, X, y, cv=5)

print("Cross-validated scores:", scores)
print("Average score:", scores.mean())


Cross-validated scores: [-2.4366832  -1.2756409  -1.2744483  -1.01001692 -1.87142375]
Average score: -1.5736426135629993


In [None]:
from joblib import dump

# Save the model to a file
dump(dtr, 'model_DT.joblib')


['model_DT.joblib']