In [None]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
matches = pd.read_excel('matches.xlsx')
deliveries = pd.read_excel('deliveries.xlsx')
matches.head()

In [None]:
matches.shape,deliveries.shape

In [None]:
deliveries

In [None]:
deliveries.columns

In [None]:
# grouping the 1st innings,2nd innings score in a particular matchid
# lets say match id = 1,so inning 1 score = 207,inning 2 score = 172,in that way

totalrun_df = deliveries.groupby(['Match_id','Inning Number']).sum()['Total'].reset_index()
totalrun_df

In [None]:
# capturing only the first innings,as we will be predicting for the second innnigs

totalrun_df = totalrun_df[totalrun_df['Inning Number']==1]
totalrun_df['Total'] = totalrun_df['Total'].apply(lambda x:x+1)
totalrun_df

In [None]:
matches.columns

In [None]:
'''
Merging the total first innings score df with the matches df,
where left side merging is done on "id" column of the matches
and right side merging is done on "match_id" column of the totalrun_df

This is an inner join. The inner join returns only the rows that have matching values in both tables, 
in this case, the 'matches' DataFrame and the 'totalrun_df' DataFrame. 
It returns only the rows where the 'id' column in the "matches" DataFrame has a match in the 'match_id' 
column of the "totalrun_df" DataFrame.

'''

match_df = matches.merge(totalrun_df[['Match_id','Total']],
                       left_on='id',right_on='Match_id')

match_df

In [None]:
match_df['team1'].unique()

In [None]:
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals',
    'Rising Pune Supergiant',
    'Gujarat Lions',
    'Delhi Daredevils',
]

In [None]:
# replacing the Delhi Daredevils with Delhi Capitals

match_df['team1'] = match_df['team1'].str.replace('Delhi Daredevils','Delhi Capitals')
match_df['team2'] = match_df['team2'].str.replace('Delhi Daredevils','Delhi Capitals')


# replacing the Deccan Chargers with Sunrises Hyderabad

match_df['team1'] = match_df['team1'].str.replace('Deccan Chargers','Sunrisers Hyderabad')
match_df['team2'] = match_df['team2'].str.replace('Deccan Chargers','Sunrisers Hyderabad')



In [None]:
# will consider only frequently occuring teams,
# which are mentioned in the teams list

match_df = match_df[match_df['team1'].isin(teams)]
match_df = match_df[match_df['team2'].isin(teams)]

match_df['team1'].unique()

In [None]:
match_df.shape

In [None]:
match_df.head()

In [None]:
deliveries.head(3)

In [None]:
# merging matchdf with delevieries on match_id

delivery_df = match_df.merge(deliveries,on='Match_id')

delivery_df.head(5)

In [None]:
delivery_df.columns

In [None]:
# considering the 2nd innings because we have to keep a check on the current score of second innings

delivery_df = delivery_df[delivery_df['Inning Number'] == 2]
delivery_df.head()

In [None]:
delivery_df.shape

In [None]:

# current score of particular match

delivery_df['current_score'] = delivery_df.groupby('Match_id')['Total_y'].cumsum()

delivery_df.head()

In [None]:
# runs left 

delivery_df['runs_left'] = delivery_df['Total_x']-delivery_df['current_score']

delivery_df.head()

In [None]:
'''
if one ball is played,then balls left = 120-1 = 119
if two balls are played,then balls left = 120-2 = 118

so similarly if over=1,over has 6 balls right,so 1*6 = 6
now,ball = 1,so 6+1 = 7,now 126-7 = 119,which is same as (1)

so we'll use balls_left = 126-(over*6+current_ball)

'''

# balls left


delivery_df['balls_left'] = 120-(delivery_df['Over']*6+delivery_df['Ball'])

delivery_df

In [None]:
list(delivery_df['player_dismissed'].unique())[:2]

In [None]:

# filling nan values with "0"

delivery_df['player_dismissed'] = delivery_df['player_dismissed'].fillna("0")

# now we will convert this player_dismissed col into a boolean col
# if the player is not dismissed then it's 0 else its 1

delivery_df['player_dismissed'] = delivery_df['player_dismissed'].apply(lambda x:x if x=="0" else "1")

# converting string to int

delivery_df['player_dismissed'] = delivery_df['player_dismissed'].astype('int')


delivery_df['player_dismissed'].unique()


In [None]:
# wickets left

wickets = delivery_df.groupby('Match_id')['player_dismissed'].cumsum().values

delivery_df['wickets_left'] = 10-wickets

delivery_df

In [None]:
# current run rate
# It is a common practice to express run rates in cricket as runs per over, so the score is multiplied by 6.


delivery_df['cur_run_rate'] = (delivery_df['current_score']*6)/(120-delivery_df['balls_left']) 

# required run rate

delivery_df['req_run_rate'] = (delivery_df['runs_left']*6)/(delivery_df['balls_left'])


delivery_df.head(3)

In [None]:
def resultfun(row):
    return 1 if row['Batting_team'] == row['winner'] else 0

In [None]:
delivery_df['result'] = delivery_df.apply(resultfun,axis=1)
delivery_df.head()

In [None]:
# sn.countplot(delivery_df['result'])

In [None]:
final_df = delivery_df[['Batting_team','Bowling_team','city','runs_left',
                        'balls_left','wickets_left','Total_x','cur_run_rate',
                        'req_run_rate','Predicted_total_run_batter','result']]

final_df.head()

In [None]:
final_df.shape

In [None]:
final_df.isnull().sum()

In [None]:
# dropping of null values


final_df = final_df.dropna()

final_df.isnull().sum()

In [None]:
final_df[['runs_left', 'balls_left', 'wickets_left', 'Total_x',
    'cur_run_rate', 'req_run_rate']]

In [None]:
final_df = final_df[final_df['balls_left'] != 0]

In [None]:
final_df.to_excel("final_data.xlsx", index=False)

In [4]:
final_df = pd.read_excel("final_data.xlsx")
final_df.head()

Unnamed: 0,Batting_team,Bowling_team,city,runs_left,balls_left,wickets_left,Total_x,cur_run_rate,req_run_rate,Predicted_total_run_batter,result
0,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,119,10,208,6.0,10.436975,54.775223,0
1,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,118,10,208,3.0,10.525424,30.693213,0
2,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,117,10,208,2.0,10.615385,30.72966,0
3,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,205,116,10,208,4.5,10.603448,30.766109,0
4,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,201,115,10,208,8.4,10.486957,30.802555,0


In [5]:
data = final_df.copy()

test = data['result']
# 
train = data.drop(['result'],axis = 1)

# Replace inf and negative values in 'cur_run_rate' with zeros
train['cur_run_rate'] = train['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# train['Batting_team'].unique()
# train.head()

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Define X (features) and y (target)
X = train
y = test

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)

# Create and train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=45)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# You can also print a classification report for more detailed performance metrics
print(classification_report(y_test, y_pred))


Accuracy: 0.9483358847876623
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     42113
           1       0.95      0.95      0.95     46072

    accuracy                           0.95     88185
   macro avg       0.95      0.95      0.95     88185
weighted avg       0.95      0.95      0.95     88185


In [7]:
from sklearn.model_selection import train_test_split

# Define X (features) and y (target)
X = train  # Assuming train contains your feature data
y = test   # Assuming test contains your target data

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Perform Train-Validation-Test Split
# First, split the data into training and the rest (combined validation and test)
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.7, random_state=42)

# Then, split the rest into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.7, random_state=42)
depth = [5,10,15,20,50]
res = []
for dpt in depth:
    
    # Create and train the RandomForestClassifier on the training set
    clf = RandomForestClassifier(n_estimators=10, random_state=45, max_depth=dpt)
    clf.fit(X_train, y_train)
    
    # Make predictions on the validation set
    y_val_pred = clf.predict(X_val)
    
    # Evaluate the classifier's performance on the validation set
    validation_accuracy = accuracy_score(y_val, y_val_pred)
    print("Validation Accuracy:", validation_accuracy)
    
    # Make predictions on the test set
    y_test_pred = clf.predict(X_test)
    
    # Evaluate the classifier's performance on the test set
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test Accuracy:", test_accuracy)

    # You can also print classification reports for more detailed performance metrics
    print("Validation Classification Report:")
    print(classification_report(y_val, y_val_pred))
    
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    rn = [dpt, validation_accuracy, test_accuracy]
    res.append(rn)
res_df = pd.DataFrame.from_records(res)  
print(res_df)


Validation Accuracy: 0.7906298600311042
Test Accuracy: 0.7869535334180326
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.71      0.76      9832
           1       0.76      0.87      0.81     10744

    accuracy                           0.79     20576
   macro avg       0.80      0.79      0.79     20576
weighted avg       0.80      0.79      0.79     20576

Test Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.70      0.76     22861
           1       0.76      0.87      0.81     25152

    accuracy                           0.79     48013
   macro avg       0.79      0.78      0.78     48013
weighted avg       0.79      0.79      0.78     48013
Validation Accuracy: 0.8403479782270606
Test Accuracy: 0.8418761585403953
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83     

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Define X (features) and y (target)
X = train  # Assuming train contains your feature data
y = test   # Assuming test contains your target data

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
ts_size = [0.2,0.3,0.4,0.5]
qwe= []
for asd in ts_size:    
    # Perform Train-Validation-Test Split
    # First, split the data into training and the rest (combined validation and test)
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=asd, random_state=42)
    
    # Then, split the rest into validation and test sets
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=asd, random_state=42)
    
    # Create and train the Gaussian Naive Bayes classifier on the training set
    clf_nb = GaussianNB()
    clf_nb.fit(X_train, y_train)
    
    # Make predictions on the validation set for Naive Bayes
    y_val_pred_nb = clf_nb.predict(X_val)
    
    # Evaluate the classifier's performance on the validation set for Naive Bayes
    validation_accuracy_nb = accuracy_score(y_val, y_val_pred_nb)
    
    # Make predictions on the test set for Naive Bayes
    y_test_pred_nb = clf_nb.predict(X_test)
    
    # Evaluate the classifier's performance on the test set for Naive Bayes
    test_accuracy_nb = accuracy_score(y_test, y_test_pred_nb)
    
    # Print validation and test accuracy
    print("Validation Accuracy (Naive Bayes):", validation_accuracy_nb)
    print("Test Accuracy (Naive Bayes):", test_accuracy_nb)
    
    # You can also print classification reports for more detailed performance metrics
    print("Validation Classification Report (Naive Bayes):")
    print(classification_report(y_val, y_val_pred_nb))
    
    print("Test Classification Report (Naive Bayes):")
    print(classification_report(y_test, y_test_pred_nb))
    
    rn = [asd, validation_accuracy_nb, test_accuracy_nb]
    qwe.append(rn)
res_df = pd.DataFrame.from_records(qwe)  
print(res_df)


In [None]:
# final_df = pd.read_excel("final_data.xlsx")
# data = final_df.copy()
# print(data)
# train_target = data['result']
# train_features = data.drop(['result'],axis = 1)
# # Replace inf and negative values in 'cur_run_rate' with zeros
# train_features['cur_run_rate'] = train_features['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)
# 
# testing_df = pd.read_excel("testing_data.xlsx")
# validation_data = testing_df.copy()
# 
# validate_target = validation_data['result']
# validate_features = validation_data.drop(['result'],axis = 1)
# # Replace inf and negative values in 'cur_run_rate' with zeros
# validate_features['cur_run_rate'] = validate_features['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the training data
final_df = pd.read_excel("final_data.xlsx")
data = final_df.copy()

# Extract features and target for training data
train_target = data['result']
train_features = data.drop(['result'], axis=1)

# Replace inf and negative values in 'cur_run_rate' with zeros
train_features['cur_run_rate'] = train_features['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# Load the validation data
testing_df = pd.read_excel("testing_data.xlsx")
validation_data = testing_df.copy()

# Extract features and target for validation data
validate_target = validation_data['result']
validate_features = validation_data.drop(['result'], axis=1)

# Replace inf and negative values in 'cur_run_rate' with zeros for validation data
validate_features['cur_run_rate'] = validate_features['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# Combine the training and validation data for consistent label encoding
combined_data = pd.concat([train_features, validate_features], axis=0)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    combined_data[col] = label_encoders[col].fit_transform(combined_data[col])

# Split the combined data back into training and validation
X1 = combined_data[:len(train_features)]
X2 = combined_data[len(train_features):]

# Create and train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=45)
clf.fit(X1, train_target)

# Make predictions on the validation set
validate_pred = clf.predict(X2)

# Evaluate the classifier's performance on the validation set
accuracy = accuracy_score(validate_target, validate_pred)
print("Validation Accuracy:", accuracy)

# You can also print a classification report for more detailed performance metrics
print(classification_report(validate_target, validate_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the training data
final_df = pd.read_excel("final_data.xlsx")
data = final_df.copy()

# Extract features and target for training data
train_target = data['result']
train_features = data.drop(['result'], axis=1)

# Replace inf and negative values in 'cur_run_rate' with zeros
train_features['cur_run_rate'] = train_features['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# Load the validation data
testing_df = pd.read_excel("testing_data.xlsx")
validation_data = testing_df.copy()

# Extract features and target for validation data
validate_target = validation_data['result']
validate_features = validation_data.drop(['result'], axis=1)

# Replace inf and negative values in 'cur_run_rate' with zeros for validation data
validate_features['cur_run_rate'] = validate_features['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# Combine the training and validation data for consistent label encoding
combined_data = pd.concat([train_features, validate_features], axis=0)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    combined_data[col] = label_encoders[col].fit_transform(combined_data[col])

# Split the combined data back into training and validation
X1 = combined_data[:len(train_features)]
X2 = combined_data[len(train_features):]

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create and train the RandomForestClassifier with GridSearchCV
clf = RandomForestClassifier(random_state=45)
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X1, train_target)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Make predictions on the validation set using the best model
best_clf = grid_search.best_estimator_
validate_pred = best_clf.predict(X2)

# Evaluate the classifier's performance on the validation set
accuracy = accuracy_score(validate_target, validate_pred)
print("Validation Accuracy:", accuracy)

# You can also print a classification report for more detailed performance metrics
print(classification_report(validate_target, validate_pred))


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Create a confusion matrix to visualize true positives, true negatives, false positives, and false negatives
conf_matrix = confusion_matrix(validate_target, validate_pred)

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Lost', 'Won'], yticklabels=['Lost', 'Won'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Plot feature importances using the best classifier
if isinstance(best_clf, RandomForestClassifier):
    feature_importances = best_clf.feature_importances_
    feature_names = train_features.columns

    # Sort features by importance in descending order
    sorted_idx = feature_importances.argsort()[::-1]

    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances')
    plt.bar(range(train_features.shape[1]), feature_importances[sorted_idx], align='center')
    plt.xticks(range(train_features.shape[1]), feature_names[sorted_idx], rotation=90)
    plt.tight_layout()
    plt.show()



In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import GridSearchCV
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# 
# # Load the data
# final_df = pd.read_excel("final_data.xlsx")
# 
# # Extract features and target
# X = final_df.drop(['result'], axis=1)
# y = final_df['result']
# 
# # Replace inf and negative values in 'cur_run_rate' with zeros
# X['cur_run_rate'] = X['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)
# 
# # Encode categorical variables
# label_encoders = {}
# categorical_columns = ['Batting_team', 'Bowling_team', 'city']
# 
# for col in categorical_columns:
#     label_encoders[col] = LabelEncoder()
#     X[col] = label_encoders[col].fit_transform(X[col])
# 
# # Split the data into training and validation
# X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.2, random_state=45)
# 
# # Define the parameter grid with the specified hyperparameters
# param_grid = {
#     'n_estimators': [200],  # Set to 200
#     'max_depth': [10],  # Set to 10
#     'min_samples_split': [2],  # Set to 2
#     'min_samples_leaf': [2]  # Set to 2
# }
# 
# # Create and train the RandomForestClassifier with GridSearchCV
# clf = RandomForestClassifier(random_state=45)
# grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# 
# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)
# 
# # Make predictions on the validation set using the best model
# best_clf = grid_search.best_estimator_
# validate_pred = best_clf.predict(X_validate)
# 
# # Evaluate the classifier's performance on the validation set
# accuracy = accuracy_score(y_validate, validate_pred)
# print("Validation Accuracy:", accuracy)
# 
# # You can also print a classification report for more detailed performance metrics
# print(classification_report(y_validate, validate_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the data
final_df = pd.read_excel("final_data.xlsx")

# Extract features and target
X = final_df.drop(['result'], axis=1)
y = final_df['result']

# Replace inf and negative values in 'cur_run_rate' with zeros
X['cur_run_rate'] = X['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Split the data into training (70%), validation (15%), and testing (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=45)
X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=45)

# Create and train the RandomForestClassifier with your specified hyperparameters
clf = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=2, min_samples_leaf=2, random_state=45)
clf.fit(X_train, y_train)

# Make predictions on the validation set
validate_pred = clf.predict(X_validate)

# Evaluate the classifier's performance on the validation set
validate_accuracy = accuracy_score(y_validate, validate_pred)
print("Validation Accuracy:", validate_accuracy)

# Print a classification report for the validation set
print("Validation Classification Report:")
print(classification_report(y_validate, validate_pred))

# Make predictions on the testing set
test_pred = clf.predict(X_test)

# Evaluate the classifier's performance on the testing set
test_accuracy = accuracy_score(y_test, test_pred)
print("Testing Accuracy:", test_accuracy)

# Print a classification report for the testing set
print("Testing Classification Report:")
print(classification_report(y_test, test_pred))


In [None]:
from sklearn.linear_model import LogisticRegression

# Create and train the Logistic Regression model
logistic_reg = LogisticRegression(random_state=45)
logistic_reg.fit(X_train, y_train)

# Make predictions on the validation set
validate_pred_lr = logistic_reg.predict(X_validate)

# Evaluate the classifier's performance on the validation set
validate_accuracy_lr = accuracy_score(y_validate, validate_pred_lr)
print("Logistic Regression Validation Accuracy:", validate_accuracy_lr)

# Print a classification report for the validation set
print("Logistic Regression Validation Classification Report:")
print(classification_report(y_validate, validate_pred_lr))

# Make predictions on the testing set
test_pred_lr = logistic_reg.predict(X_test)

# Evaluate the classifier's performance on the testing set
test_accuracy_lr = accuracy_score(y_test, test_pred_lr)
print("Logistic Regression Testing Accuracy:", test_accuracy_lr)

# Print a classification report for the testing set
print("Logistic Regression Testing Classification Report:")
print(classification_report(y_test, test_pred_lr))

In [None]:
from sklearn.svm import SVC

# Create and train the Support Vector Machine (SVM) model
svm_classifier = SVC(kernel='linear', random_state=45)
svm_classifier.fit(X_train, y_train)

# Make predictions on the validation set
validate_pred_svm = svm_classifier.predict(X_validate)

# Evaluate the classifier's performance on the validation set
validate_accuracy_svm = accuracy_score(y_validate, validate_pred_svm)
print("SVM Validation Accuracy:", validate_accuracy_svm)

# Print a classification report for the validation set
print("SVM Validation Classification Report:")
print(classification_report(y_validate, validate_pred_svm))

# Make predictions on the testing set
test_pred_svm = svm_classifier.predict(X_test)

# Evaluate the classifier's performance on the testing set
test_accuracy_svm = accuracy_score(y_test, test_pred_svm)
print("SVM Testing Accuracy:", test_accuracy_svm)

# Print a classification report for the testing set
print("SVM Testing Classification Report:")
print(classification_report(y_test, test_pred_svm))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Load the data
final_df = pd.read_excel("final_data.xlsx")

# Extract features and target
X = final_df.drop(['result'], axis=1)
y = final_df['result']

# Replace inf and negative values in 'cur_run_rate' with zeros
X['cur_run_rate'] = X['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Split the data into training (70%), validation (15%), and testing (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=45)
X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=45)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create and train the RandomForestClassifier with GridSearchCV
clf = RandomForestClassifier(random_state=45)
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Make predictions on the validation set using the best model
best_clf = grid_search.best_estimator_
validate_pred = best_clf.predict(X_validate)

# Calculate and print the validation accuracy
accuracy_validate = accuracy_score(y_validate, validate_pred)
print("Validation Accuracy:", accuracy_validate)

# You can also print a classification report for more detailed performance metrics
print("Validation Classification Report:")
print(classification_report(y_validate, validate_pred))

# Make predictions on the test set using the best model
test_pred = best_clf.predict(X_test)

# Calculate and print the test accuracy
accuracy_test = accuracy_score(y_test, test_pred)
print("Test Accuracy:", accuracy_test)

# You can also print a classification report for more detailed performance metrics
print("Test Classification Report:")
print(classification_report(y_test, test_pred))


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
# Calculate AUC-ROC and plot the curve
test_pred_probs = best_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_pred_probs)
roc_auc = roc_auc_score(y_test, test_pred_probs)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Load the data
final_df = pd.read_excel("final_data.xlsx")

# Extract features and target
X = final_df.drop(['result'], axis=1)
y = final_df['result']

# Replace inf and negative values in 'cur_run_rate' with zeros
X['cur_run_rate'] = X['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Split the data into training (70%), validation (15%), and testing (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=45)
X_validate, X_test, y_validate, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=45)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# param_grid = {
#     'n_estimators': [200],
#     'max_depth': [10],
#     'min_samples_split': [2],
#     'min_samples_leaf': [2]
# }

# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'Validation Accuracy', 'Test Accuracy'])

# Perform grid search over all parameter combinations
for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            for min_samples_leaf in param_grid['min_samples_leaf']:
                # Create and train the RandomForestClassifier
                clf12 = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                            random_state=45)
                clf12.fit(X_train, y_train)
                
                # Make predictions on the validation set
                validate_pred = clf12.predict(X_validate)
                accuracy_validate = accuracy_score(y_validate, validate_pred)
                
                # Make predictions on the test set
                test_pred = clf12.predict(X_test)
                accuracy_test = accuracy_score(y_test, test_pred)
                
                # Create a DataFrame with the current result
                current_result = pd.DataFrame({
                    'n_estimators': [n_estimators],
                    'max_depth': [max_depth],
                    'min_samples_split': [min_samples_split],
                    'min_samples_leaf': [min_samples_leaf],
                    'Validation Accuracy': [accuracy_validate],
                    'Test Accuracy': [accuracy_test]
                })
                
                # Concatenate the current result with the results DataFrame
                results_df = pd.concat([results_df, current_result], ignore_index=True)
                
                # Print the results DataFrame
                print(results_df)



  results_df = pd.concat([results_df, current_result], ignore_index=True)


  n_estimators max_depth min_samples_split min_samples_leaf  \
0          100        10                 2                1   

   Validation Accuracy  Test Accuracy  
0              0.90379       0.901211  
  n_estimators max_depth min_samples_split min_samples_leaf  \
0          100        10                 2                1   
1          100        10                 2                2   

   Validation Accuracy  Test Accuracy  
0             0.903790       0.901211  
1             0.899231       0.896653  
  n_estimators max_depth min_samples_split min_samples_leaf  \
0          100        10                 2                1   
1          100        10                 2                2   
2          100        10                 2                4   

   Validation Accuracy  Test Accuracy  
0             0.903790       0.901211  
1             0.899231       0.896653  
2             0.897734       0.898285  
  n_estimators max_depth min_samples_split min_samples_leaf  \
0      