In [1]:
## ICC Cricket World Cup 2023 Preictions

In [1]:
## Import all libraries and dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# load the data files

In [3]:
## Data for all the One Day International(ODI) teams in all the Cricket World Cups

ODI_WC_Stats = pd.read_csv(r'C:\\Users\\rohit\\OneDrive\\Desktop\\Data Analyst\\PortfolioProjects\\World Cup\\CWCStats.csv', delimiter=';')
ODI_WC_Stats.head()

Unnamed: 0,Teams,Previous_Apperances,Previous_Titles,Previous_Finals,Previous_SemiFinals,Rankings
0,India,12,2,3,7,1
1,Pakistan,12,1,2,6,2
2,Australia,12,5,6,8,3
3,South Africa,7,0,0,4,4
4,England,12,1,4,6,5


In [4]:
## Data for all the ODI's played since the 2015 ODI Cricket World Cup(CWC)

ODI_Results = pd.read_csv(r'C:\\Users\\rohit\\OneDrive\\Desktop\\Data Analyst\\PortfolioProjects\\World Cup\\Results.csv', delimiter=';')
ODI_Results.head()

Unnamed: 0,Date,Team_1,Team_2,Winner,Margin,Ground
0,17/04/2015,Bangladesh,Pakistan,Bangladesh,won by 79 runs,Shere Bangla National Stadium
1,19/04/2015,Bangladesh,Pakistan,Bangladesh,won by 7 wickets,Shere Bangla National Stadium
2,22/04/2015,Bangladesh,Pakistan,Bangladesh,won by 8 wickets,Shere Bangla National Stadium
3,08/05/2015,Ireland,England,No result,No result,The Village
4,26/05/2015,Pakistan,Zimbabwe,Pakistan,won by 41 runs,Gaddafi Stadium


In [5]:
## Considering team India as favourites, we can filter their data.

df = ODI_Results[(ODI_Results['Team_1'] == 'India') | (ODI_Results['Team_2'] == 'India')]
India = df.iloc[:]
India.head()

Unnamed: 0,Date,Team_1,Team_2,Winner,Margin,Ground
11,18/06/2015,Bangladesh,India,Bangladesh,won by 79 runs,Shere Bangla National Stadium
13,21/06/2015,Bangladesh,India,Bangladesh,won by 6 wickets,Shere Bangla National Stadium
14,24/06/2015,Bangladesh,India,India,won by 77 runs,Shere Bangla National Stadium
16,10/07/2015,Zimbabwe,India,India,won by 4 runs,Harare Sports Club
19,12/07/2015,Zimbabwe,India,India,won by 62 runs,Harare Sports Club


In [6]:
## Creating a column for matches played since 2015

India['match_year'] = India['Date'].str[-4:].astype(int)
India_2015 = India[India['match_year'] >= 2015]
count = India_2015.shape[0]

In [7]:
## Combining the data for all the teams participating in the ICC CWC 2023

worldcup_teams = ['England', 'South Africa', 'Netherlands', 'Pakistan', 'New Zealand', 'Sri Lanka', 'Afghanistan', 'Australia', 'Bangladesh', 'India']
df_teams = ODI_Results[ODI_Results['Team_1'].isin(worldcup_teams) | ODI_Results['Team_2'].isin(worldcup_teams)]
df_teams = df_teams.drop_duplicates()
count = df_teams.shape[0]
df_teams.head()

Unnamed: 0,Date,Team_1,Team_2,Winner,Margin,Ground
0,17/04/2015,Bangladesh,Pakistan,Bangladesh,won by 79 runs,Shere Bangla National Stadium
1,19/04/2015,Bangladesh,Pakistan,Bangladesh,won by 7 wickets,Shere Bangla National Stadium
2,22/04/2015,Bangladesh,Pakistan,Bangladesh,won by 8 wickets,Shere Bangla National Stadium
3,08/05/2015,Ireland,England,No result,No result,The Village
7,09/06/2015,England,New Zealand,England,won by 210 runs,Edgbaston


In [8]:
## Droping the tables that we would not need

df_teams_2015 = df_teams.drop(['Date','Margin', 'Ground'], axis=1)
df_teams_2015.head()

Unnamed: 0,Team_1,Team_2,Winner
0,Bangladesh,Pakistan,Bangladesh
1,Bangladesh,Pakistan,Bangladesh
2,Bangladesh,Pakistan,Bangladesh
3,Ireland,England,No result
7,England,New Zealand,England


In [9]:
## Building a model

In [10]:
## Reseting the index and creating a column.
# If team 1 is winner, it will display 1 and if team 2 is winner, it will display 2.

df_teams_2015 = df_teams_2015.reset_index(drop=True)
df_teams_2015.loc[df_teams_2015.Winner == df_teams_2015.Team_1,'winning_team']=1
df_teams_2015.loc[df_teams_2015.Winner == df_teams_2015.Team_2, 'winning_team']=2
df_teams_2015 = df_teams_2015.drop(['winning_team'], axis=1)

df_teams_2015.head()

Unnamed: 0,Team_1,Team_2,Winner
0,Bangladesh,Pakistan,Bangladesh
1,Bangladesh,Pakistan,Bangladesh
2,Bangladesh,Pakistan,Bangladesh
3,Ireland,England,No result
4,England,New Zealand,England


In [11]:
## Converting the variables using one-hot encoding

final = pd.get_dummies(df_teams_2015, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

In [12]:
## Separating the sets and dividing to test and train.

X = final.drop(['Winner'], axis=1)
y = final["Winner"]
final = final[final["Winner"] != "No result"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

final.head()

Unnamed: 0,Winner,Team_1_Afghanistan,Team_1_Afghanistan.1,Team_1_Australia,Team_1_Australia.1,Team_1_Bangladesh,Team_1_Bangladesh.1,Team_1_England,Team_1_England.1,Team_1_Hong Kong,...,Team_2_Netherlands,Team_2_New Zealand,Team_2_Oman,Team_2_Pakistan,Team_2_Scotland,Team_2_South Africa,Team_2_Sri Lanka,Team_2_United Arab Emirates,Team_2_West Indies,Team_2_Zimbabwe
0,Bangladesh,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,Bangladesh,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Bangladesh,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,England,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
5,New Zealand,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [13]:
## Creating a Logistic Regression model & training it on the training data to evaluate its accuracy.

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
score = logreg.score(X_train, y_train)
score2 = logreg.score(X_test, y_test)
print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.645
Test set accuracy:  0.518


In [14]:
## Adding the ICC Rankings to get favourites and ICC CWC 2023 fixtures.

ODI_Rankings = pd.read_csv(r'C:\\Users\\rohit\\OneDrive\\Desktop\\Data Analyst\\PortfolioProjects\\World Cup\\Ranking.csv',delimiter=',')

CWC_Fixtures = pd.read_csv(r'C:\\Users\\rohit\\OneDrive\\Desktop\\Data Analyst\\PortfolioProjects\\World Cup\\fixtures.csv',delimiter=';')

In [15]:
## Creating a list for storing the group stage games

pred_set = []

In [16]:
## Create new columns with ranking position of each team

CWC_Fixtures['first_position'] = CWC_Fixtures['Team_1'].map(ODI_Rankings.set_index('Team')['Pos'])
CWC_Fixtures['second_position'] = CWC_Fixtures['Team_2'].map(ODI_Rankings.set_index('Team')['Pos'])

# to create a new, use the following code 
#CWC_Fixtures.insert(1, 'first_position', CWC_Fixtures['Team_1'].map(ODI_Rankings.set_index('Team')['Pos']))
#CWC_Fixtures.insert(2, 'second_position', CWC_Fixtures['Team_2'].map(ODI_Rankings.set_index('Team')['Pos']))

In [17]:
# We only need the group stage games, so slice the dataset and display the last 5 rows

CWC_Fixtures = CWC_Fixtures.iloc[:45, :]
CWC_Fixtures

Unnamed: 0,Round,Date,City,Team_1,Team_2,Group,Winner,first_position,second_position
0,1,05/10/2023,Ahmedabad,New Zealand,England,Group A,New Zealand,6.0,5.0
1,1,06/10/2023,Hyderabad,Netherlands,Pakistan,Group A,Pakistan,10.0,2.0
2,1,07/10/2023,Dharamshala,Afghanistan,Bangladesh,Group A,Bangladesh,9.0,7.0
3,1,07/10/2023,Delhi,Sri Lanka,South Africa,Group A,South Africa,8.0,4.0
4,1,08/10/2023,Chennai,Australia,India,Group A,India,3.0,1.0
5,1,09/10/2023,Hyderabad,Netherlands,New Zealand,Group A,New Zealand,10.0,6.0
6,1,10/10/2023,Dharamshala,Bangladesh,England,Group A,England,7.0,5.0
7,1,10/10/2023,Hyderabad,Sri Lanka,Pakistan,Group A,Pakistan,8.0,2.0
8,1,11/10/2023,Delhi,Afghanistan,India,Group A,India,9.0,1.0
9,1,12/10/2023,Lucknow,South Africa,Australia,Group A,South Africa,4.0,3.0


In [18]:
# Loop to add teams to new prediction dataset as per thier rankings.
pred_set = []

for _, row in CWC_Fixtures.iterrows():
    team_1_name = row['Team_1']
    team_2_name = row['Team_2']
    
    # Get the rankings of Team_1 and Team_2 from the ODI_Rankings DataFrame
    team_1_rank = ODI_Rankings.loc[ODI_Rankings['Team'] == team_1_name, 'Pos'].values[0]
    team_2_rank = ODI_Rankings.loc[ODI_Rankings['Team'] == team_2_name, 'Pos'].values[0]
    
    if team_1_rank < team_2_rank:
        stronger_team, weaker_team = team_1_name, team_2_name
    else:
        stronger_team, weaker_team = team_2_name, team_1_name
    
    # Add the matchup to the prediction dataset with 'winning_team' initially set to None
    pred_set.append({'Team_1': stronger_team, 'Team_2': weaker_team, 'winning_team': None})

pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set
pred_set.head()

Unnamed: 0,Team_1,Team_2,winning_team
0,England,New Zealand,
1,Pakistan,Netherlands,
2,Bangladesh,Afghanistan,
3,South Africa,Sri Lanka,
4,India,Australia,


In [19]:
## Getting the dummy variables

pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

In [20]:
## Adding missing columns to model's training dataset

missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0

# Remove and reordering the database.
pred_set = pred_set[final.columns]
pred_set = pred_set.drop(['Winner'], axis=1)
pred_set.head()

Unnamed: 0,Team_1_Afghanistan,Team_1_Afghanistan.1,Team_1_Australia,Team_1_Australia.1,Team_1_Bangladesh,Team_1_Bangladesh.1,Team_1_England,Team_1_England.1,Team_1_Hong Kong,Team_1_Hong Kong.1,...,Team_2_Netherlands,Team_2_New Zealand,Team_2_Oman,Team_2_Pakistan,Team_2_Scotland,Team_2_South Africa,Team_2_Sri Lanka,Team_2_United Arab Emirates,Team_2_West Indies,Team_2_Zimbabwe
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Get the results of league matches 

consistent_features = set(final.columns).intersection(set(pred_set.columns))

# Make a prediction dataset 'pred_set' with consistent features
pred_set = pred_set[final.columns.drop('Winner')]

# Predict using the logistic regression model
predictions = logreg.predict(pred_set)

# Loop through the fixtures and display predictions
for i in range(CWC_Fixtures.shape[0]):
    team_1 = backup_pred_set.iloc[i, 0]
    team_2 = backup_pred_set.iloc[i, 1]

    if predictions[i] == 1:
        print(f"{team_1} and {team_2}")
        print(f"Winner: {team_1}")
    else:
        print(f"{team_1} and {team_2}")
        print(f"Winner: {team_2}")
    print("")

England and New Zealand
Winner: New Zealand

Pakistan and Netherlands
Winner: Netherlands

Bangladesh and Afghanistan
Winner: Afghanistan

South Africa and Sri Lanka
Winner: Sri Lanka

India and Australia
Winner: Australia

New Zealand and Netherlands
Winner: Netherlands

England and Bangladesh
Winner: Bangladesh

Pakistan and Sri Lanka
Winner: Sri Lanka

India and Afghanistan
Winner: Afghanistan

Australia and South Africa
Winner: South Africa

New Zealand and Bangladesh
Winner: Bangladesh

India and Pakistan
Winner: Pakistan

England and Afghanistan
Winner: Afghanistan

Australia and Sri Lanka
Winner: Sri Lanka

South Africa and Netherlands
Winner: Netherlands

New Zealand and Afghanistan
Winner: Afghanistan

India and Bangladesh
Winner: Bangladesh

Pakistan and Australia
Winner: Australia

Sri Lanka and Netherlands
Winner: Netherlands

South Africa and England
Winner: England

India and New Zealand
Winner: New Zealand

Pakistan and Afghanistan
Winner: Afghanistan

South Africa and B