In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('D:/Raw Databases/t20i_Matches_Data.csv')

In [3]:
print(df.head())
print(df.info())
print(df.columns)

   T20I Match No  Match ID                              Match Name  Series ID  \
0             52    291356            Australia Vs India Only T20I     291355   
1             54    300436         New Zealand Vs England 2Nd T20I     300418   
2             65    361531  Netherlands Vs Scotland 2Nd Semi Final     353665   
3             66    354459     Kenya Vs Scotland 3Rd Place Playoff     353665   
4             69    361653         Sri Lanka Vs Zimbabwe 1St Match     361644   

                                         Series Name  Match Date Match Format  \
0          India tour of Australia  - 2007 (2007/08)  2008-02-01          T20   
1      England tour of New Zealand  - 2008 (2007/08)  2008-02-07          T20   
2  ICC World Twenty20 Qualifier Bermuda, Canada, ...  2008-08-04          T20   
3  ICC World Twenty20 Qualifier Bermuda, Canada, ...  2008-08-04          T20   
4              T20 Canada in Canada - 2008 (2008/09)  2008-10-10          T20   

   Team1 ID Team1 Name  Te

In [4]:
df['Match Date'] = pd.to_datetime(df['Match Date'], errors='coerce')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2592 entries, 0 to 2591
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   T20I Match No          2592 non-null   int64         
 1   Match ID               2592 non-null   int64         
 2   Match Name             2592 non-null   object        
 3   Series ID              2592 non-null   int64         
 4   Series Name            2592 non-null   object        
 5   Match Date             2581 non-null   datetime64[ns]
 6   Match Format           2592 non-null   object        
 7   Team1 ID               2592 non-null   int64         
 8   Team1 Name             2592 non-null   object        
 9   Team1 Captain          2592 non-null   int64         
 10  Team1 Runs Scored      2579 non-null   float64       
 11  Team1 Wickets Fell     2579 non-null   float64       
 12  Team1 Extras Rec       2579 non-null   float64       
 13  Tea

In [6]:
df = df.dropna(subset=['Match Date'])

In [7]:
print('Earliest Data' , df['Match Date'].min())

Earliest Data 2006-06-15 00:00:00


In [8]:
print('Latest Data' , df['Match Date'].max())

Latest Data 2024-05-07 00:00:00


In [9]:
latest_year = df['Match Date'].max().year
start_year = latest_year - 10

In [10]:
df_filtered = df[df['Match Date'].dt.year >= start_year]

In [11]:
print(f'Data filtered from {start_year} to {latest_year}')
print('Number of matches after filtering:', len(df_filtered))

Data filtered from 2014 to 2024
Number of matches after filtering: 2236


In [12]:
print('Matches with no result =',df_filtered['Match Winner'].isnull().sum())

Matches with no result = 82


In [13]:
df_filtered = df_filtered.dropna(subset=['Match Winner'])

In [14]:
print('Matches with no result =',df_filtered['Match Winner'].isnull().sum())

Matches with no result = 0


In [15]:
df_filtered.to_csv('filtered_t20_data.csv', index=False)

In [16]:
print('Team names =', df_filtered['Team1 Name'].unique()[:10])

Team names = ['India' 'New Zealand' 'West Indies' 'Australia' 'England' 'Sri Lanka'
 'Bangladesh' 'South Africa' 'Afghanistan' 'Nepal']


In [17]:
print('Venue names =',df_filtered['Match Venue (Country)'].unique()[:10])

Venue names = ['India' 'New Zealand' 'Australia' 'Bangladesh' 'West Indies'
 'South Africa' 'England' 'United Arab Emirates' 'Sri Lanka' 'Pakistan']


In [18]:
def get_home_advantage(row):
    venue_country = row['Match Venue (Country)']
    team1 = row['Team1 Name']
    team2 = row['Team2 Name']

    venue_country = str(venue_country).strip().lower()
    team1 = str(team1).strip().lower()
    team2 = str(team2).strip().lower()


    if venue_country == team1:
        return 'Team1'
    elif venue_country == team2:
        return 'Team2'
    else:
        return 'Nuetral'


In [19]:
df_filtered['Home Advantage'] = df_filtered.apply(get_home_advantage, axis=1)

In [20]:
print(df_filtered['Home Advantage'].value_counts())

Home Advantage
Nuetral    1029
Team1       572
Team2       553
Name: count, dtype: int64


In [21]:
t1_df = df_filtered[['Match Date', 'Team1 Name', 'Match Winner','Team1 Runs Scored']].copy()
t1_df.columns = ['date', 'team','winner','runs']
t1_df['won_match'] = (t1_df['team'] == t1_df['winner']).astype(int)

In [22]:
t2_df = df_filtered[['Match Date', 'Team2 Name', 'Match Winner','Team2 Runs Scored']].copy()
t2_df.columns = ['date', 'team','winner','runs']
t2_df['won_match'] = (t2_df['team'] == t2_df['winner']).astype(int)

In [23]:
full_history = pd.concat([t1_df,t2_df], axis=0)

In [24]:
full_history = full_history.sort_values(['team','date'])

In [25]:
full_history['wins_last_10'] = full_history.groupby('team')['won_match'].transform(
    lambda x: x.rolling(window=10, min_periods=1).sum().shift(1)
).fillna(0)

In [26]:
full_history['runs_last_10'] = full_history.groupby('team')['runs'].transform(
    lambda x: x.rolling(window=10, min_periods=1).mean().shift(1)
).fillna(0)

In [27]:
df_filtered = df_filtered.drop_duplicates(subset=['Match ID'])
print(f"Cleaned rows: {len(df_filtered)}")

Cleaned rows: 2154


In [28]:
full_history = full_history.drop_duplicates(subset=['date', 'team'])

In [29]:
df_filtered = pd.merge(df_filtered, full_history[['date', 'team', 'wins_last_10', 'runs_last_10']],
                       left_on=['Match Date', 'Team1 Name'], right_on=['date', 'team'], how='left')

df_filtered.rename(columns={'wins_last_10': 'Team1_Last10_Wins', 'runs_last_10': 'Team1_Last10_Runs'}, inplace=True)
df_filtered.drop(columns=['date', 'team'], inplace=True)

In [30]:
df_filtered = pd.merge(df_filtered, full_history[['date', 'team', 'wins_last_10', 'runs_last_10']],
                       left_on=['Match Date', 'Team2 Name'], right_on=['date', 'team'], how='left')

df_filtered.rename(columns={'wins_last_10': 'Team2_Last10_Wins', 'runs_last_10': 'Team2_Last10_Runs'}, inplace=True)
df_filtered.drop(columns=['date', 'team'], inplace=True)

In [31]:
print(df_filtered[['Match Date', 'Team1 Name', 'Team1_Last10_Wins','Team1_Last10_Runs', 'Team2 Name',
                   'Team2_Last10_Wins','Team2_Last10_Runs']].tail())

     Match Date  Team1 Name  Team1_Last10_Wins  Team1_Last10_Runs  Team2 Name  \
2149 2024-05-05    Thailand                6.0              109.3   Indonesia   
2150 2024-05-05    Zimbabwe                5.0              152.3  Bangladesh   
2151 2024-05-06   Indonesia                3.0              120.9    Thailand   
2152 2024-05-07       Japan                4.0              166.3    Mongolia   
2153 2024-05-07  Bangladesh                6.0              131.1    Zimbabwe   

      Team2_Last10_Wins  Team2_Last10_Runs  
2149                4.0              118.8  
2150                6.0              128.8  
2151                6.0              112.2  
2152                0.0               50.5  
2153                4.0              144.6  


In [35]:
feature_cols = [
    'Team1 Name', 'Team2 Name', 'Match Venue (Country)', 
    'Toss Winner', 'Toss Winner Choice', 'Home Advantage',
    'Team1_Last10_Wins', 'Team1_Last10_Runs',
    'Team2_Last10_Wins', 'Team2_Last10_Runs'
]
model_df['Target'] = (model_df['Match Winner'] == model_df['Team1 Name']).astype(int)


In [33]:
encoders={} #Tracking encoders
team_cols = ['Team1 Name', 'Team2 Name', 'Toss Winner']
team_le = LabelEncoder()
all_teams = pd.concat([model_df['Team1 Name'].astype(str), 
                       model_df['Team2 Name'].astype(str)]).unique()
team_le.fit(all_teams)

for col in team_cols:
    model_df[col] = team_le.transform(model_df[col].astype(str))
    encoders[col] = team_le
    
other_cols = ['Match Venue (Country)', 'Toss Winner Choice', 'Home Advantage']
for col in other_cols:
    le = LabelEncoder()
    model_df[col] = le.fit_transform(model_df[col].astype(str))
    encoders[col] = le   
        

In [36]:
X = model_df[feature_cols]
y = model_df['Target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2%}")
print("\nFeature Importance:")
importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)

Model Accuracy: 64.73%

Feature Importance:
Team1_Last10_Runs        0.174565
Team2_Last10_Runs        0.166494
Team1 Name               0.113321
Team2 Name               0.110605
Toss Winner              0.103207
Team1_Last10_Wins        0.096805
Match Venue (Country)    0.095781
Team2_Last10_Wins        0.088108
Home Advantage           0.032053
Toss Winner Choice       0.019060
dtype: float64


In [38]:
import pickle

with open('cricket_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)