In [4]:
# Importing modules
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset from "cleaned_data2.csv" (path is relative to the
# notebook's working directory) into the DataFrame used by the cells below.
matches = pd.read_csv("cleaned_data2.csv")

In [5]:
# Print a summary of the frame: column names, non-null counts and dtypes.
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1928 entries, 0 to 1927
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1928 non-null   int64 
 1   innings       1928 non-null   int64 
 2   BattingTeam   1928 non-null   object
 3   TotalRuns     1928 non-null   int64 
 4   Team1         1928 non-null   object
 5   Team2         1928 non-null   object
 6   Venue         1928 non-null   object
 7   TossWinner    1928 non-null   object
 8   TossDecision  1928 non-null   object
 9   WinningTeam   1922 non-null   object
 10  Team1Players  1928 non-null   object
 11  Team2Players  1928 non-null   object
dtypes: int64(3), object(9)
memory usage: 180.9+ KB


# Handling Text Attributes

In [18]:
# Replace the text values of the 'Team1' column with integer labels.
encoder = LabelEncoder()
team1 = matches['Team1']
print(type(team1))
# fit_transform learns the distinct values of this one column and returns
# the integer code for every row.
team1_encoded = encoder.fit_transform(team1)
# Overwrites the original text column in `matches`; the team names are only
# recoverable afterwards through this fitted encoder.
# NOTE(review): the next cell rebinds `encoder`, so keep a separate reference
# if the Team1 mapping is needed later — confirm.
matches['Team1'] = team1_encoded
team1_encoded

<class 'pandas.core.series.Series'>


array([16, 16,  6, ..., 16, 13, 13], dtype=int64)

In [19]:
# Replace the text values of the 'Team2' column with integer labels.
# A new LabelEncoder is created here and bound to the same name `encoder`,
# so the encoder fitted on 'Team1' is no longer reachable through that name.
encoder = LabelEncoder()
team2 = matches['Team2']

# Fitted on 'Team2' alone.
# NOTE(review): because 'Team1' and 'Team2' are fitted independently, the
# integer given to a team here is not guaranteed to equal the integer given
# to the same team in 'Team1' — confirm whether a shared encoding is wanted.
team2_encoded = encoder.fit_transform(team2)
matches['Team2'] = team2_encoded
matches.head(1)

Unnamed: 0,ID,innings,BattingTeam,TotalRuns,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,Team1Players,Team2Players
0,335982,1,Kolkata Knight Riders,222,16,8,M Chinnaswamy Stadium,Royal Challengers Bangalore,field,Kolkata Knight Riders,"['R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis...","['SC Ganguly', 'BB McCullum', 'RT Ponting', 'D..."


# Train-Test Split

In [3]:
def split_train_test(data, test_ratio, seed=42):
    """Randomly partition ``data`` into a train set and a test set.

    A permutation of the row positions is drawn from a private
    ``np.random.RandomState(seed)``, so the split is reproducible without
    reseeding NumPy's global random generator as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object supporting ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of rows placed in the test set, between 0 and 1 inclusive.
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    seed : int, optional
        Seed for the private generator. The default, 42, produces the same
        permutation as seeding the global generator with 42.

    Returns
    -------
    tuple
        ``(train, test)``, both selected from ``data`` with ``.iloc``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            f"test_ratio must be between 0 and 1 inclusive, got {test_ratio!r}"
        )

    # Private generator: same stream as np.random.seed(seed) followed by
    # np.random.permutation, but the caller's global RNG state is untouched.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

#train_set, test_set = split_train_test(matches, 0.2)

In [5]:
#print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")

Rows in train set: 180764
Rows in test set: 45190


In [20]:
from sklearn.model_selection import train_test_split

# Random 80/20 split of the rows of `matches`; random_state=42 fixes the
# shuffle so repeated runs give the same partition.
train_set, test_set = train_test_split(matches, test_size=0.2, random_state=42)

# Report the size of each partition.
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}")

Rows in train set: 1542
Rows in test set: 386


In [22]:
# Show the first row of the training partition as a quick sanity check.
train_set.head(1)

Unnamed: 0,ID,innings,BattingTeam,TotalRuns,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,Team1Players,Team2Players
1352,1136595,1,Royal Challengers Bangalore,127,16,0,Maharashtra Cricket Association Stadium,Chennai Super Kings,field,Chennai Super Kings,"['PA Patel', 'BB McCullum', 'V Kohli', 'AB de ...","['SR Watson', 'AT Rayudu', 'SK Raina', 'DR Sho..."


In [15]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split, stratified on the "isWicketDelivery" column.
# NOTE(review): the `matches.info()` output near the top of this notebook does
# not list an "isWicketDelivery" column, so this cell presumably expects a
# different dataset to be loaded into `matches` — confirm before running.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields arrays of *positional* row indices, so rows are selected with
# .iloc; label-based .loc is only equivalent while the frame has a default
# RangeIndex. With n_splits=1 there is exactly one (train, test) pair.
train_index, test_index = next(split.split(matches, matches["isWicketDelivery"]))
strat_train_set = matches.iloc[train_index]
strat_test_set = matches.iloc[test_index]

strat_test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45191 entries, 68028 to 164825
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ID                 45191 non-null  int64 
 1   innings            45191 non-null  int64 
 2   overs              45191 non-null  int64 
 3   ballnumber         45191 non-null  int64 
 4   batter             45191 non-null  object
 5   bowler             45191 non-null  object
 6   non-striker        45191 non-null  object
 7   extra_type         2388 non-null   object
 8   batsman_run        45191 non-null  int64 
 9   extras_run         45191 non-null  int64 
 10  total_run          45191 non-null  int64 
 11  non_boundary       45191 non-null  int64 
 12  isWicketDelivery   45191 non-null  int64 
 13  player_out         2230 non-null   object
 14  kind               2230 non-null   object
 15  fielders_involved  1582 non-null   object
 16  BattingTeam        45191 non-null  object

In [19]:
# Count how many rows of the stratified training set fall in each
# "isWicketDelivery" value.
strat_train_set["isWicketDelivery"].value_counts()

0    171842
1      8921
Name: isWicketDelivery, dtype: int64