In [22]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(),".."))

import model_utils

import polars as pl
# import data
def load_data(data_dir=os.path.join("..", "data", "filtered_data")):
    """Read the three filtered CSV tables used to build the model inputs.

    Args:
        data_dir: Directory containing ``balltoball.csv``, ``team12_stats.csv``
            and ``players_stats.csv``. Defaults to ``../data/filtered_data``,
            resolved relative to the current working directory.

    Returns:
        A tuple ``(balltoball, team_stats, players_stats)`` holding the result
        of ``pl.read_csv`` for each file, in that order.
    """
    balltoball = pl.read_csv(os.path.join(data_dir, "balltoball.csv"))
    team_stats = pl.read_csv(os.path.join(data_dir, "team12_stats.csv"))
    players_stats = pl.read_csv(os.path.join(data_dir, "players_stats.csv"))
    return balltoball, team_stats, players_stats
# Load the three tables once, unpacking them in the order load_data() returns them.
balltoball, team_stats, players_stats = load_data()

# Column names of each table, one list per line.
for _frame in (balltoball, team_stats, players_stats):
    print(_frame.columns)

# First row of every table, emitted by a single print call (space-separated).
_first_rows = [_frame.head(1) for _frame in (balltoball, team_stats, players_stats)]
print(*_first_rows)

['match_id', 'flip', 'innings', 'ball', 'runs', 'wickets', 'overs', 'run_rate', 'curr_score', 'curr_wickets', 'target', 'won']
['match_id', 'flip', 'gender', 'Cumulative Won team1', 'Cumulative Lost team1', 'Cumulative Tied team1', 'Cumulative W/L team1', 'Cumulative AveRPW team1', 'Cumulative AveRPO team1', 'Cumulative Won team2', 'Cumulative Lost team2', 'Cumulative Tied team2', 'Cumulative W/L team2', 'Cumulative AveRPW team2', 'Cumulative AveRPO team2']
['match_id', 'flip', 'Cum Mat Total', 'Cum Runs Total', 'Cum SR', 'Cumulative Overs', 'Cumulative Bowling Runs', 'Cumulative Wkts', 'Cumulative Econ', 'Cumulative Dis', 'Cumulative Ct', 'Cumulative St', 'Cumulative D/I']
shape: (1, 12)
┌──────────┬──────┬─────────┬──────┬───┬────────────┬──────────────┬────────┬─────┐
│ match_id ┆ flip ┆ innings ┆ ball ┆ … ┆ curr_score ┆ curr_wickets ┆ target ┆ won │
│ ---      ┆ ---  ┆ ---     ┆ ---  ┆   ┆ ---        ┆ ---          ┆ ---    ┆ --- │
│ i64      ┆ i64  ┆ i64     ┆ f64  ┆   ┆ i64      

In [23]:
def partition_data_with_keys(df, group_keys):
    """Split ``df`` into per-group arrays and report the key of each group.

    Args:
        df: Table exposing ``partition_by``; each resulting partition must
            support ``select``, ``drop`` and ``to_numpy``.
        group_keys: Column names that identify a group.

    Returns:
        ``(keys, partitions)`` — two parallel lists. ``keys[i]`` is a tuple with
        the ``group_keys`` values of partition ``i``; ``partitions[i]`` is that
        partition converted with ``to_numpy()`` after the key columns have been
        dropped.
    """
    keys = []
    arrays = []
    for part in df.partition_by(group_keys):
        # Within one partition the key columns collapse to a single distinct row.
        distinct_key_rows = part.select(group_keys).unique().to_numpy()
        keys.append(tuple(distinct_key_rows[0]))
        arrays.append(part.drop(group_keys).to_numpy())
    return keys, arrays

import numpy as np

# Split every table into one NumPy array per (match_id, flip) pair.
balltoball_keys, balltoball_partitions = partition_data_with_keys(balltoball, ["match_id", "flip"])
team_stats_keys, team_stats_partitions = partition_data_with_keys(team_stats, ["match_id", "flip"])
players_stats_keys, players_stats_partitions = partition_data_with_keys(players_stats, ["match_id", "flip"])

# Only keep matches that are present in all three tables.
common_keys = set(balltoball_keys) & set(team_stats_keys) & set(players_stats_keys)

# Key -> partition lookups so the three sources can be aligned by key.
balltoball_dict = dict(zip(balltoball_keys, balltoball_partitions))
team_stats_dict = dict(zip(team_stats_keys, team_stats_partitions))
players_stats_dict = dict(zip(players_stats_keys, players_stats_partitions))

aligned_balltoball_partitions = []
aligned_team_stats_partitions = []
aligned_players_stats_partitions = []
labels = []

# Iterate in sorted key order: a set has no defined iteration order, and the
# position of each sample in these lists decides which split it lands in when
# the data is later split by index.
for key in sorted(common_keys):
    balltoball_partition = balltoball_dict[key]

    # The last ball-by-ball column is `won`; its value in the first row is used
    # as the label for the whole match and the column is removed from the
    # features. NOTE(review): assumes `won` is constant within a match — confirm.
    labels.append(balltoball_partition[0, -1])
    aligned_balltoball_partitions.append(balltoball_partition[:, :-1])
    aligned_team_stats_partitions.append(team_stats_dict[key])
    aligned_players_stats_partitions.append(players_stats_dict[key])

labels = np.array(labels)

In [24]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset

def _as_numpy(table):
    """Return ``table`` as a NumPy array, converting only if it is a polars DataFrame."""
    return table.to_numpy() if isinstance(table, pl.DataFrame) else table


BATCH_SIZE = 32   # samples per batch for every loader
SPLIT_SEED = 42   # random_state for both train_test_split calls

team_data = [_as_numpy(team) for team in aligned_team_stats_partitions]
player_data = [_as_numpy(players) for players in aligned_players_stats_partitions]
ball_data = [_as_numpy(ball) for ball in aligned_balltoball_partitions]

# 80 % train, then the remaining 20 % is halved into validation and test.
train_indices, holdout_indices = train_test_split(
    np.arange(len(labels)), test_size=0.2, random_state=SPLIT_SEED
)
val_indices, test_indices = train_test_split(
    holdout_indices, test_size=0.5, random_state=SPLIT_SEED
)

dataset = model_utils.CricketDataset(
    team_data,
    player_data,
    ball_data,
    labels
)

train_dataset = Subset(dataset, train_indices)
val_dataset = Subset(dataset, val_indices)
test_dataset = Subset(dataset, test_indices)

# Only the training loader is shuffled; validation and test are iterated in a
# fixed order so evaluation runs are repeatable.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=model_utils.collate_fn_with_padding)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=model_utils.collate_fn_with_padding)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=model_utils.collate_fn_with_padding)

# Sanity-check one batch. The batch labels get their own name so the global
# `labels` array used to build the dataset is not overwritten; otherwise
# re-running this cell would silently build the split from one batch.
team_input, player_input, ball_input, batch_labels, mask = next(iter(train_dataloader))
print(f"Team input shape: {team_input.shape}")  # [batch_size, team_feature_dim]
print(f"Player input shape: {player_input.shape}")  # [batch_size, num_players, player_feature_dim]
print(f"Padded ball input shape: {ball_input.shape}")  # [batch_size, max_seq_len, ball_feature_dim]
print(f"Mask shape: {mask.shape}")  # [batch_size, max_seq_len]
print(f"Labels shape: {batch_labels.shape}")  # [batch_size]

Team input shape: torch.Size([32, 13])
Player input shape: torch.Size([32, 22, 11])
Padded ball input shape: torch.Size([32, 255, 9])
Mask shape: torch.Size([32, 255])
Labels shape: torch.Size([32])


In [25]:
import pickle
# Directory that receives the pickled DataLoader objects.
pytorch_data_dir = os.path.join('..', "data", "pytorch_data")
# Create it on first use so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(pytorch_data_dir, exist_ok=True)

# One file per split: train_dataloader.pkl, val_dataloader.pkl, test_dataloader.pkl.
for split_name, loader in (
    ("train", train_dataloader),
    ("val", val_dataloader),
    ("test", test_dataloader),
):
    with open(os.path.join(pytorch_data_dir, f"{split_name}_dataloader.pkl"), "wb") as f:
        pickle.dump(loader, f)

In [26]:
def _load_dataloader(split_name):
    """Unpickle ``../data/pytorch_data/<split_name>_dataloader.pkl`` and return it.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes instead of being left to the garbage collector.
    """
    path = os.path.join('..', "data", "pytorch_data", f"{split_name}_dataloader.pkl")
    # SECURITY: pickle.load can execute arbitrary code. Only load files that
    # this project wrote itself; never point this at untrusted data.
    with open(path, "rb") as f:
        return pickle.load(f)


train_dataloader = _load_dataloader("train")
val_dataloader = _load_dataloader("val")
test_dataloader = _load_dataloader("test")

In [27]:
# Number of batches in the training loader. len() reads this from the loader
# directly, so the whole dataset is not loaded and collated just to count.
num_train_batches = len(train_dataloader)
print(num_train_batches)

57


In [28]:
# Pull a single batch rather than looping and breaking after the first iteration.
batch = next(iter(train_dataloader))

# Draw the sample position from the real batch length instead of a hard-coded
# 32, so a batch with fewer elements cannot cause an out-of-range index.
sample_idx = np.random.randint(len(batch[0]))

data0 = np.array(batch[0][sample_idx]).reshape(1, -1)  # team features as a single row
data1 = np.array(batch[1][sample_idx])                 # player features
data2 = np.array(batch[2][sample_idx])                 # padded ball-by-ball features
print(data0.shape, data1.shape, data2.shape)

# Re-attach column names. `match_id` and `flip` (the first two columns) were
# dropped when the tables were partitioned, and the trailing `won` column of
# the ball-by-ball table was split off as the label — hence [2:] and [2:-1].
team0 = pl.DataFrame(data0, schema=team_stats.columns[2:])
players1 = pl.DataFrame(data1, schema=players_stats.columns[2:])
balltoball2 = pl.DataFrame(data2, schema=balltoball.columns[2:-1])
team0

(1, 13) (22, 11) (265, 9)


gender,Cumulative Won team1,Cumulative Lost team1,Cumulative Tied team1,Cumulative W/L team1,Cumulative AveRPW team1,Cumulative AveRPO team1,Cumulative Won team2,Cumulative Lost team2,Cumulative Tied team2,Cumulative W/L team2,Cumulative AveRPW team2,Cumulative AveRPO team2
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0.0,9.0,21.0,0.0,0.43,19.049999,7.29,32.0,21.0,1.0,1.52,24.360001,7.67


In [29]:
# Display the player-stats frame rebuilt from the sampled batch element.
players1

Cum Mat Total,Cum Runs Total,Cum SR,Cumulative Overs,Cumulative Bowling Runs,Cumulative Wkts,Cumulative Econ,Cumulative Dis,Cumulative Ct,Cumulative St,Cumulative D/I
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
6.0,145.0,108.709999,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.5
50.0,1203.0,113.529999,31.6,231.0,5.0,7.78,27.0,25.0,2.0,0.54
18.0,211.0,83.5,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.11
45.0,1178.0,118.410004,0.0,0.0,0.0,0.0,38.0,20.0,18.0,0.84
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…
8.0,56.0,88.459999,16.0,175.0,4.0,10.83,3.0,3.0,0.0,0.38
4.0,19.0,103.459999,16.0,130.0,3.0,8.13,1.0,1.0,0.0,0.25
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20.0,196.0,129.020004,74.300003,612.0,19.0,8.21,3.0,3.0,0.0,0.15


In [30]:
# Display the padded ball-by-ball frame rebuilt from the sampled batch element.
balltoball2

innings,ball,runs,wickets,overs,run_rate,curr_score,curr_wickets,target
f32,f32,f32,f32,f32,f32,f32,f32,f32
1.0,0.1,2.0,0.0,0.0,0.0,2.0,0.0,0.0
1.0,0.2,4.0,0.0,0.0,0.0,6.0,0.0,0.0
1.0,0.3,1.0,0.0,0.0,0.0,7.0,0.0,0.0
1.0,0.4,0.0,0.0,0.0,0.0,7.0,0.0,0.0
1.0,0.5,0.0,0.0,0.0,0.0,7.0,0.0,0.0
…,…,…,…,…,…,…,…,…
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
