In [2]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F  # Import F module
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Load the data using polars.
# The data folder defaults to the original author's local checkout; set the
# CRICKET_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
directory = os.environ.get('CRICKET_DATA_DIR', r'D:\github\Cricket-prediction\data\4_filteredData')
balltoball = pl.read_csv(os.path.join(directory, 'balltoball.csv'))
teamStats = pl.read_csv(os.path.join(directory, 'team12Stats.csv'))
playersStats = pl.read_csv(os.path.join(directory, 'playersStats.csv'))

# Preprocess the data
def partition_data(df, group_keys):
    """Split a frame into per-group NumPy arrays.

    Args:
        df: Frame exposing ``partition_by`` (a polars DataFrame in this notebook).
        group_keys: Column names that define a group.

    Returns:
        A list with one NumPy array per partition, in the order ``partition_by``
        yields them, with the ``group_keys`` columns removed.
    """
    arrays = []
    for group_frame in df.partition_by(group_keys):
        # The key columns only identify the group; they are not model features.
        arrays.append(group_frame.drop(group_keys).to_numpy())
    return arrays

# Partition every table by (match_id, flip). The three resulting lists are later
# consumed position-by-position, so they are built with the same keys in one place.
team_stats_partitions, player_stats_partitions, ball_stats_partitions = (
    partition_data(frame, ['match_id', 'flip'])
    for frame in (teamStats, playersStats, balltoball)
)

In [5]:
# Display the team statistics frame to inspect its columns and dtypes.
teamStats

match_id,flip,gender,Mat,Won,Lost,Tied,NR,W/L,Inns,HS,LS,AveRPW,AveRPO,Mat_team2,Won_team2,Lost_team2,Tied_team2,NR_team2,W/L_team2,Inns_team2,HS_team2,LS_team2,AveRPW_team2,AveRPO_team2
i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,f64,f64,f64,i64,i64,i64,i64,i64,f64,i64,i64,f64,f64,f64
211028,0,1,1,1,0,0,0,1.0,1,179,0.0,22.37,8.95,1,0,1,0,0,0.0,1,79,79.0,7.9,5.44
211028,1,1,1,0,1,0,0,0.0,1,79,79.0,7.9,5.44,1,1,0,0,0,1.0,1,179,0.0,22.37,8.95
211048,0,1,1,0,1,0,0,0.0,1,170,170.0,17.0,8.5,1,1,0,0,0,1.0,1,214,0.0,42.8,10.7
211048,1,1,1,1,0,0,0,1.0,1,214,0.0,42.8,10.7,1,0,1,0,0,0.0,1,170,170.0,17.0,8.5
225263,0,1,2,0,2,0,0,0.0,2,161,0.0,25.41,7.62,1,1,0,0,0,1.0,1,148,0.0,29.6,8.29
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1450753,1,1,5,5,0,0,0,5.0,5,183,119.0,21.28,7.81,5,0,5,0,0,0.0,5,57,18.0,3.56,2.67
1450759,0,1,5,0,5,0,0,0.0,5,57,18.0,3.56,2.67,10,6,4,0,0,1.5,10,176,100.0,26.9,7.11
1450759,1,1,10,6,4,0,0,1.5,10,176,100.0,26.9,7.11,5,0,5,0,0,0.0,5,57,18.0,3.56,2.67
1450765,0,1,5,5,0,0,0,5.0,5,183,119.0,21.28,7.81,10,6,4,0,0,1.5,10,176,100.0,26.9,7.11


In [20]:
# Augment the data by creating new samples with different combinations of overs
def augment_data(team_stats_list, player_stats_list, ball_stats_list, over_segments=np.arange(6, 41), balls_per_over=6):
    """Create extra samples by truncating each ball-by-ball sequence at several over counts.

    For every match (one entry per position in the three input lists) and every
    ``segment`` in ``over_segments`` that the match is long enough to cover, one
    sample is emitted whose ball data is the first ``segment * balls_per_over`` rows.

    Args:
        team_stats_list: Per-match team statistics, aligned by position.
        player_stats_list: Per-match player statistics, aligned by position.
        ball_stats_list: Per-match ball-by-ball arrays (rows are deliveries).
        over_segments: Over counts to truncate at. Defaults to 6..40 inclusive.
        balls_per_over: Number of rows that make up one over. Defaults to 6.

    Returns:
        Three lists of equal length: team stats, player stats and truncated ball
        stats. Team and player entries are the same objects as in the inputs
        (repeated once per emitted sample); ball entries are slices of the inputs.

    Raises:
        ValueError: If the three input lists differ in length (they would be
            silently misaligned otherwise) or ``balls_per_over`` is not positive.
    """
    if balls_per_over <= 0:
        raise ValueError(f"balls_per_over must be positive, got {balls_per_over}")
    if not (len(team_stats_list) == len(player_stats_list) == len(ball_stats_list)):
        raise ValueError(
            "team, player and ball lists must have the same length, got "
            f"{len(team_stats_list)}, {len(player_stats_list)}, {len(ball_stats_list)}"
        )

    augmented_team_stats = []
    augmented_player_stats = []
    augmented_ball_stats = []

    for team_stats, player_stats, ball_stats in zip(team_stats_list, player_stats_list, ball_stats_list):
        # Only complete overs count; a trailing partial over is ignored.
        total_overs = ball_stats.shape[0] // balls_per_over
        for segment in over_segments:
            if total_overs >= segment:
                end_idx = segment * balls_per_over
                augmented_team_stats.append(team_stats)
                augmented_player_stats.append(player_stats)
                augmented_ball_stats.append(ball_stats[:end_idx])

    return augmented_team_stats, augmented_player_stats, augmented_ball_stats

# Split the (match_id, flip) partitions into train / validation / test BEFORE augmenting.
# Augmentation emits many prefixes of the same ball-by-ball sequence, all sharing the
# same team stats, player stats and first-row label. Splitting after augmentation
# scattered those near-duplicate samples across the three sets, leaking training
# information into validation and test.
# NOTE(review): the flip=0 and flip=1 partitions of one match can still land in
# different splits, because partition_data drops match_id; a group-aware split
# would need to keep it.
(train_team_parts, holdout_team_parts,
 train_player_parts, holdout_player_parts,
 train_ball_parts, holdout_ball_parts) = train_test_split(
    team_stats_partitions, player_stats_partitions, ball_stats_partitions,
    test_size=0.2, random_state=42)

# Halve the 20% hold-out into validation and test.
(val_team_parts, test_team_parts,
 val_player_parts, test_player_parts,
 val_ball_parts, test_ball_parts) = train_test_split(
    holdout_team_parts, holdout_player_parts, holdout_ball_parts,
    test_size=0.5, random_state=42)

# Augment each split independently so no source sequence contributes to two splits.
train_team_stats, train_player_stats, train_ball_stats = augment_data(
    train_team_parts, train_player_parts, train_ball_parts)
val_team_stats, val_player_stats, val_ball_stats = augment_data(
    val_team_parts, val_player_parts, val_ball_parts)
test_team_stats, test_player_stats, test_ball_stats = augment_data(
    test_team_parts, test_player_parts, test_ball_parts)

# Kept so cells that expect the complete augmented set still find these names.
augmented_team_stats = train_team_stats + val_team_stats + test_team_stats
augmented_player_stats = train_player_stats + val_player_stats + test_player_stats
augmented_ball_stats = train_ball_stats + val_ball_stats + test_ball_stats

In [21]:

# Create a custom Dataset
class CricketDataset(Dataset):
    """Dataset serving one (team, player, ball-by-ball, label) sample per index.

    The three lists hold one array-like per sample and are read at the same
    position, so they must be aligned with each other.
    """

    def __init__(self, team_stats_list, player_stats_list, ball_stats_list):
        self.team_stats_list = team_stats_list
        self.player_stats_list = player_stats_list
        self.ball_stats_list = ball_stats_list

    def __len__(self):
        # The team list is used as the reference for the number of samples.
        return len(self.team_stats_list)

    @staticmethod
    def _as_float32(values):
        """Copy ``values`` into a new float32 tensor."""
        return torch.tensor(values, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return ``(team_input, player_input, ball_input, label)`` for sample ``idx``.

        The ball array is assumed to carry the label in its last column; the
        label is read from the first row and that column is removed from the
        ball features.
        """
        # squeeze() removes every size-1 dimension of the team statistics.
        team_features = self._as_float32(self.team_stats_list[idx]).squeeze()
        player_features = self._as_float32(self.player_stats_list[idx])
        balls = self._as_float32(self.ball_stats_list[idx])

        ball_features = balls[:, :-1]
        target = balls[0, -1]
        return team_features, player_features, ball_features, target

# Define a collate function to handle variable-length sequences
def collate_fn(batch):
    """Collate variable-length samples into batched tensors.

    Args:
        batch: Sequence of ``(team_input, player_input, ball_input, label)``
            tuples as produced by ``CricketDataset.__getitem__``.

    Returns:
        ``(team_inputs, player_inputs, padded_ball_inputs, labels, ball_lengths)``.
        Team and player inputs are stacked along a new first dimension. Ball
        inputs are zero-padded at the end up to the longest sequence in the
        batch. ``ball_lengths`` is a list with each sample's unpadded length.
    """
    team_inputs, player_inputs, ball_inputs, labels = zip(*batch)
    ball_lengths = [sequence.shape[0] for sequence in ball_inputs]

    # Zero-filled buffer sized for the longest sequence; shorter ones keep trailing zeros.
    padded_ball_inputs = torch.zeros(len(ball_inputs), max(ball_lengths), ball_inputs[0].shape[1])
    for row, (sequence, length) in enumerate(zip(ball_inputs, ball_lengths)):
        padded_ball_inputs[row, :length, :] = sequence

    return (
        torch.stack(list(team_inputs)),
        torch.stack(list(player_inputs)),
        padded_ball_inputs,
        torch.tensor(list(labels), dtype=torch.float32),
        ball_lengths,
    )

# Wrap each split in a CricketDataset, then in a DataLoader that batches with collate_fn.
train_dataset = CricketDataset(train_team_stats, train_player_stats, train_ball_stats)
val_dataset = CricketDataset(val_team_stats, val_player_stats, val_ball_stats)
test_dataset = CricketDataset(test_team_stats, test_player_stats, test_ball_stats)

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=16, collate_fn=collate_fn)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [22]:
# Sanity check: print the tensor shapes of the first five training batches.
for i, (team_inputs, player_inputs, ball_inputs, labels, ball_lengths) in enumerate(train_dataloader, start=1):
    print(team_inputs.shape, player_inputs.shape, ball_inputs.shape, labels.shape)
    if i >= 5:
        break

torch.Size([16, 23]) torch.Size([16, 22, 22]) torch.Size([16, 234, 4]) torch.Size([16])
torch.Size([16, 23]) torch.Size([16, 22, 22]) torch.Size([16, 240, 4]) torch.Size([16])
torch.Size([16, 23]) torch.Size([16, 22, 22]) torch.Size([16, 216, 4]) torch.Size([16])
torch.Size([16, 23]) torch.Size([16, 22, 22]) torch.Size([16, 228, 4]) torch.Size([16])
torch.Size([16, 23]) torch.Size([16, 22, 22]) torch.Size([16, 234, 4]) torch.Size([16])


In [24]:
# Save the three DataLoaders into the sibling folder ../pytorchData.
# NOTE: pickle stores classes and functions by reference, so CricketDataset and
# collate_fn must be defined under the same names wherever these files are loaded.
# Only load these pickles from a trusted source: unpickling can execute code.
output_dir = os.path.join(directory, '..', 'pytorchData')  # built per component so it is not Windows-only
os.makedirs(output_dir, exist_ok=True)  # open() would fail if the folder did not exist yet

for split_name, dataloader in (
    ('train', train_dataloader),
    ('val', val_dataloader),
    ('test', test_dataloader),
):
    with open(os.path.join(output_dir, f'{split_name}_dataloader.pkl'), 'wb') as f:
        pickle.dump(dataloader, f)