In [42]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [43]:
def extract_seconds_from_launch(game_state_str):
    try:
        return json.loads(game_state_str).get('seconds_from_launch', 0)
    except json.JSONDecodeError:
        return 0

In [44]:
df = pd.read_csv(r'C:\Users\oguo2\GitHub\PenguinsAI\RawData\PENGUINS_20240101_to_20240131_df72162_events.tsv', delimiter='\t')

In [45]:
def session_then_launch(target_csv, output_name, output_status):
    target_csv['seconds_from_launch'] = target_csv['game_state'].apply(extract_seconds_from_launch)
    df_sorted = target_csv.sort_values(by=['session_id', 'index'])
    if output_status:
        df_sorted.to_csv(output_name, sep='\t', index=False)
    else:
        return df_sorted


In [46]:
df_sorted = session_then_launch(df,"Sorted_Jan", False)

In [47]:
df_sorted.columns

Index(['session_id', 'app_id', 'timestamp', 'event_name', 'event_data',
       'event_source', 'app_version', 'app_branch', 'log_version', 'offset',
       'user_id', 'user_data', 'game_state', 'index', 'seconds_from_launch'],
      dtype='object')

In [48]:
def check_event_starts(group, event_name):
    mismatches = []
    # Get indices of the specified event
    event_indices = group[group['event_name'] == event_name].index
    # Check each event occurrence
    for idx in event_indices:
        # If it's not the first event or if the previous 'session_id' is the same, it's a mismatch
        if idx != group.index[0] and group.at[idx, 'session_id'] == group.at[idx - 1, 'session_id']:
            mismatches.append(group.at[idx, 'session_id'])
    return mismatches

In [49]:
def find_mismatches(df, event_name):
    # Group by 'session_id' and apply the checking function for the specified event name
    mismatched_sessions = df.groupby('session_id').apply(lambda g: check_event_starts(g, event_name))

    # Flatten the list of mismatched sessions
    mismatched_sessions = [item for sublist in mismatched_sessions for item in sublist]

    # Print out the sessions with mismatches
    if mismatched_sessions:
        print(f"The total of '{len(mismatched_sessions)}'following sessions have '{event_name}' events that do not match a change in 'session_id':")
        for session in mismatched_sessions:
            print(session)
    else:
        print(f"All '{event_name}' events match a change in 'session_id'.")


In [50]:
find_mismatches(df_sorted, 'device_identifier')

All 'device_identifier' events match a change in 'session_id'.


  mismatched_sessions = df.groupby('session_id').apply(lambda g: check_event_starts(g, event_name))


In [51]:
len(df[df['index']==0])

471

In [52]:
df['event_name'].nunique()

30

In [53]:
def convert_to_lstm_input_format(df, fraction=1.0):
    # Initialize the one-hot encoder for event names
    valid_events = df['event_name'].unique()
    event_encoder = OneHotEncoder()
    event_encoder.fit(valid_events.reshape(-1, 1))

    sessions = df['session_id'].unique()
    all_sequences = []
    all_labels = []

    for session in sessions:
        session_data = df[df['session_id'] == session].sort_values('timestamp')
        
        # Determine the label for the whole session before applying the fraction cut-off
        label = 1 if 'egg_hatched' in session_data['event_name'].values or 'nest_complete' in session_data['event_name'].values else 0
        
        # Apply fraction cut-off
        limit = int(len(session_data) * fraction)
        session_data = session_data.head(limit)

        sequence = []

        for _, row in session_data.iterrows():
            # Skip the 'egg_hatched' or 'nest_complete' events
            if row['event_name'] in ['egg_hatched', 'nest_complete']:
                continue

            game_state = json.loads(row['game_state'])
            features = [
                game_state.get('posX', 0),
                game_state.get('posY', 0),
                game_state.get('posZ', 0),
                game_state.get('rotW', 0),
                game_state.get('rotX', 0),
                game_state.get('rotY', 0),
                game_state.get('rotZ', 0),
                game_state.get('seconds_from_launch', 0)
            ]
            
            event_encoded = event_encoder.transform([[row['event_name']]]).toarray()
            features.extend(event_encoded.flatten().tolist())
            sequence.append(features)
        
        if sequence:  # Add to the dataset only if there is data in the sequence
            scaler = StandardScaler()
            sequence = np.array(sequence)
            sequence[:, :8] = scaler.fit_transform(sequence[:, :8])
            all_sequences.append(sequence)
            all_labels.append(label)

    return all_sequences, all_labels


In [54]:
all_sequences_full, all_labels_full = convert_to_lstm_input_format(df=df_sorted)

In [62]:
(all_labels_full.count(1))/len(all_labels_full)

0.32696390658174096

In [83]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.model_selection import train_test_split

# Check if GPU is available and set the device accordingly
device = torch.device( 'cpu')
print('Using device:', device)

def extract_fraction_of_sequences(all_sequences, fraction):
    fraction_sequences = []
    for sequence in all_sequences:
        limit = int(len(sequence) * fraction)
        fraction_sequence = sequence[:limit]
        fraction_sequences.append(fraction_sequence)
    return fraction_sequences
def calculate_lengths(sequences_batch):
    lengths = (sequences_batch != 0).sum(dim=2) # assuming the padding is zero, and the non-feature dimension is 2
    lengths, sorted_idx = lengths.sort(descending=True)
    return lengths.cpu(), sorted_idx.cpu() # ensure indices are on the CPU for use with data
def binary_accuracy(preds, y):
    # Round predictions to the closest integer (0 or 1)
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, lengths):
        # Pack the batch
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (ht, ct) = self.lstm(packed_input)
        out = self.dropout(ht[-1])
        return self.fc(out)
    
def sequences_to_padded_tensor(sequences):
    # Padding each sequence to the length of the longest sequence in the batch
    sequences_tensor = [torch.tensor(sequence) for sequence in sequences]
    sequences_padded = pad_sequence(sequences_tensor, batch_first=True)
    return sequences_padded.float().to(device)

Using device: cpu


In [87]:
fractions = [i/10 for i in range(1, 11)]  # Fractions from 0.1 to 1.0
accuracies = []

for fraction in fractions:
    # Process the data
    sequences_fraction = extract_fraction_of_sequences(all_sequences_full, fraction)
    labels_fraction = all_labels_full[:len(sequences_fraction)]
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(sequences_fraction, labels_fraction, test_size=0.2, random_state=42)

    # Convert training and testing data to tensors
    X_train_padded = sequences_to_padded_tensor(X_train)
    X_test_padded = sequences_to_padded_tensor(X_test)
    y_train_tensor = torch.tensor(y_train, device=device).float()
    y_test_tensor = torch.tensor(y_test, device=device).float()

    print(X_test_padded.shape, X_train_padded.shape)
    # Create TensorDatasets and DataLoaders
    train_dataset = TensorDataset(X_train_padded, y_train_tensor)
    test_dataset = TensorDataset(X_test_padded, y_test_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last=True)


    # Define the model, loss function, and optimizer
    input_size = X_train_padded.size(2)  # Number of features
    hidden_size = 64  # Can be tuned
    num_layers = 1  # Can be tuned
    num_classes = 1  # Binary classification

    model = LSTMClassifier(input_size, hidden_size, num_layers, num_classes).to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    num_epochs = 10  # Can be tuned
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        for i, (sequences_batch, labels_batch) in enumerate(train_loader):
            print("Batch Shape:", sequences_batch.shape, labels_batch.shape) 

            lengths, sorted_idx = calculate_lengths(sequences_batch)
            print("Max sorted_idx:", sorted_idx.max().item(), "Batch Size:", sequences_batch.size(0))  # Debug print

            if sorted_idx.max() >= sequences_batch.size(0):
                print("Index out of bounds detected.")
                continue

            sequences_batch = sequences_batch[sorted_idx].to(device)
            labels_batch = labels_batch[sorted_idx].to(device)
            lengths = lengths.to(device)

            # Forward pass
            outputs = model(sequences_batch, lengths)

            # Calculate loss
            loss = criterion(outputs.view(-1), labels_batch)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i+1) % 10 == 0:
                acc = binary_accuracy(outputs.view(-1), labels_batch)
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {acc:.2f}')

    # Evaluation loop
    model.eval()  # Set model to evaluation mode
    test_loss = 0.0
    total_accuracy = 0.0
    with torch.no_grad():
        for sequences_batch, labels_batch in test_loader:
            # Move lengths to the same device
            lengths, sorted_idx = calculate_lengths(sequences_batch)
            sequences_batch = sequences_batch[sorted_idx].to(device)
            labels_batch = labels_batch[sorted_idx].to(device)
            lengths = lengths.to(device)

            # Forward pass
            outputs = model(sequences_batch, lengths)
            loss = criterion(outputs.view(-1), labels_batch)
            test_loss += loss.item()
            accuracy = binary_accuracy(outputs.view(-1), labels_batch)
            total_accuracy += accuracy.item()

    # Calculate average loss and accuracy
    avg_test_loss = test_loss / len(test_loader)
    avg_accuracy = total_accuracy / len(test_loader)
    print(f'Test Loss: {avg_test_loss:.4f}, Test Accuracy: {avg_accuracy:.2f}')
    accuracies.append(avg_accuracy)


torch.Size([95, 1062, 38]) torch.Size([376, 1440, 38])
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch Size: 32
Index out of bounds detected.
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch Size: 32
Index out of bounds detected.
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch Size: 32
Index out of bounds detected.
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch Size: 32
Index out of bounds detected.
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch Size: 32
Index out of bounds detected.
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch Size: 32
Index out of bounds detected.
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch Size: 32
Index out of bounds detected.
Batch Shape: torch.Size([32, 1440, 38]) torch.Size([32])
Max sorted_idx: 1439 Batch 

IndexError: index 32 is out of bounds for dimension 0 with size 32