In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
def extract_seconds_from_launch(game_state_str):
    try:
        return json.loads(game_state_str).get('seconds_from_launch', 0)
    except json.JSONDecodeError:
        return 0

In [3]:
df = pd.read_csv(r'C:\Users\oguo2\GitHub\PenguinsAI\RawData\PENGUINS_20240101_to_20240131_df72162_events.tsv', delimiter='\t')

In [4]:
def session_then_launch(target_csv, output_name, output_status):
    target_csv['seconds_from_launch'] = target_csv['game_state'].apply(extract_seconds_from_launch)
    df_sorted = target_csv.sort_values(by=['session_id', 'index'])
    if output_status:
        df_sorted.to_csv(output_name, sep='\t', index=False)
    else:
        return df_sorted


In [5]:
df_sorted = session_then_launch(df,"Sorted_Jan", False)

In [6]:
df.columns

Index(['session_id', 'app_id', 'timestamp', 'event_name', 'event_data',
       'event_source', 'app_version', 'app_branch', 'log_version', 'offset',
       'user_id', 'user_data', 'game_state', 'index', 'seconds_from_launch'],
      dtype='object')

In [7]:
def check_event_starts(group, event_name):
    mismatches = []
    # Get indices of the specified event
    event_indices = group[group['event_name'] == event_name].index
    # Check each event occurrence
    for idx in event_indices:
        # If it's not the first event or if the previous 'session_id' is the same, it's a mismatch
        if idx != group.index[0] and group.at[idx, 'session_id'] == group.at[idx - 1, 'session_id']:
            mismatches.append(group.at[idx, 'session_id'])
    return mismatches

In [8]:
def find_mismatches(df, event_name):
    # Group by 'session_id' and apply the checking function for the specified event name
    mismatched_sessions = df.groupby('session_id').apply(lambda g: check_event_starts(g, event_name))

    # Flatten the list of mismatched sessions
    mismatched_sessions = [item for sublist in mismatched_sessions for item in sublist]

    # Print out the sessions with mismatches
    if mismatched_sessions:
        print(f"The total of '{len(mismatched_sessions)}'following sessions have '{event_name}' events that do not match a change in 'session_id':")
        for session in mismatched_sessions:
            print(session)
    else:
        print(f"All '{event_name}' events match a change in 'session_id'.")


In [9]:
find_mismatches(df_sorted, 'device_identifier')

All 'device_identifier' events match a change in 'session_id'.


  mismatched_sessions = df.groupby('session_id').apply(lambda g: check_event_starts(g, event_name))


In [10]:
len(df[df['index']==0])

471

In [11]:
df['event_name'].nunique()

30

In [29]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Adjusted function to skip 'egg_hatched' or 'nest_complete' events in each session
def convert_to_lstm_input_format_adjusted(df):
    # Initialize the one-hot encoder for event names, excluding the ones to be removed
    valid_events = df[~df['event_name'].isin(['egg_hatched', 'nest_complete'])]['event_name'].unique()
    event_encoder = OneHotEncoder()
    event_encoder.fit(valid_events.reshape(-1, 1))

    sessions = df['session_id'].unique()
    all_sequences = []
    all_labels = []

    for session in sessions:
        session_data = df[df['session_id'] == session].sort_values('timestamp')
        
        sequence = []

        for _, row in session_data.iterrows():
            # Skip the 'egg_hatched' or 'nest_complete' events
            if row['event_name'] in ['egg_hatched', 'nest_complete']:
                continue

            game_state = json.loads(row['game_state'])
            
            features = [
                game_state.get('posX', 0),
                game_state.get('posY', 0),
                game_state.get('posZ', 0),
                game_state.get('rotW', 1),
                game_state.get('rotX', 0),
                game_state.get('rotY', 0),
                game_state.get('rotZ', 0),
                game_state.get('seconds_from_launch', 0)
            ]
            
            if row['event_name'] in valid_events:  # Check if event is in the list of valid events
                event_encoded = event_encoder.transform([[row['event_name']]]).toarray()
                features.extend(event_encoded.flatten().tolist())
                sequence.append(features)
        
        if not sequence:  # Skip sessions that end up with no data after filtering
            continue

        # Normalize/Standardize the sequence features
        scaler = StandardScaler()
        sequence = np.array(sequence)
        sequence[:, :8] = scaler.fit_transform(sequence[:, :8])

        label = 1  # Assuming default label (adjust according to your logic)
        
        all_sequences.append(sequence)
        all_labels.append(label)

    return all_sequences, all_labels


In [30]:
all_sequences, all_labels = convert_to_lstm_input_format(df=df_sorted)

In [31]:
all_labels.count(1)/len(all_labels)

0.673036093418259

In [32]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [33]:
# Assuming sequences and labels are the output from convert_to_lstm_input_format(df)
# Convert sequences to tensors and pad them
sequences_padded = pad_sequence([torch.tensor(s) for s in all_sequences], batch_first=True)
labels_tensor = torch.tensor(all_labels)

# Create a Dataset and DataLoader for batching
dataset = TensorDataset(sequences_padded, labels_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [34]:
sequences_padded.shape

torch.Size([471, 14406, 38])

In [35]:
# Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, lengths):
        # Pack the batch
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, (ht, ct) = self.lstm(packed_input)
        out = self.dropout(ht[-1])
        return self.fc(out)

In [36]:
# Determine the size of the inputs
input_size = sequences_padded.size(2) # Number of features
hidden_size = 64 # Can be tuned
num_layers = 1 # Can be tuned
num_classes = 1 # Binary classification

# Instantiate the model
model = LSTMClassifier(input_size, hidden_size, num_layers, num_classes)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [37]:
# Function to calculate the accuracy
def binary_accuracy(preds, y):
    # Round predictions to the closest integer (0 or 1)
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

def calculate_lengths(sequences_batch):
    # Calculate lengths based on the assumption that padding value is zero
    lengths = (sequences_batch != 0).sum(dim=2)  # Sum over the feature dimension to get non-zero feature vectors
    lengths = lengths.max(dim=1).values  # Find the maximum length in the feature vectors
    # Sort the lengths in descending order and get sorting indices
    lengths, sorted_idx = lengths.sort(descending=True)
    return lengths, sorted_idx



In [38]:
# Training loop
num_epochs = 10 # Can be tuned
for epoch in range(num_epochs):
    for i, (sequences_batch, labels_batch) in enumerate(dataloader):
        sequences_batch, labels_batch = sequences_batch, labels_batch
        sequences_batch = sequences_batch.float()
        labels_batch = labels_batch.float()
        lengths, sorted_idx = calculate_lengths(sequences_batch)
    
        # You need to sort sequences_batch and labels_batch based on sorted_idx
        sequences_batch = sequences_batch[sorted_idx]
        labels_batch = labels_batch[sorted_idx]
        # Forward pass
        outputs = model(sequences_batch, lengths)
    
        loss = criterion(outputs.view(-1), labels_batch.float())
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 10 == 0:
            acc = binary_accuracy(outputs.view(-1), labels_batch.float())
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}, Accuracy: {acc:.2f}')

# You would also want to include validation and possibly early stopping in the training loop


Epoch [1/10], Step [10/15], Loss: 0.6365, Accuracy: 0.72
Epoch [2/10], Step [10/15], Loss: 0.6065, Accuracy: 0.62
Epoch [3/10], Step [10/15], Loss: 0.5355, Accuracy: 0.72
Epoch [4/10], Step [10/15], Loss: 0.3890, Accuracy: 0.88
Epoch [5/10], Step [10/15], Loss: 0.5091, Accuracy: 0.72
Epoch [6/10], Step [10/15], Loss: 0.5299, Accuracy: 0.69
Epoch [7/10], Step [10/15], Loss: 0.3798, Accuracy: 0.81
Epoch [8/10], Step [10/15], Loss: 0.4583, Accuracy: 0.81
Epoch [9/10], Step [10/15], Loss: 0.2546, Accuracy: 0.91
Epoch [10/10], Step [10/15], Loss: 0.3480, Accuracy: 0.88
