In [1]:
import os

# Get the current working directory
current_directory = os.getcwd()

print("Current Directory:", current_directory)


Current Directory: /content


In [2]:
import os
import pandas as pd
import re  # For regular expressions

# Specify the folder containing CSV files
folder_path = '/content/sample_data'

# Columns to delete
columns_to_delete = ['key_id', 'key_id.1', 'operation', 'pX', 'pY', 'LR', 'state', 'delta','key']

# Dictionary to store DataFrames for each user
user_data = {}

# Helper function to extract numeric part of the filename
def extract_numeric(file_name):
    match = re.match(r"(\d+)", os.path.splitext(file_name)[0])  # Match numeric prefix
    return int(match.group(1)) if match else float('inf')  # Return inf for non-numeric names

# Get and sort file names numerically
file_names = sorted(
    [file_name for file_name in os.listdir(folder_path) if file_name.endswith('.csv')],
    key=extract_numeric
)
# Loop through sorted files
for file_name in file_names:
    # Extract numeric prefix as user key
    user_key = extract_numeric(file_name)

    # Skip files that don't have a numeric prefix
    if user_key == float('inf'):
        continue
# Load the CSV into a DataFrame
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)
# Drop the specified columns
    df = df.drop(columns=columns_to_delete, errors='ignore')

    # Add the user_id column
    df['user_id'] = user_key


    # Store the modified DataFrame in the dictionary
    user_data[user_key] = df

# Display the first few rows of the updated DataFrames in numeric order
for user in sorted(user_data.keys()):  # Ensure numeric order in output
    print(f"Updated Data for User {user}:")
    print(user_data[user].head())  # Display the first few rows
    print("-" * 50)

Updated Data for User 1:
   time_diff  time_since_beginning  press_to_press  release_to_press  \
0      0.140                 0.281           0.187             0.140   
1      1.217                 1.685           0.156             1.217   
2      0.546                 2.387           0.172             0.546   
3      0.577                 3.136           0.156             0.577   
4      0.094                 3.386           0.234             0.094   

   hold_time  user_id  
0      0.281        1  
1      1.404        1  
2      0.702        1  
3      0.749        1  
4      0.250        1  
--------------------------------------------------
Updated Data for User 2:
   time_diff  time_since_beginning  press_to_press  release_to_press  \
0      0.172                 0.172           0.358            -0.124   
1      0.110                 0.406           0.358            -0.124   
2      0.125                 0.655           0.297            -0.109   
3      0.124                 1.076

In [3]:
# Display the number of missing values for each user
print("Summary of Missing Values for Each User:")
print("-" * 50)
for user, df in user_data.items():
    # Calculate the total number of missing values in the DataFrame
    total_missing = df.isnull().sum().sum()
    print(f"User {user} has {total_missing} missing values.")
print("-" * 50)

Summary of Missing Values for Each User:
--------------------------------------------------
User 1 has 0 missing values.
User 2 has 0 missing values.
User 3 has 0 missing values.
User 4 has 0 missing values.
User 5 has 0 missing values.
User 6 has 0 missing values.
User 7 has 0 missing values.
User 8 has 0 missing values.
User 9 has 0 missing values.
User 10 has 0 missing values.
User 11 has 0 missing values.
User 12 has 0 missing values.
User 13 has 0 missing values.
User 14 has 0 missing values.
User 15 has 0 missing values.
User 16 has 0 missing values.
User 18 has 0 missing values.
User 19 has 0 missing values.
User 20 has 0 missing values.
User 21 has 0 missing values.
User 22 has 0 missing values.
User 23 has 0 missing values.
User 24 has 0 missing values.
User 25 has 0 missing values.
User 26 has 0 missing values.
User 27 has 0 missing values.
User 28 has 0 missing values.
User 29 has 0 missing values.
User 30 has 0 missing values.
User 31 has 0 missing values.
User 32 has 0 mis

In [4]:
# Handle missing values: Replace all missing values with zero
for user, df in user_data.items():
    # Replace all missing values with zero
    df.fillna(0, inplace=True)

    # Store the updated DataFrame in the dictionary
    user_data[user] = df

    print(f"Missing values handled for User {user}. Updated DataFrame:")
    print(user_data[user].head())  # Display first few rows after handling missing values
    print("-" * 50)


# Display the number of missing values after handling
print("Summary of Missing Values for Each User (After Handling):")
print("-" * 50)
for user, df in user_data.items():
    # Calculate the total number of missing values in the DataFrame
    total_missing = df.isnull().sum().sum()
    print(f"User {user} has {total_missing} missing values.")
print("-" * 50)

Missing values handled for User 1. Updated DataFrame:
   time_diff  time_since_beginning  press_to_press  release_to_press  \
0      0.140                 0.281           0.187             0.140   
1      1.217                 1.685           0.156             1.217   
2      0.546                 2.387           0.172             0.546   
3      0.577                 3.136           0.156             0.577   
4      0.094                 3.386           0.234             0.094   

   hold_time  user_id  
0      0.281        1  
1      1.404        1  
2      0.702        1  
3      0.749        1  
4      0.250        1  
--------------------------------------------------
Missing values handled for User 2. Updated DataFrame:
   time_diff  time_since_beginning  press_to_press  release_to_press  \
0      0.172                 0.172           0.358            -0.124   
1      0.110                 0.406           0.358            -0.124   
2      0.125                 0.655           0.2

In [5]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Define a MinMaxScaler instance
scaler = MinMaxScaler()

# Normalize the data for each user
for user, df in user_data.items():
    # Identify columns to exclude
    excluded_columns = ['key', 'user_id']
    columns_to_normalize = [col for col in df.columns if col not in excluded_columns]

    # Replace inf and -inf with NaN, then fill NaN with zero
    df[columns_to_normalize] = df[columns_to_normalize].replace([np.inf, -np.inf], np.nan).fillna(0)

    # Clip extremely large values to a reasonable range (e.g., based on domain knowledge)
    df[columns_to_normalize] = df[columns_to_normalize].clip(lower=-1e6, upper=1e6)

    # Apply MinMaxScaler only to the selected columns
    df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

    # Update the DataFrame in the dictionary
    user_data[user] = df

    print(f"Normalization applied for User {user}. Updated DataFrame:")
    print(user_data[user].head())  # Display first few rows after normalization
    print("-" * 50)
    # Confirm the normalization
print("Normalization Summary:")
print("-" * 50)
for user, df in user_data.items():
    print(f"User {user}:")
    print(df.describe().transpose()[['min', 'max']])  # Check min and max values
    print("-" * 50)



Normalization applied for User 1. Updated DataFrame:
   time_diff  time_since_beginning  press_to_press  release_to_press  \
0   0.011699              0.000000        0.291322          0.043138   
1   0.145737              0.003075        0.227273          0.172912   
2   0.062228              0.004613        0.260331          0.092059   
3   0.066086              0.006253        0.227273          0.095795   
4   0.005974              0.006801        0.388430          0.037595   

   hold_time  user_id  
0   0.028736        1  
1   0.166646        1  
2   0.080437        1  
3   0.086209        1  
4   0.024929        1  
--------------------------------------------------
Normalization applied for User 2. Updated DataFrame:
   time_diff  time_since_beginning  press_to_press  release_to_press  \
0   0.034807              0.000000        0.475610          0.063044   
1   0.017680              0.000647        0.475610          0.063044   
2   0.021823              0.001335        0.382622

In [6]:
print("Available user IDs:", list(user_data.keys()))


Available user IDs: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117]


In [None]:

import pandas as pd
import numpy as np

# Runtime input for processing and corrupted user IDs
processing_user_id = input("Enter the processing user_id: ").strip()
corrupted_user_ids = input("Enter the list of corrupted user_ids (comma-separated): ").strip().split(',')

# Convert IDs to integers (or strings, depending on user_data keys)
try:
    processing_user_id = int(processing_user_id)  # Adjust if user_data keys are strings
    corrupted_user_ids = [int(uid) for uid in corrupted_user_ids]
except ValueError:
    raise ValueError("Invalid input: User IDs should be numeric.")

# Debug: Print available user IDs
print("Available user IDs:", list(user_data.keys()))

# Check if processing_user_id exists
if processing_user_id not in user_data:
    raise KeyError(f"Processing user ID {processing_user_id} not found in user_data.")

# Check if all corrupted_user_ids exist
missing_ids = [uid for uid in corrupted_user_ids if uid not in user_data]
if missing_ids:
    raise KeyError(f"Corrupted user IDs not found in user_data: {missing_ids}")

# Hyperparameters
N0 = 50  # Number of events per observation
Nh = 10  # Hop size

# Function to create observations for a user with a simple observation ID
def create_observations(df, user_id, file_prefix):
    observations = []
    obs_counter = 1  # Counter to generate simple observation IDs
    for start in range(0, len(df) - N0 + 1, Nh):
        obs = df.iloc[start:start + N0].copy()  # No sorting, directly take rows in order
        obs_id = f"{user_id}_obs_{obs_counter}"  # Simple observation ID
        obs['obs_id'] = obs_id  # Assign simple ID to the observation
        observations.append(obs)
        obs_counter += 1  # Increment counter for the next observation

    # Combine all observations
    observations_df = pd.concat(observations, ignore_index=True)
    observations_df['user_id'] = user_id  # Add user ID for reference

    # Save observations to a CSV file
    filename = f"{file_prefix}_observations_{user_id}.csv"
    observations_df.to_csv(filename, index=False)
    print(f"Saved observations for User {user_id} to {filename}")
    return observations_df

# Function to calculate summary features (ensuring obs_id is included and in order)
def calculate_summary_features(observations_df):
    # Ensure the summary is calculated in the order of obs_id as in the original observations_df
    summary = observations_df.groupby('obs_id').agg(
        latency=('time_since_beginning', lambda x: x.iloc[-1] - x.iloc[0]),
        typing_speed=('obs_id', 'count'),
        std_time_diff=('time_diff', 'std')
    ).reset_index()

    # Reorder summary to match the order of obs_id in the original observations
    summary = summary.set_index('obs_id').reindex(observations_df['obs_id'].unique()).reset_index()

    # Calculate typing speed
    summary['typing_speed'] /= summary['latency']  # Calculate typing speed
    return summary

# Process observations for the processing user ID
processing_df = user_data[processing_user_id]
processing_observations = create_observations(processing_df, processing_user_id, "processing")

# Process observations for corrupted user IDs
corrupted_observations = []
for user_id in corrupted_user_ids:
    user_df = user_data[user_id]
    corrupted_obs = create_observations(user_df, user_id, "corrupted")
    corrupted_observations.append(corrupted_obs)

# Combine all corrupted observations into a single DataFrame
corrupted_observations_df = pd.concat(corrupted_observations, ignore_index=True)

# Calculate summary features for both processing and corrupted users
processing_summary = calculate_summary_features(processing_observations)
corrupted_summary = calculate_summary_features(corrupted_observations_df)

# Save summary features to CSV files
processing_summary.to_csv("processing_summary_features.csv", index=False)
corrupted_summary.to_csv("corrupted_summary_features.csv", index=False)
print("Saved summary features for processing and corrupted users.")

# Remove 'time_since_beginning' column from observation files
processing_observations = processing_observations.drop(columns=['time_since_beginning'])
corrupted_observations_df = corrupted_observations_df.drop(columns=['time_since_beginning'])





Enter the processing user_id: 5
Enter the list of corrupted user_ids (comma-separated): 12,114,115,78,50,45,30,20,80,100
Available user IDs: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117]
Saved observations for User 5 to processing_observations_5.csv
Saved observations for User 12 to corrupted_observations_12.csv
Saved observations for User 114 to corrupted_observations_114.csv
Saved observations for User 115 to corrupted_observations_115.csv
Saved observations for User 78 to corrupted_observations_78.csv
Saved observations for User 50 to corrupted_observations_50.cs

In [None]:
# Inspect column names in the processing observations file
print("Columns in the processing observations file:")
print(processing_observations.columns.tolist())

# Inspect column names in the corrupted observations file
print("\nColumns in the corrupted observations file:")
print(corrupted_observations_df.columns.tolist())


Columns in the processing observations file:
['time_diff', 'press_to_press', 'release_to_press', 'hold_time', 'user_id', 'obs_id']

Columns in the corrupted observations file:
['time_diff', 'press_to_press', 'release_to_press', 'hold_time', 'user_id', 'obs_id']


In [None]:
# Count the number of unique user IDs and unique observations in processing_observations
processing_user_counts = processing_observations.groupby('user_id').agg(
    num_unique_observations=('obs_id', 'nunique')
).reset_index()

# Count the number of unique user IDs and unique observations in corrupted_observations
corrupted_user_counts = corrupted_observations_df.groupby('user_id').agg(
    num_unique_observations=('obs_id', 'nunique')
).reset_index()

# Print results
print("Number of unique observations per user in processing_observations:")
print(processing_user_counts)

print("\nNumber of unique observations per user in corrupted_observations:")
print(corrupted_user_counts)


Number of unique observations per user in processing_observations:
   user_id  num_unique_observations
0        5                        1

Number of unique observations per user in corrupted_observations:
   user_id  num_unique_observations
0       12                        1
1       20                        1
2       30                        1
3       50                        1
4      114                        1


In [None]:
# To get the observation IDs for each user in the processing observations
print("Observation IDs for each user in processing observations:")
for user_id in processing_observations['user_id'].unique():
    user_observations = processing_observations[processing_observations['user_id'] == user_id]
    print(f"User {user_id}:")
    print(user_observations['obs_id'].tolist())

# To get the observation IDs for each user in the corrupted observations
print("Observation IDs for each user in corrupted observations:")
for user_id in corrupted_observations_df['user_id'].unique():
    user_observations = corrupted_observations_df[corrupted_observations_df['user_id'] == user_id]
    print(f"User {user_id}:")
    print(user_observations['obs_id'].tolist())


Observation IDs for each user in processing observations:
User 5:
[np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5), np.int64(5

In [None]:
# Assuming processing_observations is already created and contains the necessary data

# Get the first two unique observation IDs
unique_obs_ids = processing_observations['obs_id'].unique()[:2]

# Filter the data based on these unique observation IDs and print them
for obs_id in unique_obs_ids:
    observation = processing_observations[processing_observations['obs_id'] == obs_id]
    print(f"Observation ID: {obs_id}")
    print(observation)
    print("-" * 50)  # Separator for clarity


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3051   0.027706    1.459108e-01          0.109937   0.025000        5      5
3052   0.014719    1.459108e-01          0.098661   0.061719        5      5
3053   0.013853    8.828996e-02          0.109232   0.073437        5      5
3054   0.013853    1.486989e-02          0.197322   0.122656        5      5
3055   0.027706    5.947955e-02          0.208598   0.073437        5      5
3056   0.054545    8.736059e-02          0.230444   0.135156        5      5
3057   0.014719    1.486989e-02          0.198027   0.122656        5      5
3058   0.041558    7.249071e-02          0.219873   0.085937        5      5
3059   0.014719    1.486989e-02          0.198027   0.110156        5      5
3060   0.095238    7.342007e-02          0.263566   0.134375        5      5
3061   0.027706    5.855019e-02          0.208598   0.122656        5      5
3062   0.027706    1.310409e-01          0.208598   0.110156        5      5
3063   0.06

In [None]:
import pandas as pd

# Load the summary feature files
processing_summary = pd.read_csv("processing_summary_features.csv")
corrupted_summary = pd.read_csv("corrupted_summary_features.csv")

# Display the entire content of both files
# Set display options to show the entire DataFrame
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust width to prevent truncation
pd.set_option('display.max_colwidth', None)  # No truncation of column content

# Display the processing summary
print("Processing Summary Features:")
print(processing_summary)

# Display the corrupted summary
print("\nCorrupted Summary Features:")
print(corrupted_summary)


Processing Summary Features:
        obs_id   latency  typing_speed  std_time_diff
0      5_obs_1  0.028464   1756.600547       0.070211
1      5_obs_2  0.027204   1837.956242       0.067192
2      5_obs_3  0.026381   1895.313653       0.066493
3      5_obs_4  0.024818   2014.686765       0.037311
4      5_obs_5  0.024732   2021.710526       0.036255
5      5_obs_6  0.023602   2118.436248       0.034041
6      5_obs_7  0.025471   1962.988644       0.079958
7      5_obs_8  0.024990   2000.784641       0.080345
8      5_obs_9  0.024515   2039.601770       0.081464
9     5_obs_10  0.024253   2061.588303       0.081578
10    5_obs_11  0.023778   2102.824892       0.082651
11    5_obs_12  0.021391   2337.413860       0.029795
12    5_obs_13  0.026467   1889.139344       0.101221
13    5_obs_14  0.026987   1852.731114       0.118678
14    5_obs_15  0.031585   1583.044206       0.132106
15    5_obs_16  0.033106   1510.295724       0.130251
16    5_obs_17  0.033932   1473.528689       0.130277

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Function to normalize summary features
def normalize_summary_features(summary_df, file_prefix):
    # Select columns to normalize
    columns_to_normalize = ['latency', 'typing_speed', 'std_time_diff']

    # Initialize Min-Max Scaler
    scaler = MinMaxScaler()

    # Apply Min-Max normalization to the selected columns
    summary_df[columns_to_normalize] = scaler.fit_transform(summary_df[columns_to_normalize])

    # Save the normalized summary to a CSV file
    filename = f"{file_prefix}_summary_features_normalized.csv"
    summary_df.to_csv(filename, index=False)
    print(f"Saved normalized summary features to {filename}")
    return summary_df

# Normalize summary features for processing and corrupted files
processing_summary_normalized = normalize_summary_features(processing_summary, "processing")
corrupted_summary_normalized = normalize_summary_features(corrupted_summary, "corrupted")


Saved normalized summary features to processing_summary_features_normalized.csv
Saved normalized summary features to corrupted_summary_features_normalized.csv


In [None]:
# Function to count missing values in the DataFrame
def count_missing_values(df, filename):
    missing_values = df.isnull().sum()
    print(f"Missing values in {filename}:")
    print(missing_values)

# Count missing values in the normalized summary DataFrames
count_missing_values(processing_summary_normalized, "processing_summary_normalized")
count_missing_values(corrupted_summary_normalized, "corrupted_summary_normalized")


Missing values in processing_summary_normalized:
obs_id           0
latency          0
typing_speed     0
std_time_diff    0
dtype: int64
Missing values in corrupted_summary_normalized:
obs_id           0
latency          0
typing_speed     0
std_time_diff    0
dtype: int64


In [None]:
import pandas as pd

# Load the summary feature files
processing_summary_normalized = pd.read_csv("processing_summary_features_normalized.csv")
corrupted_summary_normalized = pd.read_csv("corrupted_summary_features_normalized.csv")

# Display the entire content of both files
# Set display options to show the entire DataFrame
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust width to prevent truncation
pd.set_option('display.max_colwidth', None)  # No truncation of column content

# Display the processing summary
print("Processing Summary Features:")
print(processing_summary_normalized)

# Display the corrupted summary
print("\nCorrupted Summary Features:")
print(corrupted_summary_normalized)

Processing Summary Features:
        obs_id   latency  typing_speed  std_time_diff
0      5_obs_1  0.054485      0.670194       0.401638
1      5_obs_2  0.046203      0.707376       0.377681
2      5_obs_3  0.040791      0.733590       0.372135
3      5_obs_4  0.030515      0.788148       0.140565
4      5_obs_5  0.029948      0.791358       0.132183
5      5_obs_6  0.022525      0.835565       0.114616
6      5_obs_7  0.034812      0.764520       0.478984
7      5_obs_8  0.031649      0.781794       0.482060
8      5_obs_9  0.028522      0.799535       0.490938
9     5_obs_10  0.026804      0.809583       0.491846
10    5_obs_11  0.023677      0.828430       0.500361
11    5_obs_12  0.007990      0.935644       0.080916
12    5_obs_13  0.041357      0.730769       0.647721
13    5_obs_14  0.044776      0.714129       0.786250
14    5_obs_15  0.074999      0.590873       0.892807
15    5_obs_16  0.085000      0.557625       0.878092
16    5_obs_17  0.090430      0.540821       0.878297

In [None]:
# Split observations into train and test sets
def split_observations(observations_df, train_ratio=0.7):
    # Calculate the split index
    split_index = int(len(observations_df) * train_ratio)

    # Split into train and test sets based on natural order of occurrence
    train_obs = observations_df.iloc[:split_index]
    test_obs = observations_df.iloc[split_index:]

    # Save the train and test observations to separate files
    train_filename = "processing_train_obs.csv"
    test_filename = "processing_test_obs.csv"
    train_obs.to_csv(train_filename, index=False)
    test_obs.to_csv(test_filename, index=False)

    print(f"Saved training observations to {train_filename} and testing observations to {test_filename}")
    return train_obs, test_obs

# Perform the split on the processing observations
processing_train_obs, processing_test_obs = split_observations(processing_observations)


Saved training observations to processing_train_obs.csv and testing observations to processing_test_obs.csv


In [None]:
# Function to display the count of unique observation IDs
def display_unique_obs_count(filename):
    # Load the CSV file
    df = pd.read_csv(filename)

    # Get the count of unique observation IDs
    unique_obs_count = df['obs_id'].nunique()
    print(f"Unique observation IDs in {filename}: {unique_obs_count}")
    return unique_obs_count

# Display the count of unique observation IDs in both train and test files
display_unique_obs_count("processing_train_obs.csv")
display_unique_obs_count("processing_test_obs.csv")


Unique observation IDs in processing_train_obs.csv: 113
Unique observation IDs in processing_test_obs.csv: 49


49

In [None]:
# Save corrupted observations to a CSV file if not already saved
corrupted_observations_df.to_csv("corrupted_observations.csv", index=False)


In [None]:
import pandas as pd
import numpy as np

# Load the processing train observations and corrupted observations
processing_train_obs = pd.read_csv("processing_train_obs.csv")
corrupted_observations_df = pd.read_csv("corrupted_observations.csv")

# Corruption probability (50%)
corrupt_probability = 0.9

# Get the list of unique observation IDs in both datasets
processing_obs_ids = processing_train_obs['obs_id'].unique()
corrupted_obs_ids = corrupted_observations_df['obs_id'].unique()

# Randomly select observations to replace based on corruption probability
replace_indices = np.random.choice(len(processing_obs_ids), size=int(len(processing_obs_ids) * corrupt_probability), replace=False)

# Initialize a list to store the final observations
final_observations = []

# Track replaced observation IDs and their corresponding corrupted observation IDs
replaced_obs_ids = []
corrupted_obs_used = []

# Iterate over the processing observations and replace if selected
for i, obs_id in enumerate(processing_obs_ids):
    # Check if the observation is to be replaced
    if i in replace_indices:
        # Select a random corrupted observation with the same observation ID
        corrupted_obs = corrupted_observations_df[corrupted_observations_df['obs_id'] == np.random.choice(corrupted_obs_ids)]
        final_observations.append(corrupted_obs)

        # Track replaced observation ID and corrupted observation ID
        replaced_obs_ids.append(obs_id)
        corrupted_obs_used.append(corrupted_obs['obs_id'].iloc[0])  # Assuming all rows in corrupted_obs have the same ID
    else:
        # Keep the original observation
        original_obs = processing_train_obs[processing_train_obs['obs_id'] == obs_id]
        final_observations.append(original_obs)

# Concatenate the final observations
final_observations_df = pd.concat(final_observations, ignore_index=True)

# Save the final observations to a CSV file
final_observations_df.to_csv("final_obs.csv", index=False)

# Display the number of unique observation IDs that were replaced
replaced_obs_df = pd.DataFrame({
    'Replaced Observation ID': replaced_obs_ids,
    'Corrupted Observation ID': corrupted_obs_used
})

# Display the count of replaced observation IDs
replaced_obs_count = replaced_obs_df['Replaced Observation ID'].nunique()
print(f"Number of unique observation IDs replaced: {replaced_obs_count}")

# Display the table with replaced observation IDs and their corresponding corrupted IDs
print("\nTable of replaced observations:")
print(replaced_obs_df)



Number of unique observation IDs replaced: 101

Table of replaced observations:
    Replaced Observation ID Corrupted Observation ID
0                   5_obs_1               12_obs_189
1                   5_obs_2               114_obs_52
2                   5_obs_3              114_obs_127
3                   5_obs_4                20_obs_25
4                   5_obs_5                30_obs_50
5                   5_obs_6               100_obs_76
6                   5_obs_7               80_obs_159
7                   5_obs_8              100_obs_115
8                   5_obs_9              100_obs_123
9                  5_obs_10               20_obs_140
10                 5_obs_11               12_obs_181
11                 5_obs_12                20_obs_39
12                 5_obs_14                12_obs_34
13                 5_obs_16                78_obs_93
14                 5_obs_17               50_obs_106
15                 5_obs_18                45_obs_81
16                 

In [None]:
# Load the final observations file
final_obs_df = pd.read_csv("final_obs.csv")

# Check the columns to ensure we have 'obs_id' and 'user_id'
print(final_obs_df.columns)

# Group by 'obs_id' and 'user_id' to get the unique observation IDs and their respective corrupted user IDs
obs_user_count = final_obs_df.groupby(['obs_id', 'user_id']).size().reset_index(name='count')

# Display the result
print("Count of unique observation IDs and their respective corrupted user IDs:")
print(obs_user_count)

# To check how many observations from corrupted users are present
corrupted_obs_count = obs_user_count[obs_user_count['user_id'].isin(corrupted_user_ids)]

# Print the count of corrupted observations
print(f"\nCount of corrupted observations in the final dataset:")
print(corrupted_obs_count)


Index(['time_diff', 'press_to_press', 'release_to_press', 'hold_time', 'user_id', 'obs_id'], dtype='object')
Count of unique observation IDs and their respective corrupted user IDs:
          obs_id  user_id  count
0    100_obs_115      100     50
1    100_obs_122      100     50
2    100_obs_123      100     50
3     100_obs_13      100     50
4    100_obs_145      100     50
5    100_obs_148      100     50
6     100_obs_76      100     50
7     100_obs_99      100     50
8     114_obs_11      114     50
9    114_obs_127      114     50
10   114_obs_135      114     50
11    114_obs_43      114     50
12    114_obs_52      114     50
13     114_obs_6      114     50
14    114_obs_65      114     50
15    114_obs_80      114     50
16   115_obs_121      115     50
17   115_obs_136      115    100
18   115_obs_141      115     50
19    115_obs_33      115     50
20    115_obs_53      115     50
21    115_obs_67      115     50
22    115_obs_86      115     50
23    12_obs_100       12 

In [None]:
# Check if the 'user_id' column exists in the dataset
if 'user_id' in final_obs_df.columns:
    print("The 'user_id' column exists in the dataset.")
else:
    print("The 'user_id' column does not exist in the dataset.")


The 'user_id' column exists in the dataset.


In [None]:
import pandas as pd

# Load the final observations
final_observations_df = pd.read_csv("final_obs.csv")

# Sort the DataFrame based on the 'obs_id' to maintain the original order
sorted_final_obs_df = final_observations_df.sort_values(by='obs_id').reset_index(drop=True)

# Print the order of each observation with its index
print("Order of observations in final_obs.csv:")
for index, row in sorted_final_obs_df.iterrows():
    print(f"Index: {index}, Observation ID: {row['obs_id']}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index: 650, Observation ID: 114_obs_6
Index: 651, Observation ID: 114_obs_6
Index: 652, Observation ID: 114_obs_6
Index: 653, Observation ID: 114_obs_6
Index: 654, Observation ID: 114_obs_6
Index: 655, Observation ID: 114_obs_6
Index: 656, Observation ID: 114_obs_6
Index: 657, Observation ID: 114_obs_6
Index: 658, Observation ID: 114_obs_6
Index: 659, Observation ID: 114_obs_6
Index: 660, Observation ID: 114_obs_6
Index: 661, Observation ID: 114_obs_6
Index: 662, Observation ID: 114_obs_6
Index: 663, Observation ID: 114_obs_6
Index: 664, Observation ID: 114_obs_6
Index: 665, Observation ID: 114_obs_6
Index: 666, Observation ID: 114_obs_6
Index: 667, Observation ID: 114_obs_6
Index: 668, Observation ID: 114_obs_6
Index: 669, Observation ID: 114_obs_6
Index: 670, Observation ID: 114_obs_6
Index: 671, Observation ID: 114_obs_6
Index: 672, Observation ID: 114_obs_6
Index: 673, Observation ID: 114_obs_6
Index: 674, Observation

In [None]:
# Load the final observations file
final_obs_df = pd.read_csv("final_obs.csv")

# Check the columns to ensure we have 'obs_id' and 'user_id'
print(final_obs_df.columns)

# Get the total number of unique observation IDs
total_unique_obs_ids = final_obs_df['obs_id'].nunique()
print(f"Total unique observation IDs in final observations: {total_unique_obs_ids}")

# Get the unique observation IDs corresponding to the processing user
processing_obs_ids = final_obs_df[final_obs_df['user_id'] == processing_user_id]['obs_id'].nunique()
print(f"Unique observation IDs from processing user in final observations: {processing_obs_ids}")

# Get the unique observation IDs corresponding to corrupted users
corrupted_obs_ids = final_obs_df[final_obs_df['user_id'].isin(corrupted_user_ids)]['obs_id'].nunique()
print(f"Unique observation IDs from corrupted users in final observations: {corrupted_obs_ids}")


Index(['time_diff', 'press_to_press', 'release_to_press', 'hold_time', 'user_id', 'obs_id'], dtype='object')
Total unique observation IDs in final observations: 110
Unique observation IDs from processing user in final observations: 12
Unique observation IDs from corrupted users in final observations: 98


In [None]:
!pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers

# Load the final observations from the CSV file
final_observations_df = pd.read_csv("final_obs.csv")

# Group by observation ID to ensure we preserve the order of events
grouped = final_observations_df.groupby('obs_id')

# Initialize a list to store 2D vectors (M x N) for each observation
observation_vectors = []

# Convert each observation's events into a 2D vector (M x N)
for obs_id, group in grouped:
    # The events should already be in order as they were preserved from the final_obs file.
    # We don't need sorting by 'time_since_beginning' because the order is maintained.

    # Extract event features (exclude non-numeric columns like 'obs_id', 'user_id')
    events = group.drop(columns=['obs_id', 'user_id']).values  # Adjust based on your actual columns

    # M is the number of events, and N is the number of features per event
    observation_vectors.append(events)

# Now observation_vectors is a list of M x N matrices (one for each observation)
# Convert the list of 2D vectors into a 2D array
observation_vectors_combined = np.concatenate(observation_vectors, axis=0)

# Scale the data to standardize it
scaler = StandardScaler()
observation_vectors_scaled = scaler.fit_transform(observation_vectors_combined)

# Define Autoencoder Model
input_dim = observation_vectors_scaled.shape[1]  # Number of features
encoding_dim = 5  # Reduced number of features

# Input layer
input_layer = Input(shape=(input_dim,))
# Encoder layer
encoded = Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l2(0.01))(input_layer)
# Decoder layer (reconstruction part, not necessary for final result but required for autoencoder structure)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

# Autoencoder model
autoencoder = Model(inputs=input_layer, outputs=decoded)

# Encoder model to get the encoded features
encoder = Model(inputs=input_layer, outputs=encoded)

# Compile and train the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(observation_vectors_scaled, observation_vectors_scaled, epochs=50, batch_size=256, shuffle=True)

# Get the reduced features (5 components) from the encoder
encoded_features = encoder.predict(observation_vectors_scaled)

# Create a DataFrame for the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=[f'Feature{i+1}' for i in range(encoding_dim)])

# Add the corresponding observation IDs and user IDs to the encoded DataFrame
observation_ids = [obs_id for obs_id, group in grouped for _ in range(len(group))]
user_ids = final_observations_df['user_id'].values  # Assuming 'user_id' exists in the original data
encoded_df['obs_id'] = observation_ids
encoded_df['user_id'] = user_ids

# Reorder the columns so that 'obs_id' and 'user_id' are the first columns
encoded_df = encoded_df[['obs_id', 'user_id'] + [f'Feature{i+1}' for i in range(encoding_dim)]]

# Display the entire encoded data as a table
print("\nComplete Autoencoded Data with Observation and User IDs:")
print(encoded_df)

# Save the encoded results to a CSV file
encoded_df.to_csv("final_obs_pca_7_components_with_obs_id.csv", index=False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
650     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
651     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
652     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
653     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
654     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
655     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
656     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
657     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
658     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
659     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
660     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
661     114_obs_6       12  0.000000  0.000000  0.000000  0.000000  0.000000
662     114

In [None]:
# Load the PCA results CSV file
pca_result_df = pd.read_csv('final_obs_pca_7_components_with_obs_id.csv')

# Check if the 'user_id' column exists in the new PCA DataFrame
if 'user_id' in pca_result_df.columns:
    print("The 'user_id' column exists in the PCA results file.")
else:
    print("The 'user_id' column does not exist in the PCA results file.")


The 'user_id' column exists in the PCA results file.


In [None]:

# Load the summary feature files
processing_summary_normalized = pd.read_csv("processing_summary_features_normalized.csv")
corrupted_summary_normalized = pd.read_csv("corrupted_summary_features_normalized.csv")

# Print the columns of both dataframes
print("Processing Summary Columns:", processing_summary_normalized.columns)
print("Corrupted Summary Columns:", corrupted_summary_normalized.columns)


Processing Summary Columns: Index(['obs_id', 'latency', 'typing_speed', 'std_time_diff'], dtype='object')
Corrupted Summary Columns: Index(['obs_id', 'latency', 'typing_speed', 'std_time_diff'], dtype='object')


In [None]:
import pandas as pd

# Load the PCA data with observation IDs
final_obs_pca_df = pd.read_csv("final_obs_pca_7_components_with_obs_id.csv")

# Load the summary feature files
processing_summary_normalized = pd.read_csv("processing_summary_features_normalized.csv")
corrupted_summary_normalized = pd.read_csv("corrupted_summary_features_normalized.csv")

# Ensure the 'obs_id' is the index or column in both summary dataframes for easy lookup
# If 'obs_id' is not the index, we set it explicitly
processing_summary_normalized.set_index('obs_id', inplace=True)
corrupted_summary_normalized.set_index('obs_id', inplace=True)

# Initialize an empty list to store the summary features for each observation ID
final_data_with_summary = []

# Iterate over each observation ID in the PCA DataFrame
for _, row in final_obs_pca_df.iterrows():
    obs_id = row['obs_id']

    # Check if the observation ID exists in the processing summary or corrupted summary
    if obs_id in processing_summary_normalized.index:
        # Get the corresponding summary features from the processing summary
        summary_features = processing_summary_normalized.loc[obs_id]
    elif obs_id in corrupted_summary_normalized.index:
        # Get the corresponding summary features from the corrupted summary
        summary_features = corrupted_summary_normalized.loc[obs_id]
    else:
        # If no match is found, fill with NaNs (or handle it as appropriate)
        summary_features = pd.Series([np.nan] * processing_summary_normalized.shape[1], index=processing_summary_normalized.columns)

    # Combine the PCA components with the summary features
    combined_row = pd.concat([row, summary_features])

    # Append the combined row to the final data list
    final_data_with_summary.append(combined_row)

# Convert the list of combined rows back into a DataFrame
final_data_with_summary_df = pd.DataFrame(final_data_with_summary)

# Save the final DataFrame with PCA and summary features to a CSV
final_data_with_summary_df.to_csv("final_obs_with_summary_features.csv", index=False)

# Display the updated DataFrame
print("Updated DataFrame with PCA and Summary Features:")
print(final_data_with_summary_df)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
650       0.295391       0.434185  
651       0.295391       0.434185  
652       0.295391       0.434185  
653       0.295391       0.434185  
654       0.295391       0.434185  
655       0.295391       0.434185  
656       0.295391       0.434185  
657       0.295391       0.434185  
658       0.295391       0.434185  
659       0.295391       0.434185  
660       0.295391       0.434185  
661       0.295391       0.434185  
662       0.295391       0.434185  
663       0.295391       0.434185  
664       0.295391       0.434185  
665       0.295391       0.434185  
666       0.295391       0.434185  
667       0.295391       0.434185  
668       0.295391       0.434185  
669       0.295391       0.434185  
670       0.295391       0.434185  
671       0.295391       0.434185  
672       0.295391       0.434185  
673       0.295391       0.434185  
674       0.295391       0.434185  
675       0.295391       0.434185  

In [None]:
pip install scikit-learn




In [None]:
import sklearn
print(sklearn.__version__)


1.6.1


In [None]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("final_obs_with_summary_features.csv")

# Define the number of episodes
num_episodes = 30
episode_length = len(df) // num_episodes

# Create episodes with unique episode IDs
episodes = []
episode_id = 1

for i in range(0, len(df), episode_length):
    episode_data = df.iloc[i:i+episode_length].copy()
    episode_data['episode_id'] = episode_id
    episodes.append(episode_data)
    episode_id += 1

# Combine all episodes back into a single DataFrame
final_episodes = pd.concat(episodes)

# Save the episodes to a CSV file
final_episodes.to_csv("final_episodes.csv", index=False)

print("Episodes created successfully!")


Episodes created successfully!


In [None]:
pip install gym tensorflow scikit-learn


Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/721.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/721.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/721.7 kB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gym_notices>=0.0.4 (from gym)
  Downloading gym_notices-0.0.8-py3-none-any.whl.metadata (1.0 kB)
Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml) ... [?

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import random

# Function to calculate summary features
def calculate_summary_features(observations_df):
    summary = observations_df.groupby('obs_id').agg(
        latency=('time_since_beginning', lambda x: x.iloc[-1] - x.iloc[0]),
        typing_speed=('obs_id', 'count'),
        std_time_diff=('time_diff', 'std')
    ).reset_index()

    summary['typing_speed'] /= summary['latency']  # Calculate typing speed
    return summary

# Function to normalize summary features using MinMaxScaler
def normalize_summary_features(summary_df):
    scaler = MinMaxScaler()
    summary_df[['latency', 'typing_speed', 'std_time_diff']] = scaler.fit_transform(
        summary_df[['latency', 'typing_speed', 'std_time_diff']]
    )
    return summary_df

# Load the processing test observations
processing_test_obs = pd.read_csv('processing_test_obs.csv')

# Get user input for corrupted user IDs
corrupted_user_ids_input = input("Enter the list of corrupted user_ids (comma-separated): ").strip().split(',')
corrupted_user_ids = [int(uid) for uid in corrupted_user_ids_input]  # Convert to integers

# Check if user IDs are present
if 'user_id' not in processing_test_obs.columns:
    raise KeyError("user_id column is missing in the processing_test_obs.csv file.")

# Create corrupted test observations for the given corrupted user IDs
corrupted_observations = []
for user_id in corrupted_user_ids:
    user_df = user_data.get(user_id)  # Assuming user_data is a pre-defined dictionary
    if user_df is None:
        raise KeyError(f"User ID {user_id} not found in user_data.")

    corrupted_obs = create_observations(user_df, user_id, "corrupted")
    corrupted_observations.append(corrupted_obs)

# Combine all corrupted observations into a single DataFrame
corrupted_observations_df = pd.concat(corrupted_observations, ignore_index=True)

# Calculate summary features for the corrupted observations
corrupted_summary = calculate_summary_features(corrupted_observations_df)

# Normalize the summary features
corrupted_summary_normalized = normalize_summary_features(corrupted_summary)

# Save the normalized summary features to a CSV file
corrupted_summary_normalized.to_csv("corrupted_test_summary.csv", index=False)
print("Saved normalized summary features for corrupted test observations.")

# Remove 'time_since_beginning' from the corrupted test observations DataFrame
corrupted_observations_df = corrupted_observations_df.drop(columns=['time_since_beginning'])

# Save corrupted test observations (after dropping the column)
corrupted_observations_df.to_csv("corrupted_test_observations.csv", index=False)
print("Saved corrupted test observations after removing 'time_since_beginning' column.")

# Corrupt the processing test observations with a 20% probability
corruption_probability = 0.8
final_test_observations = processing_test_obs.copy()

# Replace observations with corrupted test observations based on the corruption probability
for i, row in final_test_observations.iterrows():
    if random.random() < corruption_probability:  # Corrupt with 20% probability
        corrupted_obs_idx = random.randint(0, len(corrupted_observations_df) - 1)
        corrupted_row = corrupted_observations_df.iloc[corrupted_obs_idx]
        final_test_observations.iloc[i] = corrupted_row

# Save the final test observations with corruptions
final_test_observations.to_csv("final_test_observations.csv", index=False)
print("Saved final test observations with corrupted data.")



Enter the list of corrupted user_ids (comma-separated): 12,114,50,30,20
Saved observations for User 12 to corrupted_observations_12.csv
Saved observations for User 114 to corrupted_observations_114.csv
Saved observations for User 50 to corrupted_observations_50.csv
Saved observations for User 30 to corrupted_observations_30.csv
Saved observations for User 20 to corrupted_observations_20.csv
Saved normalized summary features for corrupted test observations.
Saved corrupted test observations after removing 'time_since_beginning' column.
Saved final test observations with corrupted data.


In [None]:
import pandas as pd

# Load the final_test_observations.csv file
final_test_observations_df = pd.read_csv('final_test_observations.csv')

# Display the column names
print(final_test_observations_df.columns.tolist())


['time_diff', 'press_to_press', 'release_to_press', 'hold_time', 'user_id', 'obs_id']


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers

# Load the final test observations file
final_observations_df = pd.read_csv('final_test_observations.csv')

# Group by observation ID to ensure we preserve the order of events
grouped = final_observations_df.groupby('obs_id')

# Initialize a list to store 2D vectors (M x N) for each observation
observation_vectors = []

# Convert each observation's events into a 2D vector (M x N)
for obs_id, group in grouped:
    # Extract event features (exclude non-numeric columns like 'obs_id')
    events = group.drop(columns=['obs_id']).values  # Keep 'user_id' for final results

    # M is the number of events, and N is the number of features per event
    observation_vectors.append(events)

# Now observation_vectors is a list of M x N matrices (one for each observation)
# Convert the list of 2D vectors into a 2D array
observation_vectors_combined = np.concatenate(observation_vectors, axis=0)

# Scale the data to standardize it
scaler = StandardScaler()
observation_vectors_scaled = scaler.fit_transform(observation_vectors_combined)

# Define Autoencoder Model
input_dim = observation_vectors_scaled.shape[1]  # Number of features
encoding_dim = 5  # Reduced number of features (autoencoder output)

# Input layer
input_layer = Input(shape=(input_dim,))
# Encoder layer
encoded = Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l2(0.01))(input_layer)
# Decoder layer (not used in the final output but necessary for autoencoder structure)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

# Autoencoder model
autoencoder = Model(inputs=input_layer, outputs=decoded)

# Encoder model to get the encoded features
encoder = Model(inputs=input_layer, outputs=encoded)

# Compile and train the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(observation_vectors_scaled, observation_vectors_scaled, epochs=50, batch_size=256, shuffle=True)

# Get the reduced features (5 components) from the encoder
encoded_features = encoder.predict(observation_vectors_scaled)

# Create a DataFrame for the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=[f'Feature{i+1}' for i in range(encoding_dim)])

# Add the corresponding observation IDs and user IDs to the encoded DataFrame
observation_ids = [obs_id for obs_id, group in grouped for _ in range(len(group))]
user_ids = [user_id for user_id, group in grouped for user_id in group['user_id']]

# Add the user_id and obs_id columns
encoded_df['obs_id'] = observation_ids
encoded_df['user_id'] = user_ids

# Reorder the columns so that 'obs_id' and 'user_id' are the first columns
encoded_df = encoded_df[['obs_id', 'user_id'] + [f'Feature{i+1}' for i in range(encoding_dim)]]

# Save the autoencoder results to a CSV file
encoded_df.to_csv('final_test_pca_results.csv', index=False)

print("Autoencoder analysis completed and saved to 'final_test_pca_results.csv'.")


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 8.9251  
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 8.5674 
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 7.7912 
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 7.3831 
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 7.0911 
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6.2657 
Epoch 7/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6.4000 
Epoch 8/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 5.9608 
Epoch 9/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6.1030 
Epoch 10/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 5.7378

In [None]:
import pandas as pd

# Load the final_test_observations.csv file
final_test_observations_df = pd.read_csv('final_test_pca_results.csv')

# Display the column names
print(final_test_observations_df.columns.tolist())


['obs_id', 'user_id', 'Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']


In [None]:
import pandas as pd
import numpy as np

# Load the PCA data with observation IDs
final_test_pca_df = pd.read_csv("final_test_pca_results.csv")

# Load the summary feature files
processing_summary_normalized = pd.read_csv("processing_summary_features_normalized.csv")
corrupted_summary_normalized = pd.read_csv("corrupted_test_summary.csv")

# Ensure the 'obs_id' is the index or column in both summary dataframes for easy lookup
# If 'obs_id' is not the index, we set it explicitly
processing_summary_normalized.set_index('obs_id', inplace=True)
corrupted_summary_normalized.set_index('obs_id', inplace=True)

# Initialize an empty list to store the summary features for each observation ID
final_data_with_summary = []

# Iterate over each observation ID in the PCA DataFrame
for _, row in final_test_pca_df.iterrows():
    obs_id = row['obs_id']

    # Check if the observation ID exists in the processing summary or corrupted summary
    if obs_id in processing_summary_normalized.index:
        # Get the corresponding summary features from the processing summary
        summary_features = processing_summary_normalized.loc[obs_id]
    elif obs_id in corrupted_summary_normalized.index:
        # Get the corresponding summary features from the corrupted summary
        summary_features = corrupted_summary_normalized.loc[obs_id]
    else:
        # If no match is found, fill with NaNs (or handle it as appropriate)
        summary_features = pd.Series([np.nan] * processing_summary_normalized.shape[1], index=processing_summary_normalized.columns)

    # Combine the PCA components with the summary features
    combined_row = pd.concat([row.drop('obs_id'), summary_features])  # Dropping 'obs_id' before concatenation

    # Append the combined row to the final data list
    final_data_with_summary.append(combined_row)

# Convert the list of combined rows back into a DataFrame
final_data_with_summary_df = pd.DataFrame(final_data_with_summary)

# Add 'obs_id' back to the DataFrame
final_data_with_summary_df['obs_id'] = final_test_pca_df['obs_id']

# Reorder the columns so that 'obs_id' is the first column
final_data_with_summary_df = final_data_with_summary_df[['obs_id'] + [col for col in final_data_with_summary_df.columns if col != 'obs_id']]

# Save the final DataFrame with PCA and summary features to a CSV
final_data_with_summary_df.to_csv("final_test_with_summary_features.csv", index=False)

# Display the updated DataFrame
print("Updated DataFrame with PCA and Summary Features:")
print(final_data_with_summary_df)


Updated DataFrame with PCA and Summary Features:
           obs_id  user_id  Feature1  Feature2  Feature3  Feature4  Feature5   latency  \
0      114_obs_10      114  0.100609  0.000000  0.000000  0.000000  0.090414  0.901165   
1      114_obs_10      114  0.127255  0.000000  0.000000  0.000000  0.012663  0.901165   
2      114_obs_10      114  0.015271  0.020762  0.000000  0.000000  0.295411  0.901165   
3      114_obs_10      114  0.000000  0.111557  0.000000  0.000000  0.460619  0.901165   
4      114_obs_10      114  0.000000  0.178535  0.000000  0.000000  1.195371  0.901165   
5      114_obs_10      114  0.000000  0.050224  0.000000  0.000000  0.431161  0.901165   
6      114_obs_10      114  0.000000  0.295696  0.000000  0.000000  1.036061  0.901165   
7      114_obs_10      114  0.000000  0.387625  0.000000  0.000000  2.148521  0.901165   
8     114_obs_100      114  0.079782  0.000257  0.000000  0.000000  0.176746  0.250732   
9     114_obs_100      114  0.000000  0.081803  0.0

In [None]:
import pandas as pd
import numpy as np

# Load the final DataFrame with PCA and summary features
final_data_with_summary_df = pd.read_csv("final_test_with_summary_features.csv")

# Calculate the total number of observations
total_observations = final_data_with_summary_df.shape[0]

# Determine the number of observations per episode
observations_per_episode = total_observations // 20  # Divide into 20 episodes

# Initialize an empty list to store the final episode data
final_episode_data = []

# Create a new 'episode_id' column to assign episode IDs
for i in range(20):
    # Get the start and end index for the current episode
    start_index = i * observations_per_episode
    if i == 19:  # Ensure the last episode includes all remaining observations
        end_index = total_observations
    else:
        end_index = (i + 1) * observations_per_episode

    # Slice the data for the current episode
    episode_data = final_data_with_summary_df.iloc[start_index:end_index].copy()

    # Assign the current episode ID to the observations in this episode
    episode_data['episode_id'] = f'Episode_{i+1}'

    # Append the episode data to the final list
    final_episode_data.append(episode_data)

# Combine all episodes into one DataFrame
final_episode_data_df = pd.concat(final_episode_data, axis=0)

# Save the final episode DataFrame to a CSV
final_episode_data_df.to_csv("episode_test.csv", index=False)

# Display the updated episode data
print("Final Episode Data with Episode IDs:")
print(final_episode_data_df)


Final Episode Data with Episode IDs:
           obs_id  user_id  Feature1  Feature2  Feature3  Feature4  Feature5   latency  \
0      114_obs_10      114  0.100609  0.000000  0.000000  0.000000  0.090414  0.901165   
1      114_obs_10      114  0.127255  0.000000  0.000000  0.000000  0.012663  0.901165   
2      114_obs_10      114  0.015271  0.020762  0.000000  0.000000  0.295411  0.901165   
3      114_obs_10      114  0.000000  0.111557  0.000000  0.000000  0.460619  0.901165   
4      114_obs_10      114  0.000000  0.178535  0.000000  0.000000  1.195371  0.901165   
5      114_obs_10      114  0.000000  0.050224  0.000000  0.000000  0.431161  0.901165   
6      114_obs_10      114  0.000000  0.295696  0.000000  0.000000  1.036061  0.901165   
7      114_obs_10      114  0.000000  0.387625  0.000000  0.000000  2.148521  0.901165   
8     114_obs_100      114  0.079782  0.000257  0.000000  0.000000  0.176746  0.250732   
9     114_obs_100      114  0.000000  0.081803  0.000000  0.000

In [None]:
print(X_train.dtypes)  # Check column data types in the training data


obs_id            object
Feature1         float64
Feature2         float64
Feature3         float64
Feature4         float64
Feature5         float64
latency          float64
typing_speed     float64
std_time_diff    float64
dtype: object


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from collections import deque
import random

# Load the episode data
train_df = pd.read_csv("final_episodes.csv")
test_df = pd.read_csv("episode_test.csv")

# Preprocess the data - example preprocessing
# Assuming you have features and labels (user_id as the target)
X_train = train_df.drop(columns=['episode_id', 'user_id'])
y_train = train_df['user_id'] == 1  # Legitimate user is 1, corrupted is 0

X_test = test_df.drop(columns=['episode_id', 'user_id'])
y_test = test_df['user_id'] == 1  # Legitimate user is 1, corrupted is 0

# Convert to numpy arrays for training
X_train = X_train.values.astype(np.float32)
X_test = X_test.values.astype(np.float32)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Define the DDQN Model
def build_ddqn_model(input_shape):
    model = models.Sequential()
    model.add(layers.Dense(64, input_dim=input_shape, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(2, activation='linear'))  # Output two actions: legitimate or corrupted
    model.compile(optimizer='adam', loss='mse')
    return model

# Initialize parameters for training
input_shape = X_train.shape[1]
model = build_ddqn_model(input_shape)
target_model = build_ddqn_model(input_shape)

# Copy the model weights to the target model initially
target_model.set_weights(model.get_weights())

# Hyperparameters
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01  # Minimum epsilon
epsilon_decay = 0.995  # Epsilon decay
batch_size = 32
memory = deque(maxlen=2000)  # Experience replay memory
n_episodes = 1000
training_start = 1000  # Start training after this number of steps

# Store episode rewards for later analysis
episode_rewards = []

# Train the DDQN model
for episode in range(n_episodes):
    state = X_train[episode % len(X_train)]  # Current state from the training data
    state = np.reshape(state, (1, -1))  # Reshape to match input dimensions
    total_reward = 0

    for step in range(len(X_train)):
        # Choose action based on epsilon-greedy policy
        if np.random.rand() <= epsilon:
            action = random.choice([0, 1])  # Random action: 0 = corrupted, 1 = legitimate
        else:
            q_values = model.predict(state)  # Predict Q-values
            action = np.argmax(q_values)  # Choose action with highest Q-value

        # Get next state and reward
        next_state = X_train[(episode + 1) % len(X_train)]  # Next state
        next_state = np.reshape(next_state, (1, -1))  # Reshape to match input dimensions

        # Reward: +1 for legitimate user (user_id == 1), -1 for corrupted (user_id != 1)
        reward = 1 if y_train[episode % len(X_train)] == 1 else -1

        # Store experience in memory
        memory.append((state, action, reward, next_state))

        # Update state for next step
        state = next_state
        total_reward += reward

        # Experience replay
        if len(memory) >= batch_size:
            minibatch = random.sample(memory, batch_size)
            state_batch = []
            action_batch = []
            reward_batch = []
            next_state_batch = []

            for state, action, reward, next_state in minibatch:
                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)
                next_state_batch.append(next_state)

            # Ensure the batches are numpy arrays with the correct shape
            state_batch = np.array(state_batch).reshape(batch_size, -1).astype(np.float32)
            next_state_batch = np.array(next_state_batch).reshape(batch_size, -1).astype(np.float32)
            reward_batch = np.array(reward_batch).astype(np.float32)

            # Predict Q-values for the current states and next states
            target_q_values = model.predict(state_batch)
            next_q_values = target_model.predict(next_state_batch)

            # Double DQN update
            for i in range(batch_size):
                target = reward_batch[i] + gamma * next_q_values[i][np.argmax(target_q_values[i])]
                target_q_values[i][action_batch[i]] = target

            # Update the model
            model.fit(state_batch, target_q_values, epochs=1, verbose=0)

        # Periodically update the target model
        if episode % 10 == 0:
            target_model.set_weights(model.get_weights())

    # Decay epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    episode_rewards.append(total_reward)
    print(f"Episode {episode + 1}/{n_episodes}, Total Reward: {total_reward}")

# Evaluate the model on test data
test_q_values = model.predict(X_test)
test_predictions = np.argmax(test_q_values, axis=1)
accuracy = np.mean(test_predictions == y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


ValueError: could not convert string to float: '100_obs_115'