In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import random
import os

# --- Optimized Configuration (UPDATED) ---
N_DEVICES = 50
N_FAILING_DEVICES = 15 # (30% failure rate)
TIME_START = datetime(2023, 1, 1)
TIME_END = datetime(2023, 12, 31)
SAMPLING_INTERVAL_HOURS = 6
OUTPUT_CSV = 'smartphone_unified_dataset_v2.csv' # New filename

# --- Function to generate data for a single device ---
def generate_device_data(device_id, failure_info):
    timestamps = pd.to_datetime(np.arange(TIME_START, TIME_END, timedelta(hours=SAMPLING_INTERVAL_HOURS)))
    n_samples = len(timestamps)
    df = pd.DataFrame({'timestamp': timestamps, 'device_id': device_id})

    # --- Generate Healthy Features ---
    df['battery_level'] = 100 - (np.sin(np.arange(n_samples) / 4 * np.pi) + 1) * 40 + np.random.normal(0, 2, n_samples)
    df['cpu_usage_percent'] = np.random.uniform(5, 30, n_samples) + np.sin(np.arange(n_samples) / 50) * 10
    df['memory_usage_percent'] = np.random.uniform(40, 60, n_samples)
    df['storage_usage_percent'] = np.linspace(20, 80, n_samples)
    df['app_crashes'] = np.random.poisson(0.05, n_samples)
    df['network_signal_strength_dbm'] = np.random.uniform(-110, -80, n_samples)
    df['screen_on_time_minutes'] = np.random.exponential(30, n_samples)
    df['fast_charging_active'] = np.random.choice([0, 1], size=n_samples, p=[0.8, 0.2])
    df['speaker_volume_percent'] = np.random.uniform(0, 100, n_samples)
    df['ambient_temp_c'] = np.random.uniform(15, 35, n_samples) + np.sin(np.arange(n_samples) / 1460 * 2 * np.pi) * 5

    df['failure_type'], df['days_until_failure'], df['is_failing_soon'] = 0, 999.0, 0

    if failure_info['is_failing']:
        failure_point = int(n_samples * failure_info['failure_point_ratio'])
        degradation_period = int(n_samples * 0.20) # A slightly longer degradation period
        degradation_start = failure_point - degradation_period
        degradation_factor = np.linspace(0, 1, n_samples - degradation_start)

        # --- More Aggressive Failure Signals ---
        if failure_info['failure_type'] == 1: # Battery
            df.loc[degradation_start:, 'battery_level'] *= (1 - degradation_factor * 0.8) # Drains to 20%
            df.loc[degradation_start:, 'ambient_temp_c'] += degradation_factor * 15 # Gets hot
        elif failure_info['failure_type'] == 2: # CPU
            df.loc[degradation_start:, 'cpu_usage_percent'] += degradation_factor * 60 # Higher CPU load
            df.loc[degradation_start:, 'ambient_temp_c'] += degradation_factor * 25 # Gets very hot
        elif failure_info['failure_type'] == 3: # Memory
            df.loc[degradation_start:, 'memory_usage_percent'] += degradation_factor * 45 # Higher memory load
            df.loc[degradation_start:, 'app_crashes'] += np.random.poisson(1.5, len(degradation_factor)) # More crashes

        for i in range(degradation_start, failure_point):
            days_left = (df.loc[failure_point, 'timestamp'] - df.loc[i, 'timestamp']).total_seconds() / (3600 * 24)
            df.loc[i, 'days_until_failure'] = days_left
            df.loc[i, 'failure_type'] = failure_info['failure_type']
            if days_left <= 7:
                df.loc[i, 'is_failing_soon'] = 1

    # Clip values
    for col in ['battery_level', 'cpu_usage_percent', 'memory_usage_percent', 'storage_usage_percent']:
        df[col] = df[col].clip(0, 100)

    return df

# --- Main Generation Loop ---
# (Same as before, but ensure you delete the old file if it exists)
OUTPUT_CSV = 'smartphone_unified_dataset_v2.csv'
if os.path.exists(OUTPUT_CSV):
    os.remove(OUTPUT_CSV)

all_device_ids = [f"device_{i:03d}" for i in range(N_DEVICES)]
failing_device_ids = random.sample(all_device_ids, N_FAILING_DEVICES)
failure_types = [1, 2, 3] # 1:Battery, 2:CPU, 3:Memory
failure_infos = {}
for i, device_id in enumerate(failing_device_ids):
    failure_infos[device_id] = {
        "is_failing": True,
        "failure_type": failure_types[i % len(failure_types)],
        "failure_point_ratio": random.uniform(0.8, 0.95)
    }

print(f"Generating new dataset '{OUTPUT_CSV}'...")
header_written = False
for i in tqdm(range(N_DEVICES), desc="Simulating Devices"):
    device_id = f"device_{i:03d}"
    info = failure_infos.get(device_id, {"is_failing": False})
    device_df = generate_device_data(device_id, info)
    device_df.to_csv(OUTPUT_CSV, mode='a', header=not header_written, index=False)
    header_written = True

print(f"\n✅ New, more aggressive dataset generated successfully!")

Generating new dataset 'smartphone_unified_dataset_v2.csv'...


Simulating Devices: 100%|██████████| 50/50 [00:05<00:00,  9.16it/s]


✅ New, more aggressive dataset generated successfully!





In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib
from tqdm import tqdm
import os

# --- Configuration ---
INPUT_CSV = 'smartphone_unified_dataset_v2.csv' # Using the new, larger dataset
SEQUENCE_TIMESTEPS = 56 # 14 days * 4 readings/day

def prepare_large_dataset():
    """
    Prepares the large, unified dataset for the multi-task LSTM model,
    optimized for memory efficiency.
    """
    print("--- Phase 2: Preparing Large Dataset for Multi-Task Model ---")

    # --- 1. Load Data (Memory Optimized) ---
    print(f"Loading '{INPUT_CSV}'...")
    try:
        dtype_map = {
            'battery_level': 'float32', 'cpu_usage_percent': 'float32',
            'memory_usage_percent': 'float32', 'storage_usage_percent': 'float32',
            'app_crashes': 'int8', 'network_signal_strength_dbm': 'float32',
            'screen_on_time_minutes': 'float32', 'fast_charging_active': 'int8',
            'speaker_volume_percent': 'float32', 'ambient_temp_c': 'float32',
            'failure_type': 'int8', 'days_until_failure': 'float32', 'is_failing_soon': 'int8'
        }
        df = pd.read_csv(INPUT_CSV, dtype=dtype_map, parse_dates=['timestamp'])
    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV}' was not found. Please ensure it was generated correctly.")
        return

    df.sort_values(by=['device_id', 'timestamp'], inplace=True)
    print("Data loaded successfully.")
    df.info(memory_usage='deep')

    # --- 2. Scale Features ---
    feature_columns = ['battery_level', 'cpu_usage_percent', 'memory_usage_percent', 'storage_usage_percent', 'app_crashes', 'network_signal_strength_dbm', 'screen_on_time_minutes', 'fast_charging_active', 'speaker_volume_percent', 'ambient_temp_c']

    print("\nScaling features...")
    scaler = MinMaxScaler()
    df[feature_columns] = scaler.fit_transform(df[feature_columns])
    joblib.dump(scaler, 'multi_task_scaler.joblib')
    print("Scaler saved to 'multi_task_scaler.joblib'.")

    # --- 3. Create Sequences ---
    print(f"\nCreating sequences of {SEQUENCE_TIMESTEPS} timesteps...")
    sequences, labels_why, labels_when, labels_if = [], [], [], []

    # Group by device to ensure sequences don't cross over different devices
    grouped_data = df.groupby('device_id')

    for _, group in tqdm(grouped_data, desc="Creating Sequences"):
        features = group[feature_columns].values
        label_why = group['failure_type'].values
        label_when = group['days_until_failure'].values
        label_if = group['is_failing_soon'].values

        # Slide a window across each group's data
        for i in range(len(features) - SEQUENCE_TIMESTEPS):
            sequences.append(features[i:i + SEQUENCE_TIMESTEPS])
            # The label corresponds to the END of the sequence window
            labels_why.append(label_why[i + SEQUENCE_TIMESTEPS - 1])
            labels_when.append(label_when[i + SEQUENCE_TIMESTEPS - 1])
            labels_if.append(label_if[i + SEQUENCE_TIMESTEPS - 1])

    X = np.array(sequences)
    y_why = np.array(labels_why)
    y_when = np.array(labels_when)
    y_if = np.array(labels_if)

    # --- 4. Save Processed Data ---
    print("\nSaving processed data to .npy files...")
    np.save('sequences_large.npy', X)
    np.save('labels_why_large.npy', y_why)
    np.save('labels_when_large.npy', y_when)
    np.save('labels_if_large.npy', y_if)

    print("\n--- Data Preparation Summary ---")
    print(f"Total sequences created: {X.shape[0]}")
    print(f"Sequence shape (X): {X.shape}")
    print(f"Labels 'Why' shape (y_why): {y_why.shape}")
    print(f"Labels 'When' shape (y_when): {y_when.shape}")
    print(f"Labels 'If' shape (y_if): {y_if.shape}")
    print("\n✅ Data preparation complete. You are now ready to train the multi-task model.")

if __name__ == "__main__":
    prepare_large_dataset()

--- Phase 2: Preparing Large Dataset for Multi-Task Model ---
Loading 'smartphone_unified_dataset_v2.csv'...
Data loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72800 entries, 0 to 72799
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   timestamp                    72800 non-null  datetime64[ns]
 1   device_id                    72800 non-null  object        
 2   battery_level                72800 non-null  float32       
 3   cpu_usage_percent            72800 non-null  float32       
 4   memory_usage_percent         72800 non-null  float32       
 5   storage_usage_percent        72800 non-null  float32       
 6   app_crashes                  72800 non-null  int8          
 7   network_signal_strength_dbm  72800 non-null  float32       
 8   screen_on_time_minutes       72800 non-null  float32       
 9   fast_charging_active         72800 n

Creating Sequences: 100%|██████████| 50/50 [00:00<00:00, 340.01it/s]



Saving processed data to .npy files...

--- Data Preparation Summary ---
Total sequences created: 70000
Sequence shape (X): (70000, 56, 10)
Labels 'Why' shape (y_why): (70000,)
Labels 'When' shape (y_when): (70000,)
Labels 'If' shape (y_if): (70000,)

✅ Data preparation complete. You are now ready to train the multi-task model.
