In [7]:
import pandas as pd
column_names = [
    'Timestamp_ms',
    'CPU_cores',
    'CPU_capacity_provisioned_MHZ',
    'CPU_usage_MHZ',
    'CPU_usage_percent',
    'Memory_capacity_provisioned_KB',
    'Memory_usage_KB',
    'Disk_read_throughput_KB_s',
    'Disk_write_throughput_KB_s',
    'Network_received_throughput_KB_s',
    'Network_transmitted_throughput_KB_s',
    'vm_id'
]


In [8]:
def custom_parser(line):
    # Split on semicolons first
    parts = line.strip().split(';\t')
    # Split the last part containing both network throughput and vm_id
    last_part = parts[-1].split(',')
    return parts[:-1] + last_part


with open('C:/Users/shrey-keda-nk/Desktop/dev/data/full_dataset/dataset1.csv', 'r') as f:
    # Skip header
    next(f)
    # Process lines with custom parser
    data = [custom_parser(line) for line in f]

In [9]:
df = pd.DataFrame(data, columns=column_names)

In [10]:
numeric_cols = column_names[2:-1]  # All columns except Timestamp, CPU_cores, and vm_id
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df['Timestamp'] = pd.to_datetime(df['Timestamp_ms'], unit='ms')
df['vm_id'] = df['vm_id'].astype('category')

  df['Timestamp'] = pd.to_datetime(df['Timestamp_ms'], unit='ms')


In [11]:
df = df.drop('Timestamp_ms', axis=1)

In [12]:
print("Successfully loaded data:")
print(df.head())

Successfully loaded data:
  CPU_cores  CPU_capacity_provisioned_MHZ  CPU_usage_MHZ  CPU_usage_percent  \
0         4                   11703.99824   10912.027692          93.233333   
1         4                   11703.99824   10890.570362          93.050000   
2         4                   11703.99824   10434.114431          89.150000   
3         4                   11703.99824   10539.450415          90.050000   
4         4                   11703.99824   10951.041020          93.566667   

   Memory_capacity_provisioned_KB  Memory_usage_KB  Disk_read_throughput_KB_s  \
0                      67108864.0     6.129274e+06                   0.133333   
1                      67108864.0     6.755624e+06                   1.333333   
2                      67108864.0     8.947846e+06                   2.533333   
3                      67108864.0     1.879048e+07                   5.466667   
4                      67108864.0     9.305761e+06                   5.400000   

   Disk_writ

In [13]:
# Check columns
print("\nColumns:", df.columns.tolist())

# Check data types
print("\nData types:")
print(df.dtypes)

# Check for duplicates
print("\nDuplicate columns:", df.columns[df.columns.duplicated()].tolist())


Columns: ['CPU_cores', 'CPU_capacity_provisioned_MHZ', 'CPU_usage_MHZ', 'CPU_usage_percent', 'Memory_capacity_provisioned_KB', 'Memory_usage_KB', 'Disk_read_throughput_KB_s', 'Disk_write_throughput_KB_s', 'Network_received_throughput_KB_s', 'Network_transmitted_throughput_KB_s', 'vm_id', 'Timestamp']

Data types:
CPU_cores                                      object
CPU_capacity_provisioned_MHZ                  float64
CPU_usage_MHZ                                 float64
CPU_usage_percent                             float64
Memory_capacity_provisioned_KB                float64
Memory_usage_KB                               float64
Disk_read_throughput_KB_s                     float64
Disk_write_throughput_KB_s                    float64
Network_received_throughput_KB_s              float64
Network_transmitted_throughput_KB_s           float64
vm_id                                        category
Timestamp                              datetime64[ns]
dtype: object

Duplicate columns: []

In [14]:

from sklearn.preprocessing import MinMaxScaler
import numpy as np
df['Disk_read_throughput_KB_s'] = df['Disk_read_throughput_KB_s'].apply(lambda x: f'{x:.10f}')
df['Network_transmitted_throughput_KB_s'] = df['Network_transmitted_throughput_KB_s'].apply(lambda x: f'{x:.10f}')


numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols = [col for col in numerical_cols if col != 'vm_id']


scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
print("Normalized Data:")
print(df.head())

Normalized Data:
  CPU_cores  CPU_capacity_provisioned_MHZ  CPU_usage_MHZ  CPU_usage_percent  \
0         4                      0.135463       0.170830           0.499643   
1         4                      0.135463       0.170494           0.498660   
2         4                      0.135463       0.163348           0.477760   
3         4                      0.135463       0.164997           0.482583   
4         4                      0.135463       0.171441           0.501429   

   Memory_capacity_provisioned_KB  Memory_usage_KB Disk_read_throughput_KB_s  \
0                        0.125131         0.015222              0.1333333333   
1                        0.125131         0.016778              1.3333333333   
2                        0.125131         0.022222              2.5333333333   
3                        0.125131         0.046667              5.4666666667   
4                        0.125131         0.023111              5.4000000000   

   Disk_write_throughput_KB

In [15]:
df['time'] = df['Timestamp'].dt.strftime('%H:%M:%S.%f')
df = df.drop(columns=['Timestamp'])

print(df)

         CPU_cores  CPU_capacity_provisioned_MHZ  CPU_usage_MHZ  \
0                4                      0.135463       0.170830   
1                4                      0.135463       0.170494   
2                4                      0.135463       0.163348   
3                4                      0.135463       0.164997   
4                4                      0.135463       0.171441   
...            ...                           ...            ...   
11221795         4                      0.111111       0.000000   
11221796         4                      0.111111       0.000000   
11221797         4                      0.111111       0.000000   
11221798         4                      0.111111       0.000000   
11221799         4                      0.111111       0.000000   

          CPU_usage_percent  Memory_capacity_provisioned_KB  Memory_usage_KB  \
0                  0.499643                        0.125131         0.015222   
1                  0.498660        

In [16]:
# Split the DataFrame into test and train based on VM ID ranges
df['vm_id'] = pd.to_numeric(df['vm_id'])
test_df = df[df['vm_id'].between(1000, 1250)]  
train_df = df[df['vm_id'].between(0,1000)]

# Validate the split
print(f"Test set VM ID range: {test_df['vm_id'].min()}–{test_df['vm_id'].max()}")
print(f"Train set VM ID range: {train_df['vm_id'].min()}–{train_df['vm_id'].max()}")
print(f"Test size: {len(test_df)} rows")
print(f"Train size: {len(train_df)} rows")
test_df.to_csv('test_df.csv', index=False)
train_df.to_csv('train_df.csv', index=False)


Test set VM ID range: 1000–1249
Train set VM ID range: 0–1000
Test size: 2186570 rows
Train size: 9043844 rows


In [17]:
test_df.head()

Unnamed: 0,CPU_cores,CPU_capacity_provisioned_MHZ,CPU_usage_MHZ,CPU_usage_percent,Memory_capacity_provisioned_KB,Memory_usage_KB,Disk_read_throughput_KB_s,Disk_write_throughput_KB_s,Network_received_throughput_KB_s,Network_transmitted_throughput_KB_s,vm_id,time
9035230,2,0.060185,0.00076,0.005002,0.015397,0.001389,3.6,0.000122,1.971663e-06,2.2,1000,22:18:34.846000
9035231,2,0.060185,0.00057,0.003751,0.015397,0.001736,0.0,6.1e-05,3.033328e-07,0.3333333333,1000,22:18:35.146000
9035232,2,0.060185,0.000407,0.00268,0.015397,0.001333,0.0,6.2e-05,6.066655e-07,0.8,1000,22:18:35.446000
9035233,2,0.060185,0.00057,0.003751,0.015397,0.001389,0.0,6.1e-05,3.79166e-07,0.3333333333,1000,22:18:35.746000
9035234,2,0.060185,0.000434,0.002858,0.015397,0.001153,0.0,6.5e-05,1.516664e-07,0.1333333333,1000,22:18:36.046000
