In [9]:
import os
import pandas as pd
from tqdm import tqdm  # For progress tracking

def load_and_combine_data(data_dir):
    all_dfs = []

    # Sort filenames that start with a digit and end with .csv
    csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv') and f[0].isdigit()) 

    for idx, file in tqdm(enumerate(csv_files), total=len(csv_files)):
        file_path = os.path.join(data_dir, file)
        df = pd.read_csv(file_path)
        df['vm_id'] = idx % 1250  # Ensure unique vm_id within 1250 range
        all_dfs.append(df)

    combined_df = pd.concat(all_dfs, ignore_index=True)
    return combined_df



# Usage
full_dataset = load_and_combine_data('C:/Users/shrey-keda-nk/Desktop/dev/data')

100%|██████████| 1250/1250 [00:24<00:00, 50.39it/s]


In [10]:
def save_combined_data(df, save_path):
    df.to_csv(save_path, index=False)
    print(f"\n Data saved to: {save_path}")

In [11]:
save_path = 'C:/Users/shrey-keda-nk/Desktop/dev/data/full_dataset/dataset1.csv'
save_combined_data(full_dataset, save_path )


 Data saved to: C:/Users/shrey-keda-nk/Desktop/dev/data/full_dataset/dataset1.csv


In [12]:
full_dataset.head(10)

Unnamed: 0,Timestamp [ms];\tCPU cores;\tCPU capacity provisioned [MHZ];\tCPU usage [MHZ];\tCPU usage [%];\tMemory capacity provisioned [KB];\tMemory usage [KB];\tDisk read throughput [KB/s];\tDisk write throughput [KB/s];\tNetwork received throughput [KB/s];\tNetwork transmitted throughput [KB/s],vm_id
0,1376314846;\t4;\t11703.99824;\t10912.027692426...,0
1,1376315146;\t4;\t11703.99824;\t10890.57036232;...,0
2,1376315446;\t4;\t11703.99824;\t10434.11443096;...,0
3,1376315746;\t4;\t11703.99824;\t10539.450415120...,0
4,1376316046;\t4;\t11703.99824;\t10951.041019893...,0
5,1376316346;\t4;\t11703.99824;\t10913.978358800...,0
6,1376316646;\t4;\t11703.99824;\t10855.4583676;\...,0
7,1376316946;\t4;\t11703.99824;\t10157.119805946...,0
8,1376317246;\t4;\t11703.99824;\t10477.029091173...,0
9,1376317546;\t4;\t11703.99824;\t11128.551659866...,0
