# VM Migration Data Processing

This notebook generates a synthetic dataset of VM resource metrics and demonstrates data processing and model training for predicting VM migration downtime.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the legacy 'seaborn' name, so use whichever one
# this installation actually provides instead of failing on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

In [None]:
# Generate sample data
# Seed the global NumPy generator so the synthetic dataset is reproducible.
np.random.seed(42)
n_samples = 1000

# (low, high) bounds of the uniform distribution each feature is drawn from.
# Insertion order matters: it fixes the order in which the seeded generator
# is consumed, and therefore the exact values each column receives.
_feature_bounds = {
    'cpu_load': (10, 100),
    'memory_usage': (20, 95),
    'disk_io': (5, 100),
    'network_bandwidth': (50, 1000),
}

data = {
    feature: np.random.uniform(low, high, n_samples)
    for feature, (low, high) in _feature_bounds.items()
}

df = pd.DataFrame(data)
df.head()

In [None]:
# Create target variable (downtime)
# Downtime is synthesised as a constant baseline plus a linear contribution
# from each resource metric and zero-mean Gaussian noise (std 10).
base_downtime = 50
noise = np.random.normal(0, 10, n_samples)

# Higher CPU, memory and disk load add downtime; the network term shrinks as
# bandwidth approaches 1000.
cpu_impact = df['cpu_load'].mul(0.8)
memory_impact = df['memory_usage'].mul(0.6)
disk_impact = df['disk_io'].mul(0.4)
network_impact = df['network_bandwidth'].rsub(1000).mul(0.1)

# Add the terms in a fixed left-to-right order, then floor the result.
df['downtime'] = (
    base_downtime + cpu_impact + memory_impact + disk_impact + network_impact + noise
).clip(lower=10)  # Minimum 10ms

print(f'Dataset shape: {df.shape}')
df.describe()

In [None]:
# Visualize data
# Draw one histogram per DataFrame column. The grid is sized from the number
# of columns (three panels per row) so that no column is silently skipped and
# only panels that are genuinely unused get removed. For the five columns
# built above this yields the same 2x3, 15x10-inch figure as a fixed layout.
n_plot_cols = 3
n_plots = len(df.columns)
n_plot_rows = max(1, -(-n_plots // n_plot_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(5 * n_plot_cols, 5 * n_plot_rows),
    squeeze=False,  # keep `axes` two-dimensional even for a single row
)
panels = axes.ravel()  # row-major order: left to right, then top to bottom

# Distribution plots
for ax, col in zip(panels, df.columns):
    ax.hist(df[col], bins=30, alpha=0.7)
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Remove empty subplots (every panel past the last column)
for ax in panels[n_plots:]:
    fig.delaxes(ax)
plt.tight_layout()
plt.show()