In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import joblib
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


In [2]:
# Build a synthetic system-usage dataset (one row per minute) and write it to CSV.

# Starting point for the timestamp series (strptime returns a naive datetime here)
start_time = datetime.strptime("Sat Nov 16 10:28:01 UTC 2024", "%a %b %d %H:%M:%S %Z %Y")

# Number of rows to generate
num_rows = 1000

# One timestamp per minute, starting at start_time
timestamps = [start_time + timedelta(minutes=i) for i in range(num_rows)]

# Generate synthetic CPU, memory, and disk usage data
np.random.seed(42)  # Seed the global NumPy RNG so the draws below are reproducible
cpu_usage = np.abs(np.random.normal(loc=10, scale=5, size=num_rows))  # |N(10, 5)| keeps values non-negative
memory_usage = np.random.uniform(low=14.2, high=14.3, size=num_rows)  # Between 14.2 and 14.3
disk_usage = np.random.choice([1, 2], size=num_rows, p=[0.8, 0.2])  # 1 with p=0.8, 2 with p=0.2

# Create a DataFrame; timestamps are stored as text with a literal "UTC" marker
large_data = pd.DataFrame({
    "timestamp": [t.strftime("%a %b %d %H:%M:%S UTC %Y") for t in timestamps],
    "cpu_usage": cpu_usage,
    "memory_usage": memory_usage,
    "disk_usage": disk_usage
})

# Save to CSV.
# Raw string: the Windows path contains backslashes ("\A", "\l") that a normal
# string literal would treat as invalid escape sequences.
output_path = r"D:\AI_powered_system_monitor\large_system_usage.csv"
large_data.to_csv(output_path, index=False)
# Report the actual row count instead of a hard-coded number
print(f"Generated synthetic dataset with {num_rows} rows and saved as 'large_system_usage.csv'.")


Generated synthetic dataset with 1000 rows and saved as 'large_system_usage.csv'.


In [3]:
# Load the dataset.
# Raw string: the Windows path contains backslashes ("\A", "\l") that a normal
# string literal would treat as invalid escape sequences.
csv_path = r"D:\AI_powered_system_monitor\large_system_usage.csv"
data = pd.read_csv(csv_path)

# Display the first rows as a quick sanity check of the loaded frame
print("Dataset Overview:")
print(data.head())



Dataset Overview:
                      timestamp  cpu_usage  memory_usage  disk_usage
0  Sat Nov 16 10:28:01 UTC 2024  12.483571     14.216748           1
1  Sat Nov 16 10:29:01 UTC 2024   9.308678     14.210457           1
2  Sat Nov 16 10:30:01 UTC 2024  13.238443     14.263643           1
3  Sat Nov 16 10:31:01 UTC 2024  17.615149     14.270648           1
4  Sat Nov 16 10:32:01 UTC 2024   8.829233     14.203159           2


In [11]:
# Print a concise summary of the DataFrame (column names, non-null counts, dtypes).
# info() writes to stdout itself and returns None.
data.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     1000 non-null   object 
 1   cpu_usage     1000 non-null   float64
 2   memory_usage  1000 non-null   float64
 3   disk_usage    1000 non-null   int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 31.4+ KB
None


In [12]:
# Count missing values per column (isna is the canonical spelling of isnull)
data.isna().sum()


Missing Values:
timestamp       0
cpu_usage       0
memory_usage    0
disk_usage      0
dtype: int64


In [13]:
# Collect the column labels into a plain Python list and show them
columns = list(data.columns)
print(f"\nColumns in the dataset: {columns}")


Columns in the dataset: ['timestamp', 'cpu_usage', 'memory_usage', 'disk_usage']


In [4]:
# Preprocess the data: turn the textual timestamps into elapsed seconds.
# The format is given explicitly (it matches the strftime layout used when the
# dataset was generated) instead of relying on pandas' format inference.
data['Timestamp'] = pd.to_datetime(
    data['timestamp'],
    format="%a %b %d %H:%M:%S UTC %Y",
    utc=True,          # the strings carry a literal "UTC" marker
    errors='coerce',   # unparseable rows become NaT ...
)
# ... and are reported here rather than silently flowing into training as NaN
assert data['Timestamp'].notna().all(), (
    f"{int(data['Timestamp'].isna().sum())} timestamp(s) could not be parsed"
)
data['Seconds'] = (data['Timestamp'] - data['Timestamp'].min()).dt.total_seconds()

# Feature: seconds since the first sample. Target: CPU usage.
# NOTE: X, y and model are reassigned further down in this cell.
X = data[['Seconds']]
y = data['cpu_usage'].astype(float)  # Target: CPU Usage

# Train the model (fixed seed so the saved model is reproducible across runs)
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Save the trained model
joblib.dump(model, "cpu_usage_predictor.pkl")
print("Model saved as cpu_usage_predictor.pkl")


# Coerce each usage column to its expected dtype (in place, column by column)
for column_name, column_dtype in (
    ('cpu_usage', float),
    ('memory_usage', float),
    ('disk_usage', int),
):
    data[column_name] = data[column_name].astype(column_dtype)

# Inputs are memory and disk usage; the quantity to predict is CPU usage
X = data[['memory_usage', 'disk_usage']]
y = data['cpu_usage']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Persist every split to its own CSV file (without the index column)
for csv_name, split_part in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    split_part.to_csv(csv_name, index=False)

print("\nData preprocessing completed. Training and testing datasets saved.")
# Load the preprocessed training and testing data back from disk.
# squeeze("columns") turns the single-column target frames into Series and,
# unlike a bare squeeze(), never collapses a one-row file to a scalar.
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv").squeeze("columns")
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv").squeeze("columns")

# Train the gradient-boosting regressor (GradientBoostingRegressor and joblib
# are already imported in the notebook's first cell, so no re-import here)
model = GradientBoostingRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split so the test data is actually used
test_mse = mean_squared_error(y_test, model.predict(X_test))
print(f"Test MSE: {test_mse:.4f}")

# Save the trained model
joblib.dump(model, 'cpu_usage_predictor_model.pkl')
print("Model saved as cpu_usage_predictor_model.pkl")

Model saved as cpu_usage_predictor.pkl
