# In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import string

# Seed NumPy's global random state so the np.random draws below repeat
# from run to run. The generators' default timestamp upper bound is
# datetime.now(), so the Timestamp columns can still vary between runs.
np.random.seed(42)

# Row count passed to the dataset generators below.
num_rows = 5000

# ==================================================================
# 1. Energy Usage Dataset
# ==================================================================
def generate_energy_usage(num_rows, start_date=None, end_date=None):
    """Build a synthetic per-server energy usage table.

    All values are drawn from NumPy's global random state (``np.random``).
    Seeding that state AND passing a fixed ``end_date`` makes the output
    repeatable; with the default ``end_date`` the timestamp range ends at
    the current time, so the ``Timestamp`` column (and the derived
    ``Year``/``Month``/``Day``) can differ between runs under the same seed.

    Args:
        num_rows: Number of rows to generate.
        start_date: Inclusive lower bound for the random timestamps.
            Defaults to ``datetime(2020, 1, 1)``.
        end_date: Exclusive upper bound for the random timestamps.
            Defaults to ``datetime.now()``.

    Returns:
        pandas.DataFrame with ``num_rows`` rows: the fifteen generated
        columns plus ``Year``, ``Month`` and ``Day`` extracted from
        ``Timestamp``.

    Raises:
        ValueError: Raised by ``np.random.randint`` when ``end_date`` is
            not later than ``start_date`` (compared in whole seconds).
    """
    if start_date is None:
        start_date = datetime(2020, 1, 1)
    if end_date is None:
        end_date = datetime.now()

    # Draw whole-second epoch timestamps in [start_date, end_date).
    start_u = int(start_date.timestamp())
    end_u = int(end_date.timestamp())
    random_ts = np.random.randint(start_u, end_u, num_rows)

    timestamps = [
        datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        for ts in random_ts
    ]

    # NOTE: the dict values are evaluated top to bottom, so the order of the
    # entries fixes the order in which the global random state is consumed.
    # Reordering entries changes the generated data for a given seed.
    data = {
        "Server ID": [f"Server-{i:04d}" for i in range(1, num_rows + 1)],
        "Energy Consumption (kWh)": np.round(np.random.normal(500, 150, num_rows), 2),
        "Cooling Efficiency (%)": np.random.randint(50, 95, num_rows),
        "Power Usage Effectiveness (PUE)": np.round(np.random.uniform(1.0, 2.0, num_rows), 2),
        "Timestamp": timestamps,
        # NOTE(review): "Germany" is a country in a list of cities - confirm
        # whether a city name was intended.
        "Server Location": np.random.choice(
            ["Amsterdam", "Berlin", "Dublin", "Germany", "Lisbon", "London",
             "Paris", "Rome", "Stockholm", "Vienna"],
            num_rows,
        ),
        "Server Type": np.random.choice(
            ["Enterprise", "Cloud", "Edge", "Mainframe", "Network Node", "Backup Server"],
            num_rows,
        ),
        "Energy Source": np.random.choice(
            ["Battery Backup", "Wind", "Solar", "Grid", "Geothermal", "Biomass", "Hydro"],
            num_rows,
            p=[0.15, 0.25, 0.20, 0.25, 0.05, 0.05, 0.05],
        ),
        "Cooling System Type": np.random.choice(
            ["Liquid Cooling", "Evaporative Cooling", "Fan Cooling", "Compressor Cooling",
             "Heat Pipe Cooling", "Phase Change Cooling", "Peltier Cooling",
             "Thermoelectric Cooling"],
            num_rows,
        ),
        # Cost and emissions each use their own normal(500, 150) draw; they
        # are not computed from the "Energy Consumption (kWh)" column above.
        "Energy Cost (€)": np.round(
            np.random.uniform(0.15, 0.30, num_rows) * np.random.normal(500, 150, num_rows), 2
        ),
        "Carbon Emissions (kg CO2)": np.round(
            np.random.choice([0.25, 0.75], num_rows) * np.random.normal(500, 150, num_rows), 2
        ),
        "Maintenance Status": np.random.choice(
            ["Needs Maintenance", "Active", "Inactive", "Out of Service", "Retired",
             "Under Repair"],
            num_rows,
            p=[0.3, 0.3, 0.1, 0.1, 0.1, 0.1],
        ),
        "Energy Savings Target (%)": np.random.randint(15, 35, num_rows),
        # np.round without a decimals argument rounds to whole numbers.
        "Peak Energy Usage (kWh)": np.round(np.random.normal(1000, 50, num_rows)),
        "Off-Peak Energy Usage (kWh)": np.round(np.random.normal(200, 50, num_rows)),
    }

    df = pd.DataFrame(data)
    # Parse the timestamp strings once and reuse the result for all parts.
    parsed = pd.to_datetime(df["Timestamp"])
    df["Year"] = parsed.dt.year
    df["Month"] = parsed.dt.month
    df["Day"] = parsed.dt.day
    return df

# Build the energy usage table with the shared row count.
energy_df = generate_energy_usage(num_rows)

# ==================================================================
# 2. Server Performance Dataset
# ==================================================================
def generate_server_performance(num_rows, start_date=None, end_date=None):
    """Build a synthetic per-server performance table.

    All values are drawn from NumPy's global random state (``np.random``).
    Seeding that state AND passing a fixed ``end_date`` makes the output
    repeatable; with the default ``end_date`` the timestamp range ends at
    the current time, so the ``Timestamp`` column (and the derived
    ``Year``/``Month``/``Day``) can differ between runs under the same seed.

    Args:
        num_rows: Number of rows to generate.
        start_date: Inclusive lower bound for the random timestamps.
            Defaults to ``datetime(2020, 1, 1)``.
        end_date: Exclusive upper bound for the random timestamps.
            Defaults to ``datetime.now()``.

    Returns:
        pandas.DataFrame with ``num_rows`` rows: the fifteen generated
        columns plus ``Year``, ``Month`` and ``Day`` extracted from
        ``Timestamp``.

    Raises:
        ValueError: Raised by ``np.random.randint`` when ``end_date`` is
            not later than ``start_date`` (compared in whole seconds).
    """
    if start_date is None:
        start_date = datetime(2020, 1, 1)
    if end_date is None:
        end_date = datetime.now()

    # Draw whole-second epoch timestamps in [start_date, end_date).
    start_u = int(start_date.timestamp())
    end_u = int(end_date.timestamp())
    random_ts = np.random.randint(start_u, end_u, num_rows)

    timestamps = [
        datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        for ts in random_ts
    ]

    # NOTE: the dict values are evaluated top to bottom, so the order of the
    # entries fixes the order in which the global random state is consumed.
    # Reordering entries changes the generated data for a given seed.
    data = {
        "Server ID": [f"Server-{i:04d}" for i in range(1, num_rows + 1)],
        "Uptime (%)": np.round(np.random.uniform(95.0, 100.0, num_rows), 1),
        # randint's upper bound is exclusive, e.g. workload is 0..99.
        "Workload (CPU %)": np.random.randint(0, 100, num_rows),
        "Temperature (°C)": np.random.randint(25, 45, num_rows),
        "Timestamp": timestamps,
        # NOTE(review): "Germany" is a country in a list of cities - confirm
        # whether a city name was intended.
        "Server Location": np.random.choice(
            ["Amsterdam", "Berlin", "Dublin", "Germany", "Lisbon", "London",
             "Paris", "Rome", "Stockholm", "Vienna"],
            num_rows,
        ),
        "Server Type": np.random.choice(
            ["Enterprise", "Cloud", "Edge", "Mainframe", "Network Node", "Backup Server"],
            num_rows,
        ),
        "Memory Usage (%)": np.random.randint(50, 95, num_rows),
        "Disk Usage (%)": np.random.randint(50, 95, num_rows),
        "Error Rate (%)": np.random.randint(0, 5, num_rows),
        "Carbon Emissions (kg CO2)": np.round(np.random.normal(150, 50, num_rows), 2),
        "Maintenance Status": np.random.choice(
            ["Needs Maintenance", "Active", "Inactive", "Out of Service", "Retired",
             "Under Repair"],
            num_rows,
            p=[0.3, 0.3, 0.1, 0.1, 0.1, 0.1],
        ),
        "Power State": np.random.choice(["On", "Off", "Standby"], num_rows, p=[0.75, 0.10, 0.15]),
        "Fan Speed (RPM)": np.random.randint(2000, 5000, num_rows),
        "Latency (ms)": np.random.randint(1, 100, num_rows),
    }

    df = pd.DataFrame(data)
    # Parse the timestamp strings once and reuse the result for all parts.
    parsed = pd.to_datetime(df["Timestamp"])
    df["Year"] = parsed.dt.year
    df["Month"] = parsed.dt.month
    df["Day"] = parsed.dt.day
    return df

# Build the server performance table with the shared row count.
server_df = generate_server_performance(num_rows)

# ==================================================================
# 3. Vendor Dataset
# ==================================================================

# Pool of vendor names sampled by generate_vendor_data.
vendors = [
    "TechOptima",
    "GreenCompute",
    "EcoInfra",
    "PowerSolutions",
    "CoolIT",
]


def generate_vendor_data(num_rows):
    """Build a synthetic vendor/technology table with ``num_rows`` rows.

    Values come from NumPy's global random state (``np.random``). Columns are
    filled in a fixed order because each draw advances that shared state;
    changing the order changes the data produced for a given seed.

    Args:
        num_rows: Number of vendor rows to generate.

    Returns:
        pandas.DataFrame with fifteen columns, from "Vendor ID" through
        "Technology Status".
    """
    rng = np.random

    # Vendor names are drawn first; the contact addresses reuse them below.
    picked_names = rng.choice(vendors, num_rows)

    # Characters allowed in the random local part of a contact address.
    alphabet = list(string.ascii_lowercase + string.digits + '-_')

    def _contact_for(vendor_name):
        # Ten random characters, then the lower-cased vendor name as domain.
        local_part = ''.join(rng.choice(alphabet, size=10))
        return f"{local_part}@{vendor_name.lower()}.com"

    table = {}
    table["Vendor ID"] = [f"Vendor-{n:04d}" for n in range(1, num_rows + 1)]
    table["Vendor Name"] = picked_names
    table["Technology Type"] = rng.choice(
        ["Renewable Energy", "Hardware", "Software", "Monitoring"], num_rows
    )
    table["Cost (€)"] = np.round(rng.uniform(5000, 25000, num_rows), 1)
    table["Energy Savings (%)"] = rng.randint(10, 30, num_rows)
    table["Vendor Location"] = rng.choice(
        ["Munich", "Cork", "Porto", "Cambridge", "Lyon", "Milan",
         "Gothenburg", "Graz", "Warsaw"],
        num_rows,
    )
    table["Contract Duration (Months)"] = rng.choice([12, 24, 36, 48, 60], num_rows)
    table["Technology ROI (%)"] = rng.randint(5, 25, num_rows)
    table["Installation Time (Days)"] = rng.randint(1, 14, num_rows)
    table["Maintenance Cost (€/Year)"] = np.round(rng.uniform(500, 5000, num_rows), 2)
    table["Vendor Rating (1-5)"] = rng.choice(
        [1, 2, 3, 4, 5], num_rows, p=[0.05, 0.1, 0.2, 0.5, 0.15]
    )
    table["Technology Lifespan (Years)"] = rng.randint(3, 20, num_rows)
    table["Carbon Reduction (kg CO2/Year)"] = rng.randint(1000, 11000, num_rows)
    table["Vendor Contact"] = [_contact_for(vendor_name) for vendor_name in picked_names]
    table["Technology Status"] = rng.choice(
        ["Active", "Inactive", "On Hold", "Pending", "In Development", "Cancelled"],
        num_rows,
        p=[0.4, 0.1, 0.1, 0.1, 0.2, 0.1],
    )

    return pd.DataFrame(table)

# Use the shared row count (as the other two datasets do) instead of a
# separate literal, so all three tables stay the same size.
vendor_df = generate_vendor_data(num_rows)

# ==================================================================
# Save to CSV
# ==================================================================
# Write each table to its own CSV file in the working directory, without
# the DataFrame index column.
csv_outputs = {
    "Energy_Usage_Dataset.csv": energy_df,
    "Server_Performance_Dataset.csv": server_df,
    "Vendor_Dataset.csv": vendor_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)

print("Datasets generated successfully!")

# Output: Datasets generated successfully!
