In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Define date range
start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 1, 1)
date_range = pd.date_range(start_date, end_date, freq='D')

# Sample Indian cities for device locations
locations = [
    'Mumbai', 'Delhi', 'Bangalore', 'Hyderabad', 'Chennai',
    'Kolkata', 'Ahmedabad', 'Pune', 'Jaipur', 'Lucknow'
]

# Sample device IDs
device_ids = [f"Device_{i}" for i in range(1, 11)]

# Generate synthetic records
data = []
for _ in range(1000):
    date = random.choice(date_range)
    device = random.choice(device_ids)
    location = random.choice(locations)

    downtime = random.randint(0, 4)  # downtime in hours
    operational_hours = 24 - downtime
    production_quantity = random.randint(800, 1000)
    target_quantity = 1000
    quality_issues = random.randint(0, 20)

    data.append({
        'Date': date.strftime('%Y-%m-%d'),  # formatted to avoid Excel issues
        'Device_ID': device,
        'Location': location,
        'Operational_Hours': operational_hours,
        'Production_Quantity': production_quantity,
        'Target_Production_Quantity': target_quantity,
        'Downtime': downtime,
        'Quality_Issues': quality_issues
    })

# Convert to DataFrame
df = pd.DataFrame(data)

# Export to CSV
csv_filename = "oee_data.csv"
df.to_csv(csv_filename, index=False)

print(f"Dataset created and saved as {csv_filename}")


Dataset created and saved as synthetic_oee_data_indian_cities_cleaned.csv


In [3]:
df['Month'] = pd.to_datetime(df['Date']).dt.month
df['Month_Name'] = pd.to_datetime(df['Date']).dt.strftime('%B')
df.to_csv("oee_data.csv", index=False)



In [4]:
numeric_cols = ['Operational_Hours', 'Production_Quantity', 'Target_Production_Quantity', 'Downtime', 'Quality_Issues']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


In [5]:
df.dropna(inplace=True) 



In [6]:
print(df[['Date', 'Device_ID', 'Location', 'Month_Name']].head())


         Date Device_ID   Location Month_Name
0  2024-02-23  Device_6    Chennai   February
1  2024-08-11  Device_5      Delhi     August
2  2024-10-28  Device_8    Lucknow    October
3  2024-01-04  Device_8    Chennai    January
4  2024-02-13  Device_6  Hyderabad   February


In [2]:
import pandas as pd

# Load the CSV
df = pd.read_csv("oee_data.csv")
df['Availability'] = (df['Operational_Hours'] - df['Downtime']) / df['Operational_Hours']
df['Performance'] = df['Production_Quantity'] / df['Target_Production_Quantity']
df['Quality'] = (df['Production_Quantity'] - df['Quality_Issues']) / df['Production_Quantity']

df['OEE'] = df['Availability'] * df['Performance'] * df['Quality'] * 100
df['OEE'] = df['OEE'].round(2)
df.to_csv("oee_data.csv", index=False)



In [8]:
df.dropna(subset=['OEE'], inplace=True)  # Remove rows where OEE couldn't be computed


In [9]:
print(df.head())


         Date Device_ID   Location  Operational_Hours  Production_Quantity  \
0  2024-02-23  Device_6    Chennai                 21                  967   
1  2024-08-11  Device_5      Delhi                 23                  824   
2  2024-10-28  Device_8    Lucknow                 22                  891   
3  2024-01-04  Device_8    Chennai                 20                  970   
4  2024-02-13  Device_6  Hyderabad                 20                 1000   

   Target_Production_Quantity  Downtime  Quality_Issues  Month Month_Name  \
0                        1000         3              11      2   February   
1                        1000         1              14      8     August   
2                        1000         2              15     10    October   
3                        1000         4              10      1    January   
4                        1000         4              18      2   February   

   Availability  Performance   Quality    OEE  
0      0.857143     