In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 2: Load Dataset
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where this exact file exists. Consider a path relative to the notebook.
file_path = r"C:\Users\SD LAB\Documents\nivetha\pcb_cycle_dataset_core.csv"
df = pd.read_csv(file_path)
print("Dataset Loaded Successfully!\n")
print(df.head())

# Step 3: Inspect Dataset
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())  # per-column count of missing cells
print("\nDuplicates:", df.duplicated().sum())  # rows that repeat an earlier row

# Step 4: Drop duplicates (if any)
# drop_duplicates() returns a new frame; rebinding `df` discards repeated rows.
df = df.drop_duplicates()

# Step 5: Encode Categorical Variables
# One-hot encode the two categorical columns. With drop_first=True each
# k-level column yields k-1 indicator columns (the first level is omitted).
df_encoded = pd.get_dummies(
    data=df,
    columns=['machine_type', 'shift'],
    drop_first=True,
)
print("\nEncoded Dataset:", df_encoded.head(), sep="\n")

# Step 6: Split Dataset into Features and Target
# The target column is separated before any scaling so it is never transformed.
X = df_encoded.drop('cycle_time', axis=1)
y = df_encoded['cycle_time']

# Step 7: Train/Test Split
# Split BEFORE fitting the scaler: fitting on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage) and make the test evaluation optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below modify only the
# split frames and do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Step 8: Feature Scaling (optional for tree models, necessary for Linear Regression/NN)
# The scaler learns its statistics from the training rows only; the test rows
# are transformed with those same statistics.
# NOTE: `df_encoded` and `X` are left unscaled; only X_train / X_test are scaled.
numeric_features = ['num_components', 'board_layers', 'component_density', 'operator_experience']
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

print("\nDataset ready for ML!")
print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])


Dataset Loaded Successfully!

   num_components  board_layers  component_density machine_type  \
0             152             6               0.87            A   
1             229             4               4.44            A   
2             142             6               2.98            B   
3              64             6               1.24            C   
4             156             4               2.35            B   

   operator_experience  shift  cycle_time  
0                    7    Day        85.4  
1                    1    Day       121.8  
2                    5    Day        76.4  
3                    8  Night        48.4  
4                    4    Day        80.8  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   num_components       500 non-null    int64  
 1   board_layers         500 non-