# Semiconductor Manufacturing Process

### Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

### Load Dataset

In [7]:
df = pd.read_csv("sensor-data.csv")
print(df.info())
print(df.describe())

# Print column names to verify the target column
print("Column names:", df.columns)

# Strip any extra spaces from column names
df.columns = df.columns.str.strip()

# Drop 'Time' column if present
if "Time" in df.columns:
    df = df.drop(columns=["Time"])

# Identify the correct target column
target_col = "Pass/Fail"
if target_col not in df.columns:
    raise KeyError(f"Target column '{target_col}' not found! Check column names: {df.columns}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB
None
                 0            1            2            3            4  \
count  1561.000000  1560.000000  1553.000000  1553.000000  1553.000000   
mean   3014.452896  2495.850231  2200.547318  1396.376627     4.197013   
std      73.621787    80.407705    29.513152   441.691640    56.355540   
min    2743.240000  2158.750000  2060.660000     0.000000     0.681500   
25%    2966.260000  2452.247500  2181.044400  1081.875800     1.017700   
50%    3011.490000  2499.405000  2201.066700  1285.214400     1.316800   
75%    3056.650000  2538.822500  2218.055500  1591.223500     1.525700   
max    3356.350000  2846.440000  2315.266700  3715.041700  1114.536600   

            5            6            7            8            9  ...  \
count  1553.0  1553.000000  1558.000000  1565.000000  1565.000000  ...   
me

### Missing Value Treatment

In [8]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

# Handle missing values 
df = df.dropna(axis=1) 

0       6
1       7
2      14
3      14
4      14
       ..
585     1
586     1
587     1
588     1
589     1
Length: 538, dtype: int64


### Data Preprocessing

In [10]:
# Separate features and target
y = df[target_col]  # Target column
X = df.drop(columns=[target_col])  

# Balance the target variable using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

### Model training and testing 

In [None]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name} Model Performance:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Hyperparameter tuning for the best model (example: Random Forest)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("\nBest Parameters for Random Forest:", grid_search.best_params_)

# Final model training with best parameters
best_model = RandomForestClassifier(**grid_search.best_params_)
best_model.fit(X_train, y_train)

# Save the model
joblib.dump(best_model, "best_model.pkl")

print("\nFinal model saved as best_model.pkl")


Random Forest Model Performance:
              precision    recall  f1-score   support

          -1       0.99      0.94      0.96       306
           1       0.94      0.99      0.96       280

    accuracy                           0.96       586
   macro avg       0.96      0.97      0.96       586
weighted avg       0.97      0.96      0.96       586

Accuracy: 0.9641638225255973

SVM Model Performance:
              precision    recall  f1-score   support

          -1       0.99      0.75      0.86       306
           1       0.79      0.99      0.88       280

    accuracy                           0.87       586
   macro avg       0.89      0.87      0.87       586
weighted avg       0.89      0.87      0.87       586

Accuracy: 0.8668941979522184

Naive Bayes Model Performance:
              precision    recall  f1-score   support

          -1       0.75      0.13      0.23       306
           1       0.50      0.95      0.66       280

    accuracy                      

### Conclusion

#####   
The implementation successfully preprocessed the semiconductor sensor data by handling missing values, balancing the dataset using SMOTE, and applying feature scaling. Three machine learning models—Random Forest, SVM, and Naive Bayes—were trained and evaluated, with performance measured using classification reports and accuracy scores. Hyperparameter tuning was conducted on the Random Forest model, identifying the best parameters for improved classification accuracy. The final optimized model was saved for future predictions, ensuring a robust approach for semiconductor yield classification.