In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [8]:
# Step 1: Load Data
try:
    df = pd.read_csv("Part- 123 - Signal.csv")
    print("Dataset Shape:", df.shape)
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")
    exit()

# Step 2: Data Cleaning
# Drop columns with >20% missing values
df.dropna(axis=1, thresh=0.8 * len(df), inplace=True)
# Drop duplicate rows
df.drop_duplicates(inplace=True)
# Remove irrelevant columns (Assume first column is timestamp)
df.drop(columns=[df.columns[0]], inplace=True)  # Adjust if needed


Dataset Shape: (1599, 12)


In [9]:
# Step 3 : Preprocessing
X = df.drop(columns=['Parameter 2']) # Predictors
y = df['Signal_Strength'] # Target variable
# Handle class imbalance using SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
#Step 4 : Model Training And Hyper Training 
models = {
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(),
    "NaiveBayes": GaussianNB()
}

params = {
    "RandomForest": {"n_estimators": [100, 200], "max_depth": [10, 30, None]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "NaiveBayes": {}
}

best_models = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    if params[model_name]:
        grid = GridSearchCV(model, params[model_name], cv=5, n_jobs=-1, scoring='accuracy')
        grid.fit(X_train, y_train)
        best_models[model_name] = grid.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_models[model_name] = model

Training RandomForest...
Training SVM...
Training NaiveBayes...


In [11]:
# Step 5: Model Evaluation
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


RandomForest Classification Report:
              precision    recall  f1-score   support

           3       1.00      1.00      1.00       122
           4       1.00      1.00      1.00        98
           5       1.00      1.00      1.00       128
           6       1.00      1.00      1.00       115
           7       1.00      1.00      1.00       120
           8       1.00      1.00      1.00       110

    accuracy                           1.00       693
   macro avg       1.00      1.00      1.00       693
weighted avg       1.00      1.00      1.00       693

Accuracy: 1.0000

SVM Classification Report:
              precision    recall  f1-score   support

           3       1.00      1.00      1.00       122
           4       1.00      1.00      1.00        98
           5       1.00      1.00      1.00       128
           6       1.00      1.00      1.00       115
           7       1.00      1.00      1.00       120
           8       1.00      1.00      1.00       

In [12]:
# Step 6: Save the Best Model
best_model = max(best_models.items(), key=lambda x: accuracy_score(y_test, x[1].predict(X_test)))[1]
joblib.dump(best_model, "best_semiconductor_model.pkl")
print("Best Model Saved!")


Best Model Saved!
