In [1]:
import pandas as pd
from sklearn.datasets import make_classification

# Build a synthetic, imbalanced binary-classification dataset
# (target convention used below: fault = 1, no fault = 0).
X, y = make_classification(
    # --- size ---
    n_samples=1500,          # number of rows
    n_features=12,           # number of feature columns
    # --- feature composition ---
    n_informative=8,         # features that carry signal for the target
    n_redundant=2,           # features correlated with the informative ones
    n_repeated=0,            # no duplicated features
    # --- class structure ---
    n_classes=2,             # binary target
    weights=[0.7, 0.3],      # roughly 70% class 0 / 30% class 1
    class_sep=1.0,           # separation between the classes
    # --- noise and reproducibility ---
    flip_y=0.01,             # 1% of labels assigned at random
    random_state=42          # fixed seed so the dataset is reproducible
)

In [2]:
# Names given to the 12 synthetic feature columns, in the column order of X
feature_names = [
    'Length',
    'Width',
    'Area',
    'Perimeter',
    'Roughness',
    'TextureVariance',
    'Compactness',
    'Symmetry',
    'Curvature',
    'Elongation',
    'SurfaceDefectDensity',
    'EdgeSharpness',
]
# Assemble the features and the target label into a single frame
df = pd.DataFrame(data=X, columns=feature_names).assign(fault=y)

In [3]:
# Write the dataset to disk without the index column; this is the file that
# preprocess_data() reads by default.
df.to_csv(path_or_buf="steel_fault_dummy.csv", index=False)

# Confirm the export and preview the first rows
print("✅ Dummy dataset 'steel_fault_dummy.csv' created successfully!")
print(df.head())

✅ Dummy dataset 'steel_fault_dummy.csv' created successfully!
     Length     Width      Area  Perimeter  Roughness  TextureVariance  \
0  3.496609 -1.270259  0.529972  -0.384952   2.053614         0.215681   
1  2.951283  2.160676  6.189334  -0.573813   2.349570         1.549859   
2 -2.473218  0.290771 -2.738460  -0.362128  -2.182197        -0.118400   
3 -0.952944 -0.940293 -0.218714  -0.454177  -1.016542         1.327468   
4  0.966662  1.310294  0.417691   0.594426   0.665263        -0.583076   

   Compactness  Symmetry  Curvature  Elongation  SurfaceDefectDensity  \
0     0.939044  2.635382  -0.008898   -0.879493             -1.819071   
1    -0.249781  1.618216   1.718678    0.589494             -2.229935   
2     0.172263  1.475251  -1.980732   -1.227980              1.176121   
3    -1.438674 -1.886388  -1.760103    1.247055              2.005955   
4    -0.460143  0.881564  -2.617222    0.091832              0.537421   

   EdgeSharpness  fault  
0       1.964841      0  
1 

In [4]:
# Display the assembled dataset (feature columns plus the 'fault' target)
df

Unnamed: 0,Length,Width,Area,Perimeter,Roughness,TextureVariance,Compactness,Symmetry,Curvature,Elongation,SurfaceDefectDensity,EdgeSharpness,fault
0,3.496609,-1.270259,0.529972,-0.384952,2.053614,0.215681,0.939044,2.635382,-0.008898,-0.879493,-1.819071,1.964841,0
1,2.951283,2.160676,6.189334,-0.573813,2.349570,1.549859,-0.249781,1.618216,1.718678,0.589494,-2.229935,-3.204863,0
2,-2.473218,0.290771,-2.738460,-0.362128,-2.182197,-0.118400,0.172263,1.475251,-1.980732,-1.227980,1.176121,-1.459065,1
3,-0.952944,-0.940293,-0.218714,-0.454177,-1.016542,1.327468,-1.438674,-1.886388,-1.760103,1.247055,2.005955,0.127508,0
4,0.966662,1.310294,0.417691,0.594426,0.665263,-0.583076,-0.460143,0.881564,-2.617222,0.091832,0.537421,-2.218234,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,-0.762373,3.090205,1.763346,-3.954423,-1.960348,0.063145,1.878302,-0.426908,0.119506,-0.509935,-0.633976,-6.623535,1
1496,-0.189877,0.388921,2.410739,1.386769,0.779555,-0.294010,1.674261,3.138209,-3.771765,0.691574,1.215391,-4.162576,0
1497,1.195753,0.091989,1.145864,-2.497277,2.622898,0.291949,-1.018197,-2.422908,-0.759474,-0.709191,-1.184569,0.495830,0
1498,-3.461225,0.341892,-2.695544,-1.621285,-1.174794,-2.848224,-0.252544,0.204660,-1.047279,0.110917,-0.059892,-2.658756,1


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,Length,Width,Area,Perimeter,Roughness,TextureVariance,Compactness,Symmetry,Curvature,Elongation,SurfaceDefectDensity,EdgeSharpness,fault
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,-0.359339,1.011497,0.0398,0.058382,0.681539,-0.239937,0.042895,0.30188,-1.038317,0.007474,-0.137839,-0.976582,0.302667
std,1.753961,1.354771,1.909942,1.948001,1.794812,1.754039,0.993621,1.854563,1.691539,0.943519,1.864386,3.023882,0.459565
min,-6.526151,-4.950714,-6.060847,-5.873632,-6.139272,-4.938239,-3.199761,-6.179889,-6.395075,-2.930931,-6.496678,-12.46777,0.0
25%,-1.5621,0.174381,-1.265247,-1.285758,-0.495099,-1.440193,-0.633816,-1.001641,-2.18202,-0.643207,-1.418316,-3.104064,0.0
50%,-0.466735,0.993205,0.026769,-0.001868,0.749002,-0.294552,0.014039,0.334183,-1.043439,-0.001831,-0.12918,-0.819999,0.0
75%,0.725304,1.836043,1.360876,1.359489,1.935585,0.918271,0.705438,1.56523,0.062197,0.614442,1.145021,1.235563,1.0
max,5.961125,6.178169,6.189334,6.950465,6.229819,5.72235,3.21602,5.843863,4.445546,3.184307,5.258581,8.667388,1.0


In [6]:
# Order rows lexicographically by every column, left to right
# (sort_values sorts ascending by default).
sort_keys = df.columns.tolist()
df_sorted = df.sort_values(by=sort_keys)
df_sorted

Unnamed: 0,Length,Width,Area,Perimeter,Roughness,TextureVariance,Compactness,Symmetry,Curvature,Elongation,SurfaceDefectDensity,EdgeSharpness,fault
1313,-6.526151,0.876442,0.375599,-0.690215,0.768972,-1.322922,0.292524,4.624033,1.161390,0.352741,-2.384572,-3.332713,1
1165,-5.596862,0.243703,0.950968,2.078631,1.787451,-1.469545,-0.990644,4.516578,2.450933,1.172632,-2.313345,-0.764679,1
492,-5.322333,0.308094,-0.527440,2.004445,3.149643,-2.472514,-0.533031,5.437559,2.451820,-1.482237,-3.651384,0.660824,1
1042,-5.163960,0.121954,2.786858,-1.114556,-0.363595,-2.377810,1.053695,3.867880,-0.935010,-0.946018,-0.408585,-7.446492,1
515,-5.063999,0.305127,5.014155,3.175720,-1.259581,-1.280789,0.023086,4.197229,-2.724976,-0.416132,2.766757,-8.667598,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,4.927992,0.603110,1.922770,-0.700083,-0.924514,2.306646,-0.965369,1.651673,1.712724,-0.375666,-0.915850,0.013784,0
1202,5.030796,1.820657,1.700101,-0.397142,0.481316,1.985258,0.240477,1.689475,0.998704,-0.596426,-1.531882,0.058637,0
29,5.374255,-0.705854,0.518006,-0.172516,0.201273,1.780889,-0.186384,3.438349,0.758954,1.259198,-1.412039,1.995660,0
440,5.500725,1.867220,6.102638,2.836561,-0.128912,3.902424,-1.244181,1.632359,-0.566133,-0.287560,1.497477,-2.395470,0


In [7]:
# Class balance: number of rows for each value of the target column
fault_counts = df["fault"].value_counts()
print(fault_counts)

fault
0    1046
1     454
Name: count, dtype: int64


In [8]:
# preprocessing.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

def preprocess_data(path="steel_fault_dummy.csv", scaler_path=None):
    """Load the fault dataset and return a model-ready train/test split.

    Pipeline:
      1. Read the CSV at ``path`` and separate the ``fault`` target column.
      2. Stratified 80/20 train/test split (``random_state=42``).
      3. Fit a ``RobustScaler`` on the training features only and apply it
         to both splits, so no test-set statistics leak into training.
      4. Oversample the scaled training split with SMOTE
         (``random_state=42``); the test split is left untouched.

    Parameters
    ----------
    path : str
        CSV file containing the feature columns and a ``fault`` column.
    scaler_path : str or None, optional
        When given, the fitted scaler is saved to this path with joblib so
        inference code can apply exactly the transform the model was
        trained with. The default ``None`` saves nothing.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test_scaled, y_train_resampled, y_test)``.
    """
    from sklearn.preprocessing import RobustScaler

    df = pd.read_csv(path)
    X = df.drop("fault", axis=1)
    y = df["fault"]

    # Train-test split, stratified so both splits keep the class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Scale using statistics learned from the training split only
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Optionally persist the scaler; re-fitting one later on already-scaled
    # data would not reproduce this transform.
    if scaler_path is not None:
        import joblib
        joblib.dump(scaler, scaler_path)

    # SMOTE: balance the classes in the training split only
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

    return X_train_resampled, X_test_scaled, y_train_resampled, y_test


In [9]:
# model_training.py
from xgboost import XGBClassifier
import joblib

# Fetch the scaled, SMOTE-balanced training split and the scaled test split
X_train, X_test, y_train, y_test = preprocess_data()

# Train the classifier (fit() returns the estimator itself) and persist it
xgb = XGBClassifier(random_state=42, eval_metric="logloss").fit(X_train, y_train)
joblib.dump(xgb, "xgb_model.pkl")

print("✅ Models trained and saved successfully.")



✅ Models trained and saved successfully.


In [10]:
# Mean accuracy of the trained classifier on the held-out test split
xgb.score(X_test,y_test)

0.9233333333333333

In [11]:
# evaluation.py

import joblib
from sklearn.metrics import classification_report, confusion_matrix

# Rebuild the same split used for training
X_train, X_test, y_train, y_test = preprocess_data()

# Saved models to evaluate, keyed by display name
models = {"XGBoost": joblib.load("xgb_model.pkl")}

for name, model in models.items():
    y_pred = model.predict(X_test)
    # Confusion matrix first, then per-class precision / recall / F1
    reports = (
        confusion_matrix(y_test, y_pred),
        classification_report(y_test, y_pred, digits=4),
    )
    for report in reports:
        print(report)


[[197  12]
 [ 11  80]]
              precision    recall  f1-score   support

           0     0.9471    0.9426    0.9448       209
           1     0.8696    0.8791    0.8743        91

    accuracy                         0.9233       300
   macro avg     0.9083    0.9109    0.9096       300
weighted avg     0.9236    0.9233    0.9235       300



In [12]:
# Fit and persist the feature scaler used at inference time.
#
# At this point `X_train` holds the output of preprocess_data(), i.e. data
# that is ALREADY RobustScaler-transformed and SMOTE-resampled. Fitting a new
# scaler on it would save a transform different from the one the model was
# trained with, so raw inputs would be scaled incorrectly at prediction time.
# Rebuild the raw training split with the same split parameters as
# preprocess_data() and fit on that instead.
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

raw_df = pd.read_csv("steel_fault_dummy.csv")
X_raw = raw_df.drop("fault", axis=1)
y_raw = raw_df["fault"]

# Same arguments as the split inside preprocess_data() -> identical training rows
X_raw_train = train_test_split(
    X_raw, y_raw, test_size=0.2, stratify=y_raw, random_state=42
)[0]

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_raw_train)
joblib.dump(scaler, "scaler.pkl")  # Save this

['scaler.pkl']

In [13]:
import os
# Show the directory the kernel runs in; the relative artifact paths used in
# this notebook resolve against it.
working_dir = os.getcwd()
print("Current Working Directory:", working_dir)


Current Working Directory: C:\Users\MY  PC\Documents\project


In [14]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import joblib
import pandas as pd

# Load the persisted model and scaler
model = joblib.load("xgb_model.pkl")
scaler = joblib.load("scaler.pkl")

# Single sample to classify, keyed by feature name in training column order
input_data = {
    "Length": -2.47,
    "Width": 0.29,
    "Area": -2.37,
    "Perimeter": -0.362128,
    "Roughness": -2.18,
    "TextureVariance": -0.20,
    "Compactness": 0.172263,
    "Symmetry": 1.47,
    "Curvature": -1.98,
    "Elongation": -1.22,
    "SurfaceDefectDensity": 1.17,
    "EdgeSharpness": -1.45
}

# One-row frame for the sample. It gets its own name: rebinding `df` here
# would replace the full dataset, so re-running an earlier cell (e.g. the CSV
# export) would silently operate on this single row.
input_df = pd.DataFrame([input_data])

# Apply the saved scaling to the raw input
scaled_input = scaler.transform(input_df)

# Decision threshold on the predicted fault probability (tunable)
FAULT_THRESHOLD = 0.4

# Predict
proba = model.predict_proba(scaled_input)[0][1]  # probability of class 1 (fault)
print(f"⚠️ Probability of Fault: {proba:.2f}")
prediction = int(proba > FAULT_THRESHOLD)

# Output result
print("🔍 Prediction:", "Fault Detected" if prediction == 1 else "No Fault")


⚠️ Probability of Fault: 1.00
🔍 Prediction: Fault Detected


In [18]:
import joblib
import pandas as pd

# Load the persisted model and scaler
model = joblib.load("xgb_model.pkl")
scaler = joblib.load("scaler.pkl")

# Sample to classify. For reference, dataset row 515 (fault = 1) is:
# -5.063999  0.305127  5.014155  3.175720  -1.259581  -1.280789
#  0.023086  4.197229  -2.724976  -0.416132  2.766757  -8.667598
test_inputs = {
    "Length": -5.3, "Width": 0.8, "Area": 5.07, "Perimeter": 4.2,
    "Roughness": -2.00, "TextureVariance": -4, "Compactness": 0.03,
    "Symmetry": 5.12, "Curvature": -3.11, "Elongation": 0.41,
    "SurfaceDefectDensity": 2.76, "EdgeSharpness": -9.11
}

# One-row frame for the sample; kept under its own name so the dataset `df`
# is not overwritten.
input_df = pd.DataFrame([test_inputs])

# Apply the saved scaling to the raw input
scaled_input = scaler.transform(input_df)

# Decision threshold on the predicted fault probability (tunable)
FAULT_THRESHOLD = 0.4

# Predict
proba = model.predict_proba(scaled_input)[0][1]  # probability of class 1 (fault)

# Derive the label from THIS sample's probability. Without this assignment the
# print below reports whatever `prediction` an earlier cell left in the kernel.
prediction = int(proba > FAULT_THRESHOLD)

# Output result
print("🔍 Prediction:", "Fault Detected" if prediction == 1 else "No Fault")


🔍 Prediction: Fault Detected
