In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # Import tqdm for progress bar
from xgboost import XGBClassifier

# Load your dataset
# Assuming df is your DataFrame with features and target columns
df = pd.read_csv(r"D:\Intellipaat\kegal datasets\steel plates dataset\train.csv")

test_data= pd.read_csv(r"D:\Intellipaat\kegal datasets\steel plates dataset\test.csv")
test_data
# Assuming your target columns are: 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Split the data into features (X) and target (y)
X = df.drop(target_columns, axis=1)  # Features
y = df[target_columns]  # Target

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize empty dictionaries to store models for each target column
xgb_models = {}
lgb_models = {}
catboost_models = {}

# Train models for each target column
for target_column in tqdm(target_columns, desc="Training Models"):  # tqdm for progress bar
    # Train XGBoost model
    xgb_model = XGBClassifier()
    xgb_model.fit(X_train, y_train[target_column])
    xgb_models[target_column] = xgb_model
    
    # Train LightGBM model
    lgb_model = LGBMClassifier()
    lgb_model.fit(X_train, y_train[target_column])
    lgb_models[target_column] = lgb_model
    
    # Train CatBoost model
    catboost_model = CatBoostClassifier()
    catboost_model.fit(X_train, y_train[target_column])
    catboost_models[target_column] = catboost_model

# Generate predictions for validation data
xgb_val_predictions = pd.DataFrame()
lgb_val_predictions = pd.DataFrame()
catboost_val_predictions = pd.DataFrame()

for target_column in target_columns:
    xgb_val_predictions[target_column] = xgb_models[target_column].predict_proba(X_val)[:, 1]
    lgb_val_predictions[target_column] = lgb_models[target_column].predict_proba(X_val)[:, 1]
    catboost_val_predictions[target_column] = catboost_models[target_column].predict_proba(X_val)[:, 1]

# Combine predictions from all models for validation data
# Here, you can choose different combination strategies (e.g., averaging probabilities)
# For simplicity, let's average probabilities
combined_val_predictions = (xgb_val_predictions + lgb_val_predictions + catboost_val_predictions) / 3.0

# Example of creating submission file (you'll need to adapt this to your specific competition requirements)
# Assuming you have test data stored in a DataFrame named 'test_data'
# Generate predictions for test data
xgb_test_predictions = pd.DataFrame()
lgb_test_predictions = pd.DataFrame()
catboost_test_predictions = pd.DataFrame()

for target_column in target_columns:
    xgb_test_predictions[target_column] = xgb_models[target_column].predict_proba(test_data)[:, 1]
    lgb_test_predictions[target_column] = lgb_models[target_column].predict_proba(test_data)[:, 1]
    catboost_test_predictions[target_column] = catboost_models[target_column].predict_proba(test_data)[:, 1]

# Combine predictions from all models for test data
# Here, you can choose different combination strategies (e.g., averaging probabilities)
# For simplicity, let's average probabilities
combined_test_predictions = (xgb_test_predictions + lgb_test_predictions + catboost_test_predictions) / 3.0

# Create submission DataFrame
submission_df = pd.DataFrame(combined_test_predictions, columns=target_columns)
# Assuming you have an index or ID column in your test data
submission_df['ID'] = test_data['id']
# Reorder columns to have 'ID' as the first column
submission_df = submission_df[['ID'] + target_columns]

# Save submission file
submission_df.to_csv('submission.csv', index=False)


Training Models:   0%|          | 0/7 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 1165, number of negative: 14210
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075772 -> initscore=-2.501225
[LightGBM] [Info] Start training from score -2.501225
Learning rate set to 0.033091
0:	learn: 0.6455379	total: 183ms	remaining: 3m 3s
1:	learn: 0.6022716	total: 204ms	remaining: 1m 41s
2:	learn: 0.5612604	total: 219ms	remaining: 1m 12s
3:	learn: 0.5260324	total: 230ms	remaining: 57.3s
4:	learn: 0.5009260	total: 241ms	remaining: 48s
5:	learn: 0.4770273	total: 252ms	remaining: 41.8s
6:	learn: 0.4554034	total: 260ms	remaining: 36.9s
7:	learn: 0.4358397	total: 268ms	remaining: 33.3s
8:	learn: 0.4172865	total: 276ms	remaining: 30.4s
9:	learn: 0.3957289	total: 

Training Models:  14%|█▍        | 1/7 [00:09<00:54,  9.12s/it]

995:	learn: 0.1066619	total: 8.36s	remaining: 33.6ms
996:	learn: 0.1066056	total: 8.37s	remaining: 25.2ms
997:	learn: 0.1065120	total: 8.38s	remaining: 16.8ms
998:	learn: 0.1064278	total: 8.38s	remaining: 8.39ms
999:	learn: 0.1063504	total: 8.39s	remaining: 0us
[LightGBM] [Info] Number of positive: 930, number of negative: 14445
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060488 -> initscore=-2.742919
[LightGBM] [Info] Start training from score -2.742919
Learning rate set to 0.033091
0:	learn: 0.6311345	total: 8.8ms	remaining: 8.79s
1:	learn: 0.5893585	total: 16.3ms	remaining: 8.14s
2:	learn: 0.5416309	total: 24.6ms	remaining: 8.18s
3:	learn: 0.5001889	total: 32.4ms	remaining: 8.06s
4:	learn: 0.4

Training Models:  29%|██▊       | 2/7 [00:17<00:43,  8.65s/it]

990:	learn: 0.0449550	total: 7.59s	remaining: 69ms
991:	learn: 0.0449182	total: 7.6s	remaining: 61.3ms
992:	learn: 0.0448959	total: 7.61s	remaining: 53.6ms
993:	learn: 0.0448595	total: 7.61s	remaining: 46ms
994:	learn: 0.0448267	total: 7.64s	remaining: 38.4ms
995:	learn: 0.0447840	total: 7.66s	remaining: 30.8ms
996:	learn: 0.0447576	total: 7.67s	remaining: 23.1ms
997:	learn: 0.0447338	total: 7.68s	remaining: 15.4ms
998:	learn: 0.0446892	total: 7.69s	remaining: 7.7ms
999:	learn: 0.0446640	total: 7.7s	remaining: 0us
[LightGBM] [Info] Number of positive: 2750, number of negative: 12625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.178862 -> initscore=-1.524078
[LightGBM] [Info] Start training from sco

Training Models:  43%|████▎     | 3/7 [00:24<00:32,  8.13s/it]

977:	learn: 0.0540041	total: 6.77s	remaining: 152ms
978:	learn: 0.0539756	total: 6.78s	remaining: 145ms
979:	learn: 0.0538844	total: 6.79s	remaining: 139ms
980:	learn: 0.0537993	total: 6.79s	remaining: 132ms
981:	learn: 0.0537854	total: 6.8s	remaining: 125ms
982:	learn: 0.0537332	total: 6.81s	remaining: 118ms
983:	learn: 0.0536834	total: 6.81s	remaining: 111ms
984:	learn: 0.0536035	total: 6.82s	remaining: 104ms
985:	learn: 0.0535478	total: 6.83s	remaining: 96.9ms
986:	learn: 0.0535008	total: 6.83s	remaining: 90ms
987:	learn: 0.0534715	total: 6.84s	remaining: 83.1ms
988:	learn: 0.0534486	total: 6.85s	remaining: 76.2ms
989:	learn: 0.0533717	total: 6.85s	remaining: 69.2ms
990:	learn: 0.0532825	total: 6.86s	remaining: 62.3ms
991:	learn: 0.0532387	total: 6.87s	remaining: 55.4ms
992:	learn: 0.0532046	total: 6.87s	remaining: 48.5ms
993:	learn: 0.0531950	total: 6.88s	remaining: 41.5ms
994:	learn: 0.0531711	total: 6.89s	remaining: 34.6ms
995:	learn: 0.0531485	total: 6.89s	remaining: 27.7ms
996:

Training Models:  57%|█████▋    | 4/7 [00:32<00:23,  7.91s/it]

993:	learn: 0.0054722	total: 6.98s	remaining: 42.1ms
994:	learn: 0.0054690	total: 6.98s	remaining: 35.1ms
995:	learn: 0.0054662	total: 6.99s	remaining: 28.1ms
996:	learn: 0.0054585	total: 7s	remaining: 21.1ms
997:	learn: 0.0054522	total: 7s	remaining: 14ms
998:	learn: 0.0054333	total: 7.01s	remaining: 7.02ms
999:	learn: 0.0054249	total: 7.02s	remaining: 0us
[LightGBM] [Info] Number of positive: 400, number of negative: 14975
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026016 -> initscore=-3.622673
[LightGBM] [Info] Start training from score -3.622673
Learning rate set to 0.033091
0:	learn: 0.6297463	total: 8.19ms	remaining: 8.18s
1:	learn: 0.5751178	total: 15.9ms	remaining: 7.93s
2:	learn: 0.5253

Training Models:  71%|███████▏  | 5/7 [00:42<00:16,  8.48s/it]

[LightGBM] [Info] Number of positive: 3837, number of negative: 11538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001394 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249561 -> initscore=-1.100955
[LightGBM] [Info] Start training from score -1.100955
Learning rate set to 0.033091
0:	learn: 0.6717927	total: 12.1ms	remaining: 12.1s
1:	learn: 0.6510271	total: 21.8ms	remaining: 10.9s
2:	learn: 0.6327203	total: 32.2ms	remaining: 10.7s
3:	learn: 0.6159106	total: 43.8ms	remaining: 10.9s
4:	learn: 0.6050636	total: 54.5ms	remaining: 10.8s
5:	learn: 0.5922630	total: 65.1ms	remaining: 10.8s
6:	learn: 0.5820080	total: 76.6ms	remaining: 10.9s
7:	learn: 0.5722819	total: 87.7ms	remaining: 10.9s
8:	learn: 0.5636324	total: 98.1ms	remaining: 10.8s
9:	learn: 0.557245

Training Models:  86%|████████▌ | 6/7 [00:53<00:09,  9.35s/it]

985:	learn: 0.3130442	total: 10.2s	remaining: 144ms
986:	learn: 0.3129036	total: 10.2s	remaining: 134ms
987:	learn: 0.3128018	total: 10.2s	remaining: 124ms
988:	learn: 0.3126986	total: 10.2s	remaining: 113ms
989:	learn: 0.3125754	total: 10.2s	remaining: 103ms
990:	learn: 0.3124786	total: 10.2s	remaining: 92.8ms
991:	learn: 0.3123942	total: 10.2s	remaining: 82.5ms
992:	learn: 0.3123248	total: 10.2s	remaining: 72.1ms
993:	learn: 0.3121704	total: 10.2s	remaining: 61.8ms
994:	learn: 0.3120659	total: 10.3s	remaining: 51.5ms
995:	learn: 0.3119313	total: 10.3s	remaining: 41.2ms
996:	learn: 0.3118234	total: 10.3s	remaining: 30.9ms
997:	learn: 0.3116849	total: 10.3s	remaining: 20.6ms
998:	learn: 0.3115442	total: 10.3s	remaining: 10.3ms
999:	learn: 0.3114774	total: 10.3s	remaining: 0us
[LightGBM] [Info] Number of positive: 5209, number of negative: 10166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001883 seconds.
You can set `force_col_wise=true` to re

Training Models: 100%|██████████| 7/7 [01:04<00:00,  9.16s/it]

989:	learn: 0.4535997	total: 10.2s	remaining: 103ms
990:	learn: 0.4534867	total: 10.2s	remaining: 92.3ms
991:	learn: 0.4533899	total: 10.2s	remaining: 82.1ms
992:	learn: 0.4533464	total: 10.2s	remaining: 71.8ms
993:	learn: 0.4532559	total: 10.2s	remaining: 61.6ms
994:	learn: 0.4531379	total: 10.2s	remaining: 51.3ms
995:	learn: 0.4530525	total: 10.2s	remaining: 41.1ms
996:	learn: 0.4529547	total: 10.2s	remaining: 30.8ms
997:	learn: 0.4528454	total: 10.2s	remaining: 20.5ms
998:	learn: 0.4528230	total: 10.3s	remaining: 10.3ms
999:	learn: 0.4527033	total: 10.3s	remaining: 0us





In [2]:
# Display the ensemble submission frame assembled in the training cell.
submission_df

Unnamed: 0,ID,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,0.320968,0.000684,0.000972,0.000008,0.031235,0.115508,0.434884
1,1,0.099013,0.005066,0.004977,0.000020,0.089626,0.115031,0.386345
2,2,0.000519,0.018437,0.016055,0.000328,0.009465,0.236196,0.398557
3,3,0.095985,0.000278,0.000104,0.000275,0.011085,0.449967,0.380043
4,4,0.000826,0.000146,0.000430,0.000193,0.006470,0.602126,0.334234
...,...,...,...,...,...,...,...,...
12809,12809,0.030042,0.095020,0.000492,0.000007,0.003944,0.260297,0.278808
12810,12810,0.101145,0.000724,0.009197,0.001759,0.069925,0.062730,0.353903
12811,12811,0.000097,0.000096,0.919613,0.000009,0.000082,0.000394,0.106074
12812,12812,0.156922,0.001845,0.004833,0.000006,0.028704,0.128839,0.303340


In [3]:
# Display the test set loaded from test.csv (its first column is `id`).
test_data

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,...,0.0095,0.5652,1.0000,1.0,2.8410,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,...,0.0047,0.2414,1.0000,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,...,0.0155,0.6000,0.7500,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,...,0.0037,0.8000,1.0000,1.0,1.9031,0.6990,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,...,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.4170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12809,32028,1101,1116,447943,447992,313,32,37,21603,79,...,0.0126,0.4063,0.9194,1.0,2.4955,1.2305,1.6335,0.7661,-0.3109,0.8894
12810,32029,1289,1306,3149494,3149542,59,9,18,5249,113,...,0.0052,0.7778,1.0000,1.0,1.7708,0.8451,1.2553,0.7222,-0.0448,0.1954
12811,32030,41,210,1587535,1587191,16584,796,522,1858162,24,...,0.1236,0.2199,0.4097,0.0,4.2525,2.2504,2.2672,-0.0629,-0.0801,1.0000
12812,32031,1329,1340,702237,702267,386,43,34,36875,66,...,0.0095,0.2407,1.0000,1.0,2.5866,1.1139,1.5911,0.8461,-0.2629,0.7844


In [6]:
# Inspect the test set's `id` column, the source of the submission's ID values.
test_data['id']

0        19219
1        19220
2        19221
3        19222
4        19223
         ...  
12809    32028
12810    32029
12811    32030
12812    32031
12813    32032
Name: id, Length: 12814, dtype: int64

In [10]:
# Re-display the submission frame to check its ID column.
submission_df

Unnamed: 0,ID,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,0.320968,0.000684,0.000972,0.000008,0.031235,0.115508,0.434884
1,1,0.099013,0.005066,0.004977,0.000020,0.089626,0.115031,0.386345
2,2,0.000519,0.018437,0.016055,0.000328,0.009465,0.236196,0.398557
3,3,0.095985,0.000278,0.000104,0.000275,0.011085,0.449967,0.380043
4,4,0.000826,0.000146,0.000430,0.000193,0.006470,0.602126,0.334234
...,...,...,...,...,...,...,...,...
12809,12809,0.030042,0.095020,0.000492,0.000007,0.003944,0.260297,0.278808
12810,12810,0.101145,0.000724,0.009197,0.001759,0.069925,0.062730,0.353903
12811,12811,0.000097,0.000096,0.919613,0.000009,0.000082,0.000394,0.106074
12812,12812,0.156922,0.001845,0.004833,0.000006,0.028704,0.128839,0.303340


In [11]:
# Overwrite the submission's ID column with the test set's `id` values.
# Assigning a Series aligns on the row index, so both frames must share it.
submission_df['ID'] = test_data['id']

In [12]:
# Display the submission frame after the ID column was re-assigned.
submission_df

Unnamed: 0,ID,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.320968,0.000684,0.000972,0.000008,0.031235,0.115508,0.434884
1,19220,0.099013,0.005066,0.004977,0.000020,0.089626,0.115031,0.386345
2,19221,0.000519,0.018437,0.016055,0.000328,0.009465,0.236196,0.398557
3,19222,0.095985,0.000278,0.000104,0.000275,0.011085,0.449967,0.380043
4,19223,0.000826,0.000146,0.000430,0.000193,0.006470,0.602126,0.334234
...,...,...,...,...,...,...,...,...
12809,32028,0.030042,0.095020,0.000492,0.000007,0.003944,0.260297,0.278808
12810,32029,0.101145,0.000724,0.009197,0.001759,0.069925,0.062730,0.353903
12811,32030,0.000097,0.000096,0.919613,0.000009,0.000082,0.000394,0.106074
12812,32031,0.156922,0.001845,0.004833,0.000006,0.028704,0.128839,0.303340


In [13]:
# Write the submission frame (with the re-assigned ID column) to disk without
# the pandas row index. The file name is spelled out and carries an explicit
# `.csv` extension so the output is recognisable as a CSV file.
submission_df.to_csv('submission_different_logic.csv', index=False)