In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tqdm import tqdm  # Import tqdm for progress bar

# Load your dataset
# Assuming df is your DataFrame with features and target columns
df = pd.read_csv(r"D:\Intellipaat\kegal datasets\steel plates dataset\train.csv")

test_data= pd.read_csv(r"D:\Intellipaat\kegal datasets\steel plates dataset\test.csv")
test_data
# Assuming your target columns are: 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Split the data into features (X) and target (y)
X = df.drop(target_columns, axis=1)  # Features
y = df[target_columns]  # Target

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize empty dictionaries to store models for each target column
xgb_models = {}
lgb_models = {}
catboost_models = {}

# Train models for each target column
for target_column in tqdm(target_columns, desc="Training Models"):  # tqdm for progress bar
    # Train XGBoost model
    xgb_model = XGBClassifier()
    xgb_model.fit(X_train, y_train[target_column])
    xgb_models[target_column] = xgb_model
    
    # Train LightGBM model
    lgb_model = LGBMClassifier()
    lgb_model.fit(X_train, y_train[target_column])
    lgb_models[target_column] = lgb_model
    
    # Train CatBoost model
    catboost_model = CatBoostClassifier()
    catboost_model.fit(X_train, y_train[target_column])
    catboost_models[target_column] = catboost_model

# Generate predictions for validation data
xgb_val_predictions = pd.DataFrame()
lgb_val_predictions = pd.DataFrame()
catboost_val_predictions = pd.DataFrame()

for target_column in target_columns:
    xgb_val_predictions[target_column] = xgb_models[target_column].predict_proba(X_val)[:, 1]
    lgb_val_predictions[target_column] = lgb_models[target_column].predict_proba(X_val)[:, 1]
    catboost_val_predictions[target_column] = catboost_models[target_column].predict_proba(X_val)[:, 1]

# Combine predictions from all models for validation data
# Here, you can choose different combination strategies (e.g., averaging probabilities)
# For simplicity, let's average probabilities
combined_val_predictions = (xgb_val_predictions + lgb_val_predictions + catboost_val_predictions) / 3.0

# Example of creating submission file (you'll need to adapt this to your specific competition requirements)
# Assuming you have test data stored in a DataFrame named 'test_data'
# Generate predictions for test data
xgb_test_predictions = pd.DataFrame()
lgb_test_predictions = pd.DataFrame()
catboost_test_predictions = pd.DataFrame()

for target_column in target_columns:
    xgb_test_predictions[target_column] = xgb_models[target_column].predict_proba(test_data)[:, 1]
    lgb_test_predictions[target_column] = lgb_models[target_column].predict_proba(test_data)[:, 1]
    catboost_test_predictions[target_column] = catboost_models[target_column].predict_proba(test_data)[:, 1]

# Combine predictions from all models for test data
# Here, you can choose different combination strategies (e.g., averaging probabilities)
# For simplicity, let's average probabilities
combined_test_predictions = (xgb_test_predictions + lgb_test_predictions + catboost_test_predictions) / 3.0

# Create submission DataFrame
submission_df = pd.DataFrame(combined_test_predictions, columns=target_columns)
# Assuming you have an index or ID column in your test data
submission_df['ID'] = test_data['id']
# Reorder columns to have 'ID' as the first column
submission_df = submission_df[['ID'] + target_columns]



Training Models:   0%|          | 0/7 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 1165, number of negative: 14210
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.075772 -> initscore=-2.501225
[LightGBM] [Info] Start training from score -2.501225
Learning rate set to 0.033091
0:	learn: 0.6455379	total: 157ms	remaining: 2m 36s
1:	learn: 0.6022716	total: 192ms	remaining: 1m 35s
2:	learn: 0.5612604	total: 222ms	remaining: 1m 13s
3:	learn: 0.5260324	total: 239ms	remaining: 59.5s
4:	learn: 0.5009260	total: 254ms	remaining: 50.5s
5:	learn: 0.4770273	total: 268ms	remaining: 44.3s
6:	learn: 0.4554034	total: 278ms	remaining: 39.4s
7:	learn: 0.4358397	total: 288ms	remaining: 35.7s
8:	learn: 0.4172865	total: 299ms	remaining: 32.9s
9:	learn: 0.3957289	tota

Training Models:  14%|█▍        | 1/7 [00:11<01:10, 11.81s/it]

992:	learn: 0.1068663	total: 10.1s	remaining: 71ms
993:	learn: 0.1067752	total: 10.1s	remaining: 60.9ms
994:	learn: 0.1067392	total: 10.1s	remaining: 50.7ms
995:	learn: 0.1066619	total: 10.1s	remaining: 40.6ms
996:	learn: 0.1066056	total: 10.1s	remaining: 30.4ms
997:	learn: 0.1065120	total: 10.1s	remaining: 20.3ms
998:	learn: 0.1064278	total: 10.1s	remaining: 10.1ms
999:	learn: 0.1063504	total: 10.1s	remaining: 0us
[LightGBM] [Info] Number of positive: 930, number of negative: 14445
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060488 -> initscore=-2.742919
[LightGBM] [Info] Start training from score -2.742919
Learning rate set to 0.033091
0:	learn: 0.6311345	total: 10.6ms	remaining: 10.6s
1:	learn

Training Models:  29%|██▊       | 2/7 [00:22<00:55, 11.07s/it]

992:	learn: 0.0448959	total: 9.73s	remaining: 68.6ms
993:	learn: 0.0448595	total: 9.74s	remaining: 58.8ms
994:	learn: 0.0448267	total: 9.76s	remaining: 49ms
995:	learn: 0.0447840	total: 9.77s	remaining: 39.2ms
996:	learn: 0.0447576	total: 9.78s	remaining: 29.4ms
997:	learn: 0.0447338	total: 9.79s	remaining: 19.6ms
998:	learn: 0.0446892	total: 9.8s	remaining: 9.81ms
999:	learn: 0.0446640	total: 9.82s	remaining: 0us
[LightGBM] [Info] Number of positive: 2750, number of negative: 12625
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.178862 -> initscore=-1.524078
[LightGBM] [Info] Start training from score -1.524078
Learning rate set to 0.033091
0:	learn: 0.6245499	total: 14.8ms	remaining: 14.8s
1:	learn

Training Models:  43%|████▎     | 3/7 [00:35<00:48, 12.24s/it]

996:	learn: 0.0531191	total: 12.6s	remaining: 37.9ms
997:	learn: 0.0530956	total: 12.6s	remaining: 25.3ms
998:	learn: 0.0530732	total: 12.6s	remaining: 12.6ms
999:	learn: 0.0530658	total: 12.6s	remaining: 0us
[LightGBM] [Info] Number of positive: 468, number of negative: 14907
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030439 -> initscore=-3.461118
[LightGBM] [Info] Start training from score -3.461118
Learning rate set to 0.033091
0:	learn: 0.5909414	total: 14.2ms	remaining: 14.2s
1:	learn: 0.5267493	total: 27.8ms	remaining: 13.9s
2:	learn: 0.4635080	total: 41.6ms	remaining: 13.8s
3:	learn: 0.4143571	total: 54.5ms	remaining: 13.6s
4:	learn: 0.3693796	total: 69.6ms	remaining: 13.9s
5:	learn: 0.32

Training Models:  57%|█████▋    | 4/7 [00:49<00:37, 12.65s/it]

[LightGBM] [Info] Number of positive: 400, number of negative: 14975
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.026016 -> initscore=-3.622673
[LightGBM] [Info] Start training from score -3.622673
Learning rate set to 0.033091
0:	learn: 0.6297463	total: 16.2ms	remaining: 16.2s
1:	learn: 0.5751178	total: 29.4ms	remaining: 14.7s
2:	learn: 0.5253052	total: 43ms	remaining: 14.3s
3:	learn: 0.4808474	total: 56.9ms	remaining: 14.2s
4:	learn: 0.4418074	total: 70.2ms	remaining: 14s
5:	learn: 0.4046518	total: 82.5ms	remaining: 13.7s
6:	learn: 0.3756643	total: 94.9ms	remaining: 13.5s
7:	learn: 0.3487499	total: 108ms	remaining: 13.4s
8:	learn: 0.3244015	total: 119ms	remaining: 13.1s
9:	learn: 0.2997835	total

Training Models:  71%|███████▏  | 5/7 [01:02<00:25, 12.85s/it]

998:	learn: 0.0337672	total: 12.2s	remaining: 12.2ms
999:	learn: 0.0337372	total: 12.2s	remaining: 0us
[LightGBM] [Info] Number of positive: 3837, number of negative: 11538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249561 -> initscore=-1.100955
[LightGBM] [Info] Start training from score -1.100955
Learning rate set to 0.033091
0:	learn: 0.6717927	total: 15.8ms	remaining: 15.8s
1:	learn: 0.6510271	total: 30.3ms	remaining: 15.1s
2:	learn: 0.6327203	total: 44.6ms	remaining: 14.8s
3:	learn: 0.6159106	total: 57.7ms	remaining: 14.4s
4:	learn: 0.6050636	total: 70.3ms	remaining: 14s
5:	learn: 0.5922630	total: 82.3ms	remaining: 13.6s
6:	learn: 0.5820080	total: 94.8ms	remaining: 13.4s
7:	learn: 0.5722819

Training Models:  86%|████████▌ | 6/7 [01:15<00:13, 13.05s/it]

990:	learn: 0.3124786	total: 12.2s	remaining: 111ms
991:	learn: 0.3123942	total: 12.3s	remaining: 98.8ms
992:	learn: 0.3123248	total: 12.3s	remaining: 86.5ms
993:	learn: 0.3121704	total: 12.3s	remaining: 74.1ms
994:	learn: 0.3120659	total: 12.3s	remaining: 61.8ms
995:	learn: 0.3119313	total: 12.3s	remaining: 49.4ms
996:	learn: 0.3118234	total: 12.3s	remaining: 37.1ms
997:	learn: 0.3116849	total: 12.3s	remaining: 24.7ms
998:	learn: 0.3115442	total: 12.3s	remaining: 12.3ms
999:	learn: 0.3114774	total: 12.3s	remaining: 0us
[LightGBM] [Info] Number of positive: 5209, number of negative: 10166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5449
[LightGBM] [Info] Number of data points in the train set: 15375, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.338797 -> initscore=-0.668661
[LightGBM] [Info] Start training fr

Training Models: 100%|██████████| 7/7 [01:29<00:00, 12.76s/it]


In [2]:

# Save submission file
submission_df.to_csv('submission.csv', index=False)

In [3]:
submission_df

Unnamed: 0,ID,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.320968,0.000684,0.000972,0.000008,0.031235,0.115508,0.434884
1,19220,0.099013,0.005066,0.004977,0.000020,0.089626,0.115031,0.386345
2,19221,0.000519,0.018437,0.016055,0.000328,0.009465,0.236196,0.398557
3,19222,0.095985,0.000278,0.000104,0.000275,0.011085,0.449967,0.380043
4,19223,0.000826,0.000146,0.000430,0.000193,0.006470,0.602126,0.334234
...,...,...,...,...,...,...,...,...
12809,32028,0.030042,0.095020,0.000492,0.000007,0.003944,0.260297,0.278808
12810,32029,0.101145,0.000724,0.009197,0.001759,0.069925,0.062730,0.353903
12811,32030,0.000097,0.000096,0.919613,0.000009,0.000082,0.000394,0.106074
12812,32031,0.156922,0.001845,0.004833,0.000006,0.028704,0.128839,0.303340


In [2]:
submission_df

Unnamed: 0,ID,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,0.320968,0.000684,0.000972,0.000008,0.031235,0.115508,0.434884
1,1,0.099013,0.005066,0.004977,0.000020,0.089626,0.115031,0.386345
2,2,0.000519,0.018437,0.016055,0.000328,0.009465,0.236196,0.398557
3,3,0.095985,0.000278,0.000104,0.000275,0.011085,0.449967,0.380043
4,4,0.000826,0.000146,0.000430,0.000193,0.006470,0.602126,0.334234
...,...,...,...,...,...,...,...,...
12809,12809,0.030042,0.095020,0.000492,0.000007,0.003944,0.260297,0.278808
12810,12810,0.101145,0.000724,0.009197,0.001759,0.069925,0.062730,0.353903
12811,12811,0.000097,0.000096,0.919613,0.000009,0.000082,0.000394,0.106074
12812,12812,0.156922,0.001845,0.004833,0.000006,0.028704,0.128839,0.303340


In [3]:
test_data

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,...,0.0095,0.5652,1.0000,1.0,2.8410,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,...,0.0047,0.2414,1.0000,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,...,0.0155,0.6000,0.7500,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,...,0.0037,0.8000,1.0000,1.0,1.9031,0.6990,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,...,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.4170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12809,32028,1101,1116,447943,447992,313,32,37,21603,79,...,0.0126,0.4063,0.9194,1.0,2.4955,1.2305,1.6335,0.7661,-0.3109,0.8894
12810,32029,1289,1306,3149494,3149542,59,9,18,5249,113,...,0.0052,0.7778,1.0000,1.0,1.7708,0.8451,1.2553,0.7222,-0.0448,0.1954
12811,32030,41,210,1587535,1587191,16584,796,522,1858162,24,...,0.1236,0.2199,0.4097,0.0,4.2525,2.2504,2.2672,-0.0629,-0.0801,1.0000
12812,32031,1329,1340,702237,702267,386,43,34,36875,66,...,0.0095,0.2407,1.0000,1.0,2.5866,1.1139,1.5911,0.8461,-0.2629,0.7844


In [6]:
test_data['id']

0        19219
1        19220
2        19221
3        19222
4        19223
         ...  
12809    32028
12810    32029
12811    32030
12812    32031
12813    32032
Name: id, Length: 12814, dtype: int64

In [10]:
submission_df

Unnamed: 0,ID,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,0.320968,0.000684,0.000972,0.000008,0.031235,0.115508,0.434884
1,1,0.099013,0.005066,0.004977,0.000020,0.089626,0.115031,0.386345
2,2,0.000519,0.018437,0.016055,0.000328,0.009465,0.236196,0.398557
3,3,0.095985,0.000278,0.000104,0.000275,0.011085,0.449967,0.380043
4,4,0.000826,0.000146,0.000430,0.000193,0.006470,0.602126,0.334234
...,...,...,...,...,...,...,...,...
12809,12809,0.030042,0.095020,0.000492,0.000007,0.003944,0.260297,0.278808
12810,12810,0.101145,0.000724,0.009197,0.001759,0.069925,0.062730,0.353903
12811,12811,0.000097,0.000096,0.919613,0.000009,0.000082,0.000394,0.106074
12812,12812,0.156922,0.001845,0.004833,0.000006,0.028704,0.128839,0.303340


In [11]:
submission_df['ID'] = test_data['id']

In [2]:
submission_df

NameError: name 'submission_df' is not defined

In [1]:
submission_df.to_csv('subbmission in differnt logic.csv', index=False)

NameError: name 'submission_df' is not defined