In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score


## Download the data

In [2]:
!kaggle competitions download -c playground-series-s3e17

^C


playground-series-s3e17.zip: Skipping, found more recently modified local copy (use --force to force download)


## Load data

In [111]:
data=pd.read_csv('train.csv')

In [112]:
data.shape

(136429, 14)

In [113]:
original_data=pd.read_csv('predictive_maintenance.csv')

In [114]:
data['Machine failure'].value_counts()

0    134281
1      2148
Name: Machine failure, dtype: int64

In [115]:
original_data.head(5)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [116]:
original_data['Target'].value_counts()

0    9661
1     339
Name: Target, dtype: int64

In [117]:
original_data['Failure Type'].value_counts()

No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: Failure Type, dtype: int64

In [118]:
# for each of the failure types, we will create a new column and assign 1 if the failure type is present and 0 if not

original_data['HDF']=np.where(original_data['Failure Type']=='Heat Dissipation Failure',1,0)
original_data['PWF']=np.where(original_data['Failure Type']=='Power Failure',1,0)
original_data['TWF']=np.where(original_data['Failure Type']=='Tool Wear Failure',1,0)
original_data['RNF']=np.where(original_data['Failure Type']=='Random Failures',1,0)
original_data['OSF']=np.where(original_data['Failure Type']=='Overstrain Failure',1,0)

## rename UDI to id
original_data.rename(columns={'UDI':'id'},inplace=True)
# rename target to machine failure
original_data.rename(columns={'Target':'Machine failure'},inplace=True)
# drop failure type
original_data.drop('Failure Type',axis=1,inplace=True)

In [119]:
original_data.head(5)

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,HDF,PWF,TWF,RNF,OSF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [120]:
data=pd.concat([data,original_data],axis = 0).reset_index(drop=True)

In [121]:
data.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [122]:
data.isna().sum()

id                         0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [123]:
data.shape

(146429, 14)

In [124]:
# from ydata_profiling import ProfileReport

# profile = ProfileReport(data, title='Pandas Profiling Report', html={'style':{'full_width':True}})
# profile.to_notebook_iframe()

In [125]:
data.head(5)

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [126]:
data=data.reset_index(drop=True).set_index(['id'])

In [127]:
data['Product ID'].value_counts()

L53257    140
L53271    132
L49056    132
L48892    121
L54275    121
         ... 
H31308      1
L48159      1
M15753      1
M14860      1
M24818      1
Name: Product ID, Length: 10000, dtype: int64

In [128]:
data.drop_duplicates(inplace=True)

In [129]:
data.duplicated().sum()

0

In [130]:
len(data)

145293

In [131]:
# data=data.drop('Product ID',axis=1)

In [132]:
data

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [133]:
len(data)

145293

In [134]:
data['Machine failure'].value_counts()

0    142827
1      2466
Name: Machine failure, dtype: int64

In [135]:
# len(pd.get_dummies(data).columns)

In [136]:
# ## Feature Importance

# from sklearn.ensemble import RandomForestClassifier 

# X_feat = data.drop(['Machine failure'],axis=1)
# y_feat = data['Machine failure']

# X_feat=pd.get_dummies(X_feat)
# model = RandomForestClassifier(n_jobs=-1)
# model.fit(X_feat,y_feat)

# feature_importances = pd.DataFrame(model.feature_importances_,
#                                       index = X_feat.columns,
#                                         columns=['importance']).sort_values('importance',ascending=False)
    

In [137]:
# feature_importances

In [138]:
# check the value of Machine failure where HDF is 1

data[data['HDF']==1]['Machine failure'].value_counts()

1    807
0      5
Name: Machine failure, dtype: int64

In [139]:
# check the value of Machine failure where HDF is 0

data[data['HDF']==0]['Machine failure'].value_counts()

0    142822
1      1659
Name: Machine failure, dtype: int64

In [140]:
# check the value of Machine failure where OSF is 1

data[data['OSF']==1]['Machine failure'].value_counts()

1    608
0      3
Name: Machine failure, dtype: int64

In [141]:
# check the value of Machine failure where OSF is 0

data[data['OSF']==0]['Machine failure'].value_counts()

0    142824
1      1858
Name: Machine failure, dtype: int64

In [142]:
# check the value of Machine failure where PWF is 1

data[data['PWF']==1]['Machine failure'].value_counts()

1    411
0      3
Name: Machine failure, dtype: int64

In [143]:
# check the value of Machine failure where PWF is 0

data[data['PWF']==0]['Machine failure'].value_counts()

0    142824
1      2055
Name: Machine failure, dtype: int64

In [144]:
# check the value of Machine failure where TWF is 1

data[data['TWF']==1]['Machine failure'].value_counts()

1    253
Name: Machine failure, dtype: int64

In [145]:
# check the value of Machine failure where TWF is 0

data[data['TWF']==0]['Machine failure'].value_counts()

0    142827
1      2213
Name: Machine failure, dtype: int64

TWF (tool wear failure), HDF (heat dissipation failure), PWF (power failure), OSF (overload failure), and RNF (random failure) variables. 

In [146]:
# check which of the features from TWF, PWF, OSF, HDF cause the most Machine failure

data[data['Machine failure']==1][['TWF','PWF','OSF','HDF']].sum()


TWF    253
PWF    411
OSF    608
HDF    807
dtype: int64

In [147]:
data.shape

(145293, 13)

## Preprocessing

In [248]:
def create_features(dataframe):
    dataframe['HDF_TWF_OSF_PWF']=dataframe['HDF']+dataframe['TWF']+dataframe['OSF']+dataframe['PWF']
    dataframe['Power']=dataframe['Torque [Nm]']*dataframe['Rotational speed [rpm]']
    dataframe['Temp/ratio [K]']=dataframe['Process temperature [K]']/dataframe['Air temperature [K]']
#     dataframe['Process temperature [C]']=dataframe['Process temperature [K]']-273.15
#     dataframe['Air temperature [C]']=dataframe['Air temperature [K]']-273.15
#    dataframe['Temp/ratio [C]']=dataframe['Process temperature [C]']/dataframe['Air temperature [C]']
    dataframe['Tool wear speed']=dataframe['Tool wear [min]']*dataframe['Rotational speed [rpm]']
    dataframe['Torque wear ratio']=dataframe['Torque [Nm]']/(dataframe['Tool wear [min]']+0.0001)
    dataframe['Torque speed ratio']=dataframe['Torque [Nm]']/dataframe['Rotational speed [rpm]']
    dataframe['Temperature difference']=dataframe['Process temperature [K]']-dataframe['Air temperature [K]']
    return dataframe

In [249]:
# # remove first letter from Product ID column

# data['Product ID']=data['Product ID'].str[1:]

In [250]:
data.head(5)

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [251]:
data['Product ID'].value_counts()

L53257    138
L49056    129
L48892    117
L54275    115
L53258    107
         ... 
H31308      1
L48159      1
M15753      1
H30273      1
M24818      1
Name: Product ID, Length: 10000, dtype: int64

In [252]:
X=data.drop(['Machine failure'],axis=1)
y=data['Machine failure']

In [253]:
# class weights
from sklearn.utils import class_weight
class_weights =class_weight.compute_class_weight('balanced',
                                                classes=np.unique(y),
                                                y=y)

# convert to dict
class_weights=dict(enumerate(class_weights))


In [254]:
class_weights

{0: 0.5086328215253418, 1: 29.459245742092456}

In [255]:
X=create_features(X)

### Feature Importance

In [256]:
from sklearn.ensemble import RandomForestClassifier
X_feat = X
y_feat = y

X_feat=pd.get_dummies(X_feat)
model = RandomForestClassifier(n_jobs=-1)
model.fit(X_feat,y_feat)

feature_importances = pd.DataFrame(model.feature_importances_,
                                      index = X_feat.columns,
                                        columns=['importance']).sort_values('importance',ascending=False)

In [257]:
feature_importances

In [258]:
X.columns, len(X.columns)

(Index(['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]',
        'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'TWF',
        'HDF', 'PWF', 'OSF', 'RNF', 'HDF_TWF_OSF_PWF', 'Power',
        'Temp/ratio [K]', 'Tool wear speed', 'Torque wear ratio',
        'Torque speed ratio', 'Temperature difference'],
       dtype='object'),
 19)

In [259]:
X.select_dtypes(include=['int64','float64']).columns

Index(['Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'TWF',
       'HDF', 'PWF', 'OSF', 'RNF', 'HDF_TWF_OSF_PWF', 'Power',
       'Temp/ratio [K]', 'Tool wear speed', 'Torque wear ratio',
       'Torque speed ratio', 'Temperature difference'],
      dtype='object')

In [260]:
bin_features=['TWF','PWF','OSF','HDF','RNF']

In [261]:
# numeric_features=['Air temperature [K]', 'Process temperature [K]',
#         'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]','Power',
#        'Temp/ratio [K]', 'Process temperature [C]', 'Air temperature [C]',
#        'Temp/ratio [C]', 'Tool wear speed', 'Torque wear ratio']
#'TWF', 'HDF', 'PWF', 'OSF','RNF','HDF_TWF_OSF_PWF'
categorical_features=['Product ID']
numeric_features=X.select_dtypes(include=['int64','float64']).columns.to_list()



In [265]:
y.value_counts()

0    142827
1      2466
Name: Machine failure, dtype: int64

In [266]:
numeric_features,categorical_features

(['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]',
  'TWF',
  'HDF',
  'PWF',
  'OSF',
  'RNF',
  'HDF_TWF_OSF_PWF',
  'Power',
  'Temp/ratio [K]',
  'Tool wear speed',
  'Torque wear ratio',
  'Torque speed ratio',
  'Temperature difference'],
 ['Product ID'])

In [267]:
train_cat_X=X
train_cat_y=y

In [177]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler,RobustScaler,StandardScaler
from sklearn.decomposition import PCA

# pipeline=Pipeline([('scaler',StandardScaler()),
#                      ('pca',PCA(n_components=10,random_state=42))])


In [178]:
len(numeric_features),len(categorical_features),len(X.columns)

(9, 1, 10)

In [179]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_features),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=30000), categorical_features)],remainder='passthrough')


In [180]:
ct.fit(X)

X=ct.transform(X)

In [181]:
X.shape

(145293, 10)

## Modelling

In [242]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

### Naive bayes

In [183]:
from sklearn.naive_bayes import GaussianNB

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

scores=[]
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = GaussianNB(var_smoothing=1e-20)
    model.fit(X_train,y_train)
    y_pred=model.predict_proba(X_test)[:,1]
    scores.append(roc_auc_score(y_test,y_pred))
    


In [184]:
scores, np.mean(scores)

([0.9443973665341243,
  0.9456709553719872,
  0.9586641306034748,
  0.9469372900991972,
  0.9616512498273573],
 0.9514641984872281)

In [185]:
naive_bayes=GaussianNB(var_smoothing=1e-20)
naive_bayes.fit(X,y)


### Random Forest

In [186]:
# rf_params={'n_estimators': 100,
#  'max_features': 0.6897696969836908,
#  'max_leaf_nodes': 42,
#  'criterion': 'entropy'}
# rf_params={'n_estimators': 57,
#  'max_features': 0.2593823314046004,
#  'max_leaf_nodes': 78,
#  'criterion': 'entropy'}

rf_params={'n_estimators': 57,
 'max_features': 0.2593823314046004,
 'max_leaf_nodes': 78,
 'criterion': 'entropy'}

In [187]:
from sklearn.ensemble import RandomForestClassifier

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

scores=[]
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = RandomForestClassifier(n_jobs=-1,**rf_params)
    model.fit(X_train,y_train)
    y_pred=model.predict_proba(X_test)[:,1]
    scores.append(roc_auc_score(y_test,y_pred))

In [188]:
scores, np.mean(scores)

([0.9656464038512146,
  0.9641556743651476,
  0.9762810650615013,
  0.9645667384695025,
  0.9825348330149132],
 0.9706369429524558)

In [139]:
rfc=RandomForestClassifier(n_jobs=-1,**rf_params)
rfc.fit(X,y)

In [140]:
## feature importance

feature_importances = pd.DataFrame(rfc.feature_importances_,
                                        index = ct.get_feature_names_out(),
                                            columns=['importance']).sort_values('importance',ascending=False)

In [141]:
feature_importances

Unnamed: 0,importance
remainder__HDF,0.278621
remainder__OSF,0.205059
remainder__PWF,0.130718
remainder__TWF,0.104877
num__Torque [Nm],0.096396
num__Rotational speed [rpm],0.093099
num__Tool wear [min],0.036977
num__Air temperature [K],0.028906
cat__Product ID,0.012732
num__Process temperature [K],0.011482


## Catboost Classifier

In [268]:
# cat_params={'early_stopping_rounds': 11,
#  'learning_rate': 0.05444137644689418,
#  'n_estimators': 133,
#  'objective': 'Logloss',
#  'eval_metric': 'AUC',
#  'random_state': 42,
#  'verbose': 0,}
# cat_params = {'loss_function': 'Logloss',
#           'eval_metric': 'AUC',
#           'verbose': False,
#           'random_seed': 19970507,
#           'learning_rate': 0.032, 
#           'iterations': 2000, 
#           'depth': 5, 
#           'subsample': 0.705}

# cat_params = cb_params = {
#     'n_estimators': 1200,
#     'depth': 13,
#     'learning_rate': 0.0633180843164835,
#     'random_strength': 0.22,
#     'grow_policy': 'Lossguide',
#     'bootstrap_type': 'Bayesian',
#     'objective':'Logloss',
#     "loss_function": "AUC",
#     'eval_metric': "AUC",
#     'l2_leaf_reg': 3.0,
#     'min_child_samples': 3,
#     'random_state': 42,
#     'silent': True
# }

cat_params = {'loss_function': 'Logloss',
          'eval_metric': 'AUC',
          'verbose': False,
          'random_seed': 19970507,
          'learning_rate': 0.027, 
          'iterations': 927, 
          'depth': 5, 
          'subsample': 0.705}

In [269]:
feature_names=train_cat_X.columns.tolist()
categorical_features=train_cat_X.select_dtypes(include=['object']).columns.tolist()

In [270]:
categorical_features, feature_names

(['Product ID', 'Type'],
 ['Product ID',
  'Type',
  'Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]',
  'TWF',
  'HDF',
  'PWF',
  'OSF',
  'RNF',
  'HDF_TWF_OSF_PWF',
  'Power',
  'Temp/ratio [K]',
  'Tool wear speed',
  'Torque wear ratio',
  'Torque speed ratio',
  'Temperature difference'])

In [271]:
categorical_features

['Product ID', 'Type']

In [272]:
from catboost import CatBoostClassifier,Pool
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

scores=[]
for train_index, test_index in skf.split(train_cat_X, train_cat_y):
    X_train, X_test = train_cat_X.iloc[train_index], train_cat_X.iloc[test_index]
    y_train, y_test = train_cat_y.iloc[train_index], train_cat_y.iloc[test_index]
    train_pool=Pool(X_train.to_numpy(),y_train.to_numpy(),cat_features=categorical_features,feature_names=feature_names)
    test_pool=Pool(X_test.to_numpy(),y_test.to_numpy(),cat_features=categorical_features,feature_names=feature_names)
    model = CatBoostClassifier(**cat_params)
    model.fit(train_pool)
    y_pred=model.predict_proba(test_pool)[:,1]
    scores.append(roc_auc_score(y_test,y_pred))

In [273]:
scores, np.mean(scores)

([0.9771627045244073,
  0.9745242468279927,
  0.9861769910375583,
  0.973725203789514,
  0.9884791420868884],
 0.9800136576532722)

In [274]:
cat_boost=CatBoostClassifier(**cat_params)

train_pool=Pool(train_cat_X.to_numpy(),train_cat_y.to_numpy(),cat_features=categorical_features,feature_names=feature_names)

cat_boost.fit(train_pool)

<catboost.core.CatBoostClassifier at 0x15ca0bcd2b0>

## LightGBM

In [150]:
# params for without feature engineering and no class weights
# {'n_estimators': 437,
#  'num_leaves': 1137,
#  'min_child_samples': 21,
#  'learning_rate': 0.012788922736111897,
#  'log_max_bin': 10,
#  'colsample_bytree': 0.42015767404953996,
#  'reg_alpha': 0.001975258376030875,
#  'reg_lambda': 0.7239937504178731,
#  'random_state': 42}
# lgbm_params= {"n_estimators": 135, "num_leaves": 883, "min_child_samples": 7, "learning_rate": 0.0037226051855374824, "log_max_bin": 9, "colsample_bytree": 0.33966326518235646, "reg_alpha": 0.0024686655398856224, "reg_lambda": 0.0028201449346997825}
lgbm_params={'n_estimators': 2500,
 'num_leaves': 310,
 'min_child_samples': 10,
 'learning_rate': 0.0012323592719526815,
 'max_bin': 4096,
 'colsample_bytree': 0.2582472335147579,
 'reg_alpha': 0.0043659867548350275,
 'reg_lambda': 0.0015015946171746538,
 'random_state': 42}
#lgbm_params={'n_estimators': 1854, 'max_leaves': 128, 'min_child_weight': 3.5333625498641155, 'learning_rate': 0.08942572977482938, 'subsample': 0.9037251954669168, 'colsample_bylevel': 0.5009206003669185, 'colsample_bytree': 0.5780621665144878, 'reg_alpha': 1.5610252423873467, 'reg_lambda': 923.2310556341275, 'num_leaves': 104, 'max_depth': 12}

In [151]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
scores=[]
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = LGBMClassifier(n_jobs=-1,**lgbm_params)
    model.fit(X_train,y_train)
    y_pred=model.predict_proba(X_test)[:,1]
    scores.append(roc_auc_score(y_test,y_pred))

In [152]:
scores, np.mean(scores)

([0.9807705908341653,
  0.974673646410668,
  0.9833201640409578,
  0.976816619439171,
  0.9878497814137999],
 0.9806861604277524)

In [153]:
lgbm = LGBMClassifier(n_jobs=-1,**lgbm_params)

lgbm.fit(X,y)

In [154]:
lgbm.score(X,y)

0.9956570516129476

## HistGradientBossting

In [155]:
hist_params={
    'max_iter': 300,
    'scoring': 'roc_auc',
    'random_state': 42,
    'interaction_cst':'pairwise'
}

In [156]:
from sklearn.ensemble import HistGradientBoostingClassifier


skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
scores=[]
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = HistGradientBoostingClassifier(**hist_params)
    model.fit(X_train,y_train)
    y_pred=model.predict_proba(X_test)[:,1]
    scores.append(roc_auc_score(y_test,y_pred))

In [157]:
scores,np.mean(scores)

([0.9591929667448174,
  0.9611585582599437,
  0.9753410256174035,
  0.9586151508835938,
  0.9828188015731532],
 0.9674253006157822)

In [158]:
hgbc=HistGradientBoostingClassifier(**hist_params)

hgbc.fit(X,y)

## XGBoost

In [159]:
def auc_loss_func(y_true,y_pred):
    
    auc=roc_auc_score(y_true,y_pred)
    grad=(y_pred-y_true)*(2.0*auc-1.0)
    hess=2.0*auc*(1.0-auc)*y_pred*(1.0-y_pred)
    
    return grad,hess

In [160]:
# without feature engineering and no class weights
# xgboost_params={'n_estimators': 3697,
#  'max_leaves': 287,
#  'min_child_weight': 32.83638391565163,
#  'learning_rate': 0.022020947638032907,
#  'subsample': 1.0,
#  'colsample_bylevel': 0.6595695471285861,
#  'colsample_bytree': 0.7035132649686466,
#  'reg_alpha': 0.0018885703990141817,
#  'reg_lambda': 14.653054597011067,
#  'tree_method': 'gpu_hist'}

# xgboost_params={'n_estimators': 97,
#  'max_leaves': 14,
#  'min_child_weight': 45.136292363100196,
#  'learning_rate': 0.19997653978110663,
#  'subsample': 0.8895588746662894,
#  'colsample_bylevel': 0.7728861115770346,
#  'colsample_bytree': 0.7167459701136535,
#  'reg_alpha': 0.0015245843735931768,
#  'reg_lambda': 0.07655787774671213}

# xgboost_params={'n_estimators': 316,
#  'max_leaves': 155,
#  'min_child_weight': 1.6740420230031532,
#  'learning_rate': 0.01613649125193271,
#  'subsample': 0.8877605142833445,
#  'colsample_bylevel': 0.3024136423540494,
#  'colsample_bytree': 0.7368703619172471,
#  'reg_alpha': 0.01465952112151396,
#  'reg_lambda': 0.32682646203137,
#  'loss_function':auc_loss_func}

xgboost_params={'n_estimators': 1029,
                'learning_rate': 0.18307961385595686,
                'max_depth': 4,
                'subsample': 0.995605266371666,
                'colsample_bytree': 0.9491496175407161}


In [161]:
from xgboost import XGBClassifier

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

scores=[]
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = XGBClassifier(**xgboost_params)
    model.fit(X_train,y_train)
    y_pred=model.predict_proba(X_test)[:,1]
    scores.append(roc_auc_score(y_test,y_pred))

In [162]:
# old  0.958956644689253 from no feature engineering and no class weights
# 0.9662774120054027 from no feature engineering and no class weights
scores, np.mean(scores)

([0.9616723323476085,
  0.9568547638655807,
  0.9689862810225418,
  0.9549670176803979,
  0.9731414314671105],
 0.9631243652766479)

## Voting

In [163]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
estimators = [
    ('cat', CatBoostClassifier(**cat_params)),
    ('lgbm', LGBMClassifier(n_jobs=-1,**lgbm_params))
]

In [470]:
class VotingClassifierMod():
    def __init__(self, estimators, voting='soft', weights=None):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
    
    def fit(self,trainx,trainy,traincat):
        for name,estimator in self.estimators:
            if name=='cat':
                estimator.fit(traincat)
            else:
                estimator.fit(trainx,trainy)
                
    def predict_proba(self,testx,testcat):
        probas=[]
        for name,estimator in self.estimators:
            if name=='cat':
                probas.append(estimator.predict_proba(testcat)[:,1])
            else:
                probas.append(estimator.predict_proba(testx)[:,1])
        if self.voting=='soft':
            return np.average(probas,axis=0,weights=self.weights)
        else:
            return np.argmax(probas,axis=0)
                
    

In [471]:
from sklearn.model_selection import StratifiedKFold
from catboost import Pool
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

scores=[]
for train_index, test_index in skf.split(X, y):
    trainx, testx = X[train_index], X[test_index]
    trainy, testy = y.iloc[train_index], y.iloc[test_index]
    # create catboost pool
    trainx_cat,testx_cat=train_cat_X.iloc[train_index],train_cat_X.iloc[test_index]
    trainy_cat,testy_cat=train_cat_y.iloc[train_index],train_cat_y.iloc[test_index]
    traincat = Pool(data=trainx_cat,label=trainy_cat,cat_features=categorical_features,feature_names=feature_names)
    testcat = Pool(data=testx_cat,label=testy_cat,cat_features=categorical_features,feature_names=feature_names)
    model = VotingClassifierMod(estimators=estimators,voting='soft',weights=[0.5,0.5])
    model.fit(trainx,trainy,traincat)
    y_pred=model.predict_proba(testx,testcat)
    scores.append(roc_auc_score(testy,y_pred))

In [472]:
# 0.9805112500800615 4 estimators with 0.6, 0.2,0.2,0.6 weights
# 0.9816249107171352 2 estimators with 0.5, 0.5 weights lgbm and catboost lgbm max_bin=4096
scores, np.mean(scores)

([0.9776553894124264,
  0.9785468873974492,
  0.9912627709655725,
  0.9826837052535604,
  0.9769331111670513],
 0.981416372839212)

In [473]:
vc=VotingClassifierMod(estimators=estimators,voting='soft',weights=[0.5,0.5])
traincat=Pool(data=train_cat_X,label=train_cat_y,cat_features=categorical_features,feature_names=feature_names)
vc.fit(X,y,traincat)

## Stacking Classifier

## Hyper parameter tuning using optuna

In [139]:
# define the hyperparameter space for lgbm
from optuna.samplers import TPESampler
import optuna
from lightgbm import LGBMClassifier
from optuna import Trial, visualization
from optuna.pruners import SuccessiveHalvingPruner
from sklearn.model_selection import StratifiedKFold

def objective(trial: Trial):
    params={
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'max_leaves': trial.suggest_int('max_leaves', 10, 200),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 1e3),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1e3),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1e3),
        'num_leaves': trial.suggest_int('num_leaves', 10, 200),
        'max_depth': trial.suggest_int('max_depth', 0, 20),
        'random_state': 42,
        'n_jobs': -1,
    }
    skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    scores=[]
    i=0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model=LGBMClassifier(**params)
        model.fit(X_train,y_train)
        y_pred=model.predict_proba(X_test)[:,1]
        trial.report(roc_auc_score(y_test,y_pred), i)
        if trial.should_prune():
            raise optuna.TrialPruned()
        scores.append(roc_auc_score(y_test,y_pred))
        i+=1
    return np.mean(scores)

sampler = optuna.samplers.TPESampler(seed=42)
pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=2, min_early_stopping_rate=0)
study = optuna.create_study(pruner=pruner,sampler=sampler,direction='maximize')
study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)

In [140]:
study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)

In [141]:
    # Define the hyperparameter space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_leaves': trial.suggest_int('max_leaves', 10, 200),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 1e3),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1e3),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1e3),
    }
    
    model = XGBClassifier(**params,loss_function=auc_loss_func)
    
    skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    scores=[]
    i=0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train,y_train)
        y_pred=model.predict_proba(X_test)[:,1]
        trial.report(roc_auc_score(y_test,y_pred), step=i)
        scores.append(roc_auc_score(y_test,y_pred))
        if trial.should_prune():
            raise optuna.TrialPruned()
        i+=1
    return np.mean(scores)

In [142]:
import optuna
from optuna.pruners import SuccessiveHalvingPruner
# tune the weights of the voting classifier
def objective(trial: optuna.Trial):
    w1 = trial.suggest_float("w1", 0.0, 1.0)
    w2 = trial.suggest_float("w2", 0.0, 1.0)
    w3 = trial.suggest_float("w3", 0.0, 1.0)
    w4 = trial.suggest_float("w4", 0.0, 1.0)
    w5 = trial.suggest_float("w5", 0.0, 1.0)
    weights=[w1,w2,w3,w4,w5]
    skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    scores=[]
    i=0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        vc=VotingClassifier(estimators=estimators,n_jobs=-1,voting='soft',weights=weights)
        vc.fit(X_train,y_train)
        y_pred=vc.predict_proba(X_test)[:,1]
        scores.append(roc_auc_score(y_test,y_pred))
        i+=1
        
    return np.mean(scores)
        

        
        
sampler = optuna.samplers.TPESampler(seed=42)
pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=2, min_early_stopping_rate=0)
study = optuna.create_study(pruner=pruner,sampler=sampler,direction='maximize')
study.optimize(objective, n_trials=100,n_jobs=-1,show_progress_bar=True)

## AutoML

In [71]:
from flaml import AutoML

automl=AutoML()

automl_settings={
    'time_budget':3000,
    'metric':'roc_auc',
    'task':'classification',
    'ensemble':True,
    'n_jobs':-1,
    'log_file_name':'automl.log',
    "eval_method":'cv',
    'n_splits':5,
    'split_type':'stratified',
    'verbose':3
}

automl.fit(X_train=X,y_train=y,**automl_settings)

## Submission

In [275]:
test=pd.read_csv('test.csv')

In [276]:
test.isnull().sum()

id                         0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [277]:
test_id=test['id']
test_data=test.drop(['id'],axis=1)
#test_data=test.drop(['id','Product ID'],axis=1)

In [278]:
test_data=create_features(test_data)

In [279]:
test_X=ct.transform(test_data)

In [280]:
test_pool=Pool(data=test_data,feature_names=test_data.columns.to_list(),cat_features=categorical_features)

In [281]:
preds=cat_boost.predict_proba(test_pool)

In [282]:
preds=preds[:,1]


In [283]:
# preds2=lgbm.predict_proba(test_X)

In [284]:
# preds3=naive_bayes.predict_proba(test_X)
# pred4=rfc.predict_proba(test_X)

In [285]:
# preds1=preds1[:,1]
# preds2=preds2[:,1]
# preds3=preds3[:,1]
# pred4=pred4[:,1]


In [286]:
# # where the lgbm value is greater than 0.5 use the lgbm value else use the catboost value

# preds=np.where(preds2>0.5,preds2,preds1)

# ## where the value is less than 0.001 replace it with 0

# preds=np.where(preds<0.001,0,preds)

In [287]:
# # catboost weight 0.6, lgbm weight 0.2, naive bayes weight 0.1, rfc weight 0.1

# preds=0.5*preds1+0.5*preds2

In [288]:
# # where the value is greater than 0.5 in for all pred1,pred2,pred3,pred4 use the value 

# preds=np.where((preds1>=0.5) & (preds2>=0.5) & (preds3>=0.5) & (pred4>=0.5),1,preds)

# # where the value is less than 0.5 for all pred1,pred2,pred3,pred4 use the value

# preds=np.where((preds1<0.5) & (preds2<0.5) & (preds3<0.5) & (pred4<0.5),0,preds)


In [289]:
# preds=np.mean([preds,preds2],axis=0)

In [290]:
# preds=preds[:,1]

In [291]:
sub=pd.DataFrame({'id':test_id,'Machine failure':preds})

In [292]:
# ## where prob of failure is less than 0.01, set to 0
# sub.loc[sub['Machine failure']<0.01,'Machine failure']=0
# # where prob of failure is greater than 0.90, set to 1

# sub.loc[sub['Machine failure']>0.90,'Machine failure']=1

In [293]:
sub.to_csv('sub.csv',index=False)

In [294]:
sub[sub['Machine failure']>=0.5].shape

(1133, 2)

In [295]:
sub['Machine failure'].value_counts()

0.007057    16
0.002272    13
0.002033     8
0.000920     5
0.317021     5
            ..
0.000382     1
0.000438     1
0.000835     1
0.000550     1
0.000504     1
Name: Machine failure, Length: 90290, dtype: int64

In [296]:
!kaggle competitions submit -c playground-series-s3e17 -f sub.csv -m "lol"

Successfully submitted to Binary Classification of Machine Failures


  0%|          | 0.00/2.58M [00:00<?, ?B/s]
  0%|          | 8.00k/2.58M [00:00<01:48, 24.8kB/s]
  4%|▎         | 96.0k/2.58M [00:00<00:09, 265kB/s] 
  5%|▌         | 144k/2.58M [00:00<00:15, 167kB/s] 
  7%|▋         | 176k/2.58M [00:01<00:18, 137kB/s]
  8%|▊         | 200k/2.58M [00:01<00:19, 126kB/s]
  8%|▊         | 224k/2.58M [00:01<00:20, 122kB/s]
  9%|▉         | 240k/2.58M [00:01<00:20, 119kB/s]
 10%|▉         | 256k/2.58M [00:02<00:20, 119kB/s]
 10%|█         | 272k/2.58M [00:02<00:21, 115kB/s]
 11%|█         | 288k/2.58M [00:02<00:21, 112kB/s]
 11%|█▏        | 304k/2.58M [00:02<00:21, 112kB/s]
 12%|█▏        | 320k/2.58M [00:02<00:21, 112kB/s]
 13%|█▎        | 336k/2.58M [00:02<00:21, 110kB/s]
 13%|█▎        | 352k/2.58M [00:02<00:21, 108kB/s]
 14%|█▍        | 368k/2.58M [00:03<00:51, 44.9kB/s]
 15%|█▍        | 384k/2.58M [00:04<00:44, 51.6kB/s]
 15%|█▍        | 392k/2.58M [00:04<00:42, 54.4kB/s]
 18%|█▊        | 480k/2.58M [00:04<00:13, 166kB/s] 
 19%|█▉        | 512k/2.58M 




In [10]:
import numpy as np 

# matrix A
A=np.array([[1,3,2],[6,4,3],[1,4,9]])
B=  np.array([[1,2,3],[4,5,6],[7,8,9]])

np.einsum('ij,jk->ik',A,B)

array([[ 27,  33,  39],
       [ 43,  56,  69],
       [ 80,  94, 108]])

In [None]:
# Q: what does np.einsum('il,lj',A,B) do?
# A: it does matrix multiplication

In [2]:
# multiply A and B

# matrix multiplication

A=np.array([[1,3,2],[6,4,3],[1,4,9]])
B=  np.array([[1,2,3],[4,5,6],[7,8,9]])

np.matmul(A,B)

array([[ 27,  33,  39],
       [ 43,  56,  69],
       [ 80,  94, 108]])