In [1]:
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [2]:
# Location of the training CSV, kept in a variable so it can be changed in one place.
train_data_path = 'train.csv'
df = pd.read_csv(filepath_or_buffer=train_data_path)

In [3]:
# Columns whose share of missing values (in percent) exceeds this are dropped.
# The printed messages are built from the same constant so they cannot drift
# away from the filter that is actually applied.
NAN_DROP_THRESHOLD_PCT = 50

# Check for columns that are entirely NaN
all_nan_columns = df.columns[df.isnull().all()].tolist()

if all_nan_columns:
    print("Columns that are entirely NaN:")
    for col in all_nan_columns:
        print(f"- {col}")
else:
    print("No columns are entirely NaN.")

# Share of missing values per column, expressed in percent.
nan_percentage = df.isnull().mean() * 100

print("\nPercentage of NaN values in each column:")
print(nan_percentage)

high_nan_columns = nan_percentage[nan_percentage > NAN_DROP_THRESHOLD_PCT].sort_values(ascending=False)

if not high_nan_columns.empty:
    print(f"\nColumns with more than {NAN_DROP_THRESHOLD_PCT}% NaN values:")
    print(high_nan_columns)
else:
    print(f"\nNo columns have more than {NAN_DROP_THRESHOLD_PCT}% NaN values.")

# drop_cols is passed to process_data() for the test set later on.
# NOTE: df is overwritten here, so re-running this cell recomputes drop_cols
# from the already-trimmed frame (it would then be empty).
drop_cols = high_nan_columns.index.tolist()
df = df.drop(columns=drop_cols)

No columns are entirely NaN.

Percentage of NaN values in each column:
id                       0.000000
class                    0.000000
cap-diameter             0.000128
cap-shape                0.001283
cap-surface             21.528227
cap-color                0.000385
does-bruise-or-bleed     0.000257
gill-attachment         16.809280
gill-spacing            40.373988
gill-color               0.001829
stem-height              0.000000
stem-width               0.000000
stem-root               88.452732
stem-surface            63.551362
stem-color               0.001219
veil-type               94.884350
veil-color              87.936970
has-ring                 0.000770
ring-type                4.134818
spore-print-color       91.425482
habitat                  0.001444
season                   0.000000
dtype: float64

Columns with more than 90% NaN values:
veil-type            94.884350
spore-print-color    91.425482
stem-root            88.452732
veil-color           87.936970
stem-surface         63.551362
dtype: float64

In [4]:
# Name of the raw target column and its mapping to a binary label.
LABEL = 'class'
label_2_class = dict(e=1, p=0)

# Strict lookup: a class letter outside the mapping raises KeyError.
df['label'] = df[LABEL].map(label_2_class.__getitem__)

# Every remaining NaN, in any column, is replaced by the string sentinel 'Missing'.
df_filled = df.fillna(value='Missing')

In [5]:
label_encoder = LabelEncoder()

# Every column that is not an identifier, a target, or one of the three
# numeric measurements gets integer-encoded.
features_encoding = [f for f in df_filled if f not in ['label','id','class','cap-diameter','stem-height','stem-width']]
for f in features_encoding:
    # The single encoder is refit for each column, so after the loop it only
    # holds the classes of the last column processed.
    df_filled[f] = label_encoder.fit_transform(df_filled[f])

# Explicit copy: the scaled values below are written into df_feats, which
# would otherwise be a slice of df_filled.
df_feats = df_filled[features_encoding+['cap-diameter','stem-height','stem-width','label']].copy()

# Only the encoded columns are standardised; the numeric measurements are left as-is.
standard_scaler = StandardScaler()
df_feats[features_encoding] = standard_scaler.fit_transform(df_feats[features_encoding])

In [6]:
# Target column name; every other column of df_feats is a model input.
label = 'label'
features = [col for col in df_feats.columns if col != label]
print(features, label)

['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season', 'cap-diameter', 'stem-height', 'stem-width'] label


In [7]:
# Keep only rows whose cap-diameter is not the 'Missing' sentinel.
has_cap_diameter = df_feats['cap-diameter'].ne('Missing')
df_feats = df_feats[has_cap_diameter]

In [8]:
# Cast the three measurement columns to float, one column at a time.
for numeric_col in ('cap-diameter', 'stem-height', 'stem-width'):
    df_feats[numeric_col] = df_feats[numeric_col].astype(float)

In [9]:
import numpy as np

def process_data(df, drop_cols):
    """Turn a raw frame into the model's feature matrix.

    Steps:
      1. Drop ``drop_cols``.
      2. Treat every remaining column except ``label``/``id``/``class`` and the
         three numeric measurements as categorical: NaN becomes the category
         ``'Missing'``, values are integer-encoded and then standardised.
      3. Cast the numeric measurements to float and fill NaN with the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified.
    drop_cols : list of str
        Columns to remove before any processing.

    Returns
    -------
    pandas.DataFrame
        Encoded categorical columns followed by ``cap-diameter``,
        ``stem-height`` and ``stem-width``, all float.

    Notes
    -----
    NOTE(review): the encoder and the scaler are fit on the frame passed in.
    Codes and scaling are therefore only comparable with another frame if both
    contain the same category sets and distributions; confirm this holds
    against the frame the model was trained on.
    """
    non_feature_cols = ['label', 'id', 'class']
    numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']

    df = df.drop(columns=drop_cols)
    features_encoding = [f for f in df if f not in non_feature_cols + numeric_cols]

    # Work on an explicit copy so the assignments below never write into a
    # slice of the caller's frame.
    df_feats = df[features_encoding + numeric_cols].copy()

    # Categorical columns: the 'Missing' sentinel is applied here only, so the
    # numeric columns keep their numeric dtype instead of becoming object.
    for f in features_encoding:
        df_feats[f] = LabelEncoder().fit_transform(df_feats[f].fillna('Missing'))
    if features_encoding:
        df_feats[features_encoding] = StandardScaler().fit_transform(df_feats[features_encoding])

    # Numeric columns: cast first, then impute NaN with the column mean.
    df_feats[numeric_cols] = df_feats[numeric_cols].astype(float)
    df_feats[numeric_cols] = df_feats[numeric_cols].fillna(df_feats[numeric_cols].mean())
    return df_feats

# Load the test set and run it through process_data with the training drop list.
test_df = pd.read_csv('test.csv')
processed_test = process_data(df=test_df, drop_cols=drop_cols)
processed_test

Unnamed: 0,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-color,has-ring,ring-type,habitat,season,cap-diameter,stem-height,stem-width
0,0.885722,-1.359295,-0.203819,2.168990,-0.928677,-1.063864,0.853590,0.860480,1.776959,-0.014607,-0.524046,-0.953741,8.64,11.13,17.12
1,-0.103928,1.048558,-0.046744,-0.461034,-0.928677,0.449025,1.097940,-0.843184,-0.562725,-0.223898,-0.524046,-0.953741,6.90,1.27,10.75
2,-1.643383,-0.456350,-0.203819,-0.461034,-0.928677,0.449025,-0.490340,-0.843184,-0.562725,-0.223898,-0.524046,-0.048418,2.00,6.18,3.14
3,0.885722,1.048558,-0.203819,-0.461034,1.273164,0.449025,-0.490340,0.860480,1.776959,4.171209,-0.524046,0.856905,3.47,4.98,8.51
4,0.885722,-0.356023,1.524009,-0.461034,1.063465,-1.063864,1.097940,1.122582,1.776959,-1.688934,-0.524046,0.856905,6.17,6.73,13.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,0.885722,-0.456350,1.209859,-0.461034,-0.823827,0.953321,0.853590,-2.022644,-0.562725,-0.223898,-0.524046,0.856905,0.88,2.67,1.35
2077960,0.885722,0.747576,1.209859,-0.461034,-0.404429,0.449025,0.853590,0.860480,-0.562725,-0.223898,0.353559,-0.953741,3.12,2.69,7.38
2077961,0.885722,-0.657005,-1.774572,-0.461034,-0.823827,-1.063864,0.853590,1.122582,1.776959,4.171209,-0.524046,-0.953741,5.73,6.16,9.74
2077962,-1.643383,-0.456350,-0.203819,-0.461034,-0.823827,0.953321,-1.712094,-1.760542,-0.562725,-0.223898,-0.524046,-0.953741,5.03,6.00,3.46


In [10]:
# LightGBM training parameters.
# 'mcc' is not among LightGBM's built-in metric names, and the recorded
# training log reports AUC, so the metric is set to 'auc' to match what was
# actually evaluated. Early stopping on MCC would need a custom evaluation
# function passed to lgb.train.
params = {
    'objective': 'binary',  # for binary classification
    'metric': 'auc',        # built-in metric monitored on the validation set
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [11]:
# Sanity check before training: show the model input columns and the target name.
print(features,label)


['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season', 'cap-diameter', 'stem-height', 'stem-width'] label


In [12]:
from sklearn.model_selection import KFold

# One column of test-set predictions per fold; averaged in a later cell.
pred_df = pd.DataFrame()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The test feature matrix is the same for every fold, so select it once.
test_matrix = processed_test[features]

for fold, (train_index, test_index) in enumerate(kf.split(df_feats)):
    print(f'fold:{fold}----------------------------')

    # Positional row selection; test_index holds this fold's held-out rows.
    train_rows = df_feats.iloc[train_index]
    holdout_rows = df_feats.iloc[test_index]

    train_data = lgb.Dataset(train_rows[features], label=train_rows[label])
    test_data = lgb.Dataset(holdout_rows[features], label=holdout_rows[label])

    # Train for up to 1000 rounds, stopping once the held-out score has not
    # improved for 100 consecutive rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )
    pred_df[fold] = model.predict(test_matrix)





fold:0----------------------------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[411]	valid_0's auc: 0.996784
fold:1----------------------------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[676]	valid_0's auc: 0.99687
fold:2----------------------------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[750]	valid_0's auc: 0.996804
fold:3----------------------------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[745]	valid_0's auc: 0.996915
fold:4----------------------------
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[491]	valid_0's auc: 0.996771


In [13]:
# Average only the per-fold columns. Excluding 'prediction' keeps this cell
# re-runnable: otherwise a second run would fold the previously written
# prediction column into the mean.
fold_columns = [col for col in pred_df.columns if col != 'prediction']
mean_probability = pred_df[fold_columns].mean(axis=1)

# Hard label: 1 when the mean is strictly above 0.5, else 0.
pred_df['prediction'] = mean_probability.gt(0.5).astype(int)

In [14]:
# Inverse of the training label mapping: binary label -> class letter.
class_2_label = {1: 'e', 0: 'p'}

# Strict lookup: any value other than 0 or 1 raises KeyError.
pred_df['prediction'] = pred_df['prediction'].map(class_2_label.__getitem__)

In [15]:
# Re-read the test file to recover the ids in their original order.
submission = pd.read_csv('test.csv')

# Fail loudly on a row-count mismatch instead of writing NaN classes.
if len(submission) != len(pred_df):
    raise ValueError(
        f"prediction count ({len(pred_df)}) does not match test rows ({len(submission)})"
    )

# Positional assignment: predictions are in the same row order as the test file.
submission['class'] = pred_df['prediction'].to_numpy()
submission_x = submission[['id','class']]
submission_x.to_csv('submission.csv',index=False)