In [1]:
!pip install catboost
!pip install lightgbm
!pip install treeviz

Collecting treeviz
  Downloading treeviz-1.1.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: treeviz
  Building wheel for treeviz (setup.py) ... [?25ldone
[?25h  Created wheel for treeviz: filename=treeviz-1.1-py3-none-any.whl size=3318 sha256=8bfa19f40f2f0bd90754d2397c62b18fd87e599b7721b0faa4e6d580266190ea
  Stored in directory: /root/.cache/pip/wheels/c8/10/db/c09faa7144d0b5f0b3f7a691dd7f7a5b42e139f3b35273192c
Successfully built treeviz
Installing collected packages: treeviz
Successfully installed treeviz-1.1


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
pd.options.display.max_columns = None
pd.options.display.max_rows = None

test_df = pd.read_csv(Path('/kaggle/input/playground-series-s4e8/test.csv'))
train_df = pd.read_csv(Path('/kaggle/input/playground-series-s4e8/train.csv'))

In [3]:
# Script to reduce memory usage
def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > -128 and c_max < 127:
                    df[col] = df[col].astype('int8')
                elif c_min > -32768 and c_max < 32767:
                    df[col] = df[col].astype('int16')
                elif c_min > -2147483648 and c_max < 2147483647:
                    df[col] = df[col].astype('int32')
                else:
                    df[col] = df[col].astype('int64')
            else:
                if c_min > -3.4e+38 and c_max < 3.4e+38:
                    df[col] = df[col].astype('float32')
                else:
                    df[col] = df[col].astype('float64')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {(start_mem - end_mem) / start_mem * 100:.1f}%')

    return df

# Apply to your DataFrame
train_df = reduce_memory_usage(train_df)
test_df = reduce_memory_usage(test_df)

Memory usage of dataframe is 523.17 MB
Memory usage after optimization is: 475.61 MB
Decreased by 9.1%
Memory usage of dataframe is 332.93 MB
Memory usage after optimization is: 301.22 MB
Decreased by 9.5%


In [4]:
train_df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,6.85,9.93,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,4.16,6.53,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,3.37,8.36,,,w,,,f,f,,g,a


In [5]:
Missing = train_df.isna().mean()*100
Missing.sort_values(ascending=False, inplace=True)
Missing

veil-type               94.884350
spore-print-color       91.425482
stem-root               88.452732
veil-color              87.936970
stem-surface            63.551362
gill-spacing            40.373988
cap-surface             21.528227
gill-attachment         16.809280
ring-type                4.134818
gill-color               0.001829
habitat                  0.001444
cap-shape                0.001283
stem-color               0.001219
has-ring                 0.000770
cap-color                0.000385
does-bruise-or-bleed     0.000257
cap-diameter             0.000128
id                       0.000000
stem-width               0.000000
class                    0.000000
stem-height              0.000000
season                   0.000000
dtype: float64

In [6]:
train_df.drop(columns=['veil-type', 'spore-print-color', 'stem-root', 'veil-color', 'stem-surface'], inplace=True)
test_df.drop(columns=['veil-type', 'spore-print-color', 'stem-root', 'veil-color', 'stem-surface'], inplace=True)

In [7]:
train_df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,w,f,f,d,a
1,1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,o,t,z,d,w
2,2,e,6.94,f,s,b,f,x,c,w,6.85,9.93,n,f,f,l,w
3,3,e,3.88,f,y,g,f,s,,g,4.16,6.53,w,f,f,d,u
4,4,e,5.85,x,l,w,f,d,,w,3.37,8.36,w,f,f,g,a


In [8]:
# let's define valid categories for each column
valid_categories = {
    'cap-shape': ['f', 'x', 'p', 'b', 'o', 'c', 's'],
    'cap-surface': ['s', 'h', 'y', 'l'],
    'cap-color': ['u', 'o', 'b', 'g', 'w', 'n', 'e', 'y', 'r', 'p', 'k'],
    'does-bruise-or-bleed': ['f', 't'],
    'gill-attachment': ['a', 'x', 's', 'd'],
    'gill-spacing': ['c', 'd'],
    'gill-color': ['w', 'n', 'g', 'k'],
    'stem-color': ['w', 'o', 'n', 'y', 'e'],
    'has-ring': ['f', 't'],
    'ring-type': ['f', 'z', 'e', 'p'],
    'habitat': ['d', 'l', 'g', 'h', 'p', 'm', 'u']}

In [9]:
# Fill non-valid with NaN
columns_to_clean = ['cap-shape', 'cap-surface', 'cap-color','does-bruise-or-bleed', 'gill-attachment',
                'gill-spacing','gill-color','stem-color','has-ring','ring-type','habitat']
for col in columns_to_clean:
    train_df[col] = train_df[col].apply(lambda x: x if x in valid_categories[col] else np.nan)
    test_df[col] = test_df[col].apply(lambda x: x if x in valid_categories[col] else np.nan)

In [10]:
# Fill missing with median in the numerical columns
numerical_columns = ['cap-diameter', 'stem-height', 'stem-width']
for col in numerical_columns:
    train_df[col] = train_df[col].fillna(train_df[col].median())
    test_df[col] = test_df[col].fillna(test_df[col].median())

In [11]:
# Fill missing with mode in the categorical columns
categorical_columns = ['cap-shape', 'cap-surface', 'cap-color','does-bruise-or-bleed', 'gill-attachment',
                'gill-spacing','gill-color','stem-color','has-ring','ring-type','habitat', 'season']
for col in categorical_columns:
    mode_train = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(mode_train)
    mode_test = train_df[col].mode()[0]
    test_df[col] = test_df[col].fillna(mode_test)

In [24]:
print("There are", train_df.isnull().sum().sum(), "missing value in train_data")
print("There are", test_df.isnull().sum().sum(), "missing value in test_data")

There are 0 missing value in train_data
There are 0 missing value in test_data


In [13]:
from sklearn.preprocessing import StandardScaler

# StandardScaler.fit_transform is just:
# X_scaled = (x−mean) / std
# Used for normalization
scaler = StandardScaler()
train_df[numerical_columns] = scaler.fit_transform(train_df[numerical_columns])
test_df[numerical_columns] = scaler.fit_transform(test_df[numerical_columns])

In [25]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

for col in categorical_columns:
    train_df[col] = train_df[col].astype(str)
    train_df[col] = LE.fit_transform(train_df[col])
    test_df[col] = test_df[col].astype(str)
    test_df[col] = LE.fit_transform(test_df[col])

In [26]:
train_df['class'] = LE.fit_transform(train_df['class'].astype(str))

In [27]:
cols = numerical_columns + categorical_columns
X = train_df[cols]
y = train_df['class']
X.head()

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-color,has-ring,ring-type,habitat,season
0,0.534605,-0.680926,0.523282,2,2,10,0,0,0,3,3,0,1,0,0
1,-0.386405,-0.577213,-0.577333,6,0,7,0,0,0,2,2,1,3,0,3
2,0.135286,0.185819,-0.151169,2,2,0,0,3,0,3,1,0,1,3,3
3,-0.521659,-0.810568,-0.571157,2,3,4,0,2,0,0,3,0,1,0,2
4,-0.098724,-1.103187,-0.345105,6,1,2,0,1,0,3,3,0,1,1,0


In [28]:
display(X.shape)
display(y.shape)

(3116945, 15)

(3116945,)

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [32]:
from sklearn.
from xgboost import XGBClassifier

model = XGBClassifier(colsample_bylevel=0.9,
                      learning_rate=0.2,
                      colsample_bytree=0.8,
                      gamma=0.1,
                      max_depth=10,
                      min_child_weight=1,
                      n_estimators=250,
                      nthread=4,
                      random_state=42,
                     )
model.fit(X_train, y_train)
valid_predictions = model.predict(X_valid)

NameError: name 'accuracy_score' is not defined

In [34]:
from sklearn.metrics import classification_report

accuracy = balanced_accuracy_score(y_valid, valid_predictions)
classification_rep = classification_report(y_valid, valid_predictions)

print(accuracy)
print(classification_rep)

0.986980485108669
              precision    recall  f1-score   support

           0       0.98      0.99      0.99    282310
           1       0.99      0.99      0.99    341079

    accuracy                           0.99    623389
   macro avg       0.99      0.99      0.99    623389
weighted avg       0.99      0.99      0.99    623389



In [38]:
# Reordering the columns to be able to predict
cols_when_model_builds = model.get_booster().feature_names
test_df = test_df[cols_when_model_builds]

In [40]:
# Now let's predict the test_df
csv_predictions = model.predict(test_df)

In [41]:
csv_predictions

array([0, 1, 1, ..., 1, 0, 0])

In [45]:
modified_predictions = ['p' if pred == 1 else 'e' for pred in csv_predictions]
test_df = pd.read_csv(Path('/kaggle/input/playground-series-s4e8/test.csv'))
submission_csv = pd.DataFrame({
    'id': test_df['id'],
    'class': modified_predictions
})
submission_csv.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e


In [46]:
submission_csv.to_csv('submission.csv', index=False)