In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
import scipy.stats as st

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import r2_score, f1_score, recall_score
from sklearn import tree

import lightgbm as lgb

import warnings

# Silence DeprecationWarning and FutureWarning messages for the rest of the session
# so they do not clutter cell outputs; other warning categories are still shown.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

%matplotlib inline

In [3]:
# Load both CSV files from the relative "data" directory.
# Forward slashes are used on purpose: a backslash separator inside a normal string
# literal would form the invalid escape sequences "\m" and "\s" and would only
# resolve on Windows, whereas "/" works on every platform.
df = pd.read_csv("data/mushroom_cleaned.csv")
# The secondary file is semicolon-delimited.
df_second = pd.read_csv("data/secondary_data.csv", sep=";")

In [12]:
# Preview the first rows of the frame read from mushroom_cleaned.csv.
df.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1


In [15]:
# Work on a copy so the loaded frame stays untouched, then separate the
# "class" label from the remaining feature columns.
working_df = df.copy()
features = working_df.drop(columns=["class"])
target = working_df["class"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=.2, random_state=42)

# Fit the min-max scaler on the training split only and apply it to both splits.
# The original row index is carried over so the scaled frames stay aligned with
# y_train / y_test in any index-based pandas operation (without it the frames
# would get a fresh 0..n-1 index that no longer matches the labels).
norm = MinMaxScaler()
norm.fit(X_train)
X_train_norm = pd.DataFrame(norm.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_norm = pd.DataFrame(norm.transform(X_test), columns=X_test.columns, index=X_test.index)

# 5-fold grid search over k = 1..20.
# NOTE(review): no `scoring` is passed, so the search uses the estimator's default
# score (accuracy for classifiers), while the following cell reports recall —
# confirm this mismatch is intended.
knn = KNeighborsClassifier()
grid = {"n_neighbors": list(range(1, 21))}

gscv = GridSearchCV(knn, grid, cv=5)
gscv.fit(X_train_norm, y_train)
gscv.best_params_

{'n_neighbors': 3}

In [16]:
# Build the final KNN from the parameters chosen by the grid search rather than a
# hand-copied value, so this cell cannot drift out of sync with the search result.
optimal_knn = KNeighborsClassifier(**gscv.best_params_)
optimal_knn.fit(X_train_norm, y_train)
knn_pred = optimal_knn.predict(X_test_norm)

# Recall on the held-out test split.
recall_score(y_test, knn_pred)

0.993218040013564

**--**

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
# The estimator is named `tree_clf` (not `tree`) so it does not rebind `tree`,
# which the imports cell binds to the sklearn.tree module.
tree_clf = DecisionTreeClassifier()
# Candidate tree sizes; None means "no limit". random_state is a single-value
# entry so every candidate is seeded identically and it shows up in best_params_.
tree_grid = {"max_leaf_nodes":[5,10,15,20,None], "max_depth":[5,10,15,None], "random_state":[42]}

# 5-fold grid search over the combinations above, fitted on the scaled training split.
tree_gscv = GridSearchCV(tree_clf, tree_grid, cv=5)
tree_gscv.fit(X_train_norm, y_train)
tree_gscv.best_params_

{'max_depth': None, 'max_leaf_nodes': None, 'random_state': 42}

In [27]:
# Build the final tree from the grid-search winner (this includes the
# random_state entry of the grid) instead of re-typing the parameters by hand.
optimal_tree = DecisionTreeClassifier(**tree_gscv.best_params_)
optimal_tree.fit(X_train_norm, y_train)
tree_pred = optimal_tree.predict(X_test_norm)

# Recall on the held-out test split.
recall_score(y_test, tree_pred)

0.9821973550356052

**--**

In [31]:
# LightGBM classifier trained on the same scaled features as the models above.
model = lgb.LGBMClassifier(
    learning_rate=0.09,
    max_depth=15,
    random_state=42,
)

# Log loss is tracked on the test split and the training split while fitting.
# No early-stopping callback is passed, so these sets are only monitored.
model.fit(
    X_train_norm,
    y_train,
    eval_set=[(X_test_norm, y_test), (X_train_norm, y_train)],
    eval_metric='logloss',
)
lgbm_pred = model.predict(X_test_norm)

# Recall on the held-out test split.
recall_score(y_test, lgbm_pred)

[LightGBM] [Info] Number of positive: 23777, number of negative: 19451
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 808
[LightGBM] [Info] Number of data points in the train set: 43228, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.550037 -> initscore=0.200820
[LightGBM] [Info] Start training from score 0.200820


0.9777890810444219