<a href="https://www.kaggle.com/code/shariq20220/binary-prediction-of-poisonous-mushrooms?scriptVersionId=192599642" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os

# import mplcursors  ## for hovering in graphs and getting value
warnings.filterwarnings('ignore')

## Checking availability of Datasets

In [None]:
# Input data files are available in the read-only "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists every file under the input directory.

import os


def list_input_files(root):
    """Print every directory and file path found under *root*.

    Each visited directory is echoed as ``dirname: <path>`` followed by the
    full path of every file it directly contains.  If the whole tree holds no
    file at all (or *root* does not exist, in which case ``os.walk`` yields
    nothing), a single reminder to attach a dataset is printed.

    Args:
        root: Path of the directory tree to walk.

    Returns:
        The total number of files found under *root*.
    """
    file_count = 0
    for dirname, _, filenames in os.walk(root):
        print("dirname:", dirname)
        for filename in filenames:
            print(os.path.join(dirname, filename))
        file_count += len(filenames)

    # Warn once for the whole tree rather than once per directory: a folder
    # that merely contains sub-folders is not a sign of a missing dataset.
    if file_count == 0:
        print("no file in the directory!! Please add a dataset ")
    return file_count


# Assigned (not left as a bare expression) so the notebook does not echo the count.
input_file_count = list_input_files('/kaggle/input')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading Datasets

In [None]:
# Read the competition's train, test and sample-submission CSV files,
# in that order, into three DataFrames.
data_train, data_test, data_sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e8/train.csv",
        "/kaggle/input/playground-series-s4e8/test.csv",
        "/kaggle/input/playground-series-s4e8/sample_submission.csv",
    )
)

## EDA on Training Dataset


In [None]:
# (rows, columns) of the raw training data
data_train.shape

In [None]:
# Preview the first 20 training rows
data_train.head(20)

In [None]:
# Print each column's dtype and non-null count for the training data
data_train.info()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
data_train.describe(include = 'all')

In [None]:
# Number of missing values in each training column
data_train.isna().sum()

## EDA on Testing Dataset

In [None]:
# (rows, columns) of the raw test data
data_test.shape

In [None]:
# Print each column's dtype and non-null count for the test data
data_test.info()

## EDA on Sample_submission Dataset

In [None]:
# (rows, columns) of the sample submission file
data_sample_sub.shape

# Pre-Processing Datasets

In [None]:
# Remove fully duplicated rows from the training data.
# drop_duplicates() already returns a new DataFrame and leaves `data_train`
# untouched, so no separate .copy() of the frame is needed beforehand.
data_train2 = data_train.drop_duplicates()
data_train2.shape

In [None]:
# Number of missing values per column after de-duplication
data_train2.isna().sum()

In [None]:
# Percentage of missing values per column (mean of the boolean NA mask x 100)
data_train2.isna().mean()*100

In [None]:
# Percentage of missing entries per column; display only the columns
# where more than 70 % of the values are missing.
miss70 = data_train2.isna().mean().mul(100)
miss70.loc[miss70.gt(70)]

In [None]:
# Keep the 'id' and 'class' columns of the de-duplicated training data in
# their own frame, then display its (rows, columns) shape.
classes_with_id = data_train2.loc[:, ['id', 'class']]
classes_with_id.shape

In [None]:
# Remove fully duplicated rows from the test data.
# drop_duplicates() already returns a new DataFrame and leaves `data_test`
# untouched, so no separate .copy() of the frame is needed beforehand.
# NOTE(review): if this ever removes rows, data_test2 will have fewer rows
# than data_test — confirm the final submission still covers every test id.
data_test2 = data_test.drop_duplicates()
data_test2.shape

In [None]:
# Columns removed from both frames before modelling, kept in one list so the
# train and test frames cannot drift apart.
# NOTE(review): apart from 'id', these are presumably the columns flagged by
# the >70 %-missing check (`miss70`) — confirm against that cell's output.
cols_to_drop = ['id', 'stem-root', 'veil-type', 'veil-color', 'spore-print-color']

# 'class' is additionally dropped from the training frame only; it was saved
# in `classes_with_id` beforehand (presumably it is the prediction target).
data_train2 = data_train2.drop(columns=cols_to_drop + ['class'])
print(data_train2.shape)
data_test2 = data_test2.drop(columns=cols_to_drop)
print(data_test2.shape)

In [None]:
# Print the dtypes and non-null counts of the remaining training columns
data_train2.info()

In [None]:
# Missing values still present in each remaining training column
data_train2.isnull().sum()

In [None]:
# Print the dtypes and non-null counts of the remaining test columns
data_test2.info()

In [None]:
# Missing values still present in each remaining test column
data_test2.isnull().sum()

In [None]:
# Record the (rows, columns) shape of each cleaned frame, then display both
# shapes followed by the two row counts.
train2_shape, test2_shape = data_train2.shape, data_test2.shape
(train2_shape, test2_shape, train2_shape[0], test2_shape[0])

## Combining both train and test datasets for easier processing

In [None]:
# Stack the training rows on top of the test rows and renumber the index
# from 0 so the combined frame has no repeated index labels.
com_data = pd.concat(objs=(data_train2, data_test2), axis=0, ignore_index=True)
com_data

In [None]:
# (rows, columns) of the combined train + test frame
com_data.shape

In [None]:
# Print each column's dtype and non-null count for the combined frame
com_data.info()

In [None]:
# Summary statistics restricted to the numeric columns
com_data.describe(include = [np.number])

In [None]:
# Summary statistics restricted to the object-dtype (text) columns
com_data.describe(include = ['object'])

In [None]:
# Number of missing values in each column of the combined frame
com_data.isna().sum()

In [None]:
# For every numeric column, draw a histogram (with a KDE curve) next to a
# horizontal box plot so the distribution and its outliers can be compared.
for column in com_data.select_dtypes(include=[np.number]).columns:
    print(column)

    # One row with two panels; a 2x2 grid would leave the bottom half of the
    # figure empty.
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # No `palette` here: without a `hue` variable seaborn has nothing to map
    # the palette onto.
    sns.histplot(data=com_data, x=column, kde=True, ax=hist_ax)
    hist_ax.set_xlabel(column)
    hist_ax.set_title(f'Histogram for {column}')

    # A single box needs a single colour; passing both `palette` and `color`
    # is contradictory, and `palette` without `hue` is deprecated in newer
    # seaborn releases.
    sns.boxplot(data=com_data, x=column, width=0.8, color='red', ax=box_ax)
    box_ax.set_xlabel(column)
    box_ax.set_title(f'Box-Plot for {column}')

    # tight_layout() only adjusts axes that already exist, so it must run
    # after both panels have been drawn.
    fig.tight_layout()
    plt.show()

In [None]:
import plotly.express as px

In [None]:
# For every numeric column, show an interactive histogram (with a violin
# marginal) followed by a notched horizontal box plot.
for column in com_data.select_dtypes(include=[np.number]).columns:
    print(column)

    # Histogram of the column; `marginal="violin"` adds a violin plot in a
    # small panel alongside it.  The title names the column so the figures
    # can be told apart when scrolling through the output.
    fig = px.histogram(data_frame=com_data,
                       x=column,
                       marginal="violin",
                       opacity=0.3,
                       title=f'Histogram with violin marginal for {column}')

    # Update layout for clarity
    fig.update_layout(
        xaxis_title=column,
        yaxis_title='Count',
        bargap=0.2,
        showlegend=True
    )

    # Show the plot
    fig.show()

    # Horizontal notched box plot: the column's values run along the x axis,
    # so only the x axis gets a title.
    fig2 = px.box(data_frame=com_data,
                  x=column,
                  notched=True,
                  orientation='h',
                  title=f'Box plot for {column}')
    fig2.update_layout(xaxis_title=column,
                       showlegend=True)

    fig2.show()

In [None]:
# Draw one wide count plot per object-dtype column of the combined data,
# with the category labels rotated so they stay readable.
for column in com_data.select_dtypes(include='object'):
    print(column)

    plt.figure(figsize=(25, 4))
    sns.countplot(x=column, data=com_data, palette='viridis')
    plt.xticks(rotation=90)
    plt.xlabel(column)
    plt.title(f'Count-Plot for {column}')

    plt.show()

In [None]:
# Collect the object-dtype columns of the combined data and report how many
# there are.
categorical_columns = com_data.select_dtypes(include=['object']).columns
print(len(categorical_columns))

# Map each of those columns to the array of distinct values it contains
# (missing values show up as NaN in that array).
unique_values = {col: com_data[col].unique() for col in categorical_columns}

# Report every column together with its distinct values, one blank line
# between columns.
for col, values in unique_values.items():
    print(f"Column: {col}\nUnique Values: {values}\n")


### **Handling Missing Values of Numerical/Quantitative Features**

In [None]:
# from sklearn.impute import KNNImputer
# from sklearn.preprocessing import QuantileTransformer


In [None]:
# ## since there are too many missing values in the numerical features we will use KNN to fill missing values 
# n_features = com_data.select_dtypes(include=['float64']).columns    

# # Initialize the KNNImputer
# imputer = KNNImputer(n_neighbors=5)
# com_data[n_features] = imputer.fit_transform(com_data[n_features])


In [None]:
# com_data[n_features].isna().sum()

### **Removing Noise And Handling missing values of the Categorical Features**

#### Converting categorical values to numerical