In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# `00` Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

# `01` Read Data 

In [None]:
# Load the training CSV of the dry-beans classification dataset.
df = pd.read_csv('../input/drybeansclassification/train.csv')
# Preview the first rows.
df.head()


# `02` EDA

### `2.0` Explore Data

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Summary statistics of the training frame.
df.describe()

In [None]:
# Distinct values of the target column `y`.
df.y.unique()

### `2.1` Check Nulls

In [None]:
# Percentage of missing values in each column.
df.isna().sum().div(df.shape[0]).mul(100)

### `2.2` Check Duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row.
print(df.duplicated().sum())

### `2.3` Check Imbalance

In [None]:
# Row count per target class, printed and then drawn as a bar chart.
print(df['y'].value_counts())
# Assigning to `_` keeps the Axes repr out of the cell output.
_ = sns.countplot(x='y', data=df)

### `2.4` Check Normality

In [None]:
# One histogram with a KDE overlay per feature column; the first and last
# columns of df are skipped.
plt.figure(figsize=(30, 30))
feature_cols = df.columns[1:-1]
for i, col in enumerate(feature_cols):
    plt.subplot(7, 4, i + 1)
    sns.histplot(df[col], kde=True, bins=10)

### `2.5` Check Outliers

In [None]:
# Name of the first column of df.
df.columns[0]

In [None]:
plt.figure(figsize=(20, 30))

# One box plot per feature column (first and last columns of df skipped),
# laid out on an 8 x 3 grid.
boxplot_cols = df.columns[1:-1]
for i, col in enumerate(boxplot_cols):
    plt.subplot(8, 3, i + 1)
    sns.boxplot(x=df[col])


### `2.6` Check Correlation

In [None]:
# Heatmap of pairwise correlations, used to spot highly correlated features.
# Only numeric columns are selected explicitly: `y` has not been label-encoded
# yet at this point, and pandas >= 2.0 raises on non-numeric columns in
# DataFrame.corr() instead of silently dropping them.
corr_matrix = df.select_dtypes(include='number').corr()
plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix,
            xticklabels=corr_matrix.columns.values,
            yticklabels=corr_matrix.columns.values,
            linewidths=0.1)  # `linewidths` is seaborn's documented parameter name

## **Observations :**

<b>

1) No missing Values

2) No Duplicates

3) The Data Suffers from Skewness

4) The Data Suffers from Outliers

5) There is High correlation between Features

6) Data is Imbalanced

**So tree-based algorithms would be the most appropriate path to take.**

</b>


# `03` Feature Engineering

### `3.1` Response Variable Encoding

In [None]:
# Replace the class labels in `y` with integer codes (overwrites df['y']).
# NOTE(review): re-running this cell re-fits the encoder on the already-encoded
# integers, so label_encoder.classes_ would no longer hold the original names.
label_encoder = LabelEncoder()
df['y']= label_encoder.fit_transform(df['y'])

### `3.2` Outlier Handling

In [None]:
def handle_outliers_fit(df, col, coeff):
    """Compute IQR-based outlier fences for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to analyse.
    col : str
        Name of the column in ``df``.
    coeff : float
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    tuple
        ``(upper_bound, lower_bound)`` -- note the order: upper bound first.
    """
    values = df[col]

    # NumPy 1.22 renamed the `interpolation` keyword to `method` and deprecated
    # the old spelling. Try the new name first; older NumPy rejects the unknown
    # keyword with TypeError, in which case the legacy name is used.
    try:
        Q1, Q3 = np.percentile(values, [25, 75], method='midpoint')
    except TypeError:
        Q1, Q3 = np.percentile(values, [25, 75], interpolation='midpoint')

    IQR = Q3 - Q1

    upper_bound = Q3 + coeff * IQR
    lower_bound = Q1 - coeff * IQR

    return upper_bound, lower_bound

In [None]:
# Collect, per feature, the IDs of rows lying outside the 1.5 * IQR fences and
# print how those rows are distributed over the target classes.
# Rows are only identified here; nothing is removed from df.
indeces_dict = {}
# Iterate in column order. A set difference would yield an arbitrary order
# that can change between kernel restarts, shuffling the printed report.
for feature in [column for column in df.columns if column not in ('ID', 'y')]:

    current_feature = df[feature]

    upper_bound, lower_bound = handle_outliers_fit(df, feature, 1.5)

    # IDs of the rows outside the fences for this feature
    outliers = df[(current_feature > upper_bound) | (current_feature < lower_bound)]
    outliers_id = outliers['ID']
    indeces_dict.update({feature: outliers_id})

    # Class distribution among this feature's outliers
    print('{}:\n{}\n'.format(feature, outliers['y'].value_counts()))
    


In [None]:
# Extra features for splitting outliers.
# Features whose outliers are associated with a given class code:
#   class 1: EquivDiameter, Perimeter, MajorAxisLength
#   class 4: AspectRation, Extent
#   class 5: ShapeFactor3, Eccentricity
# Each group below is the union of the outlier IDs of its features.

# Perimeter is listed for class 1 but is intentionally not part of the union.
indeces_outliers_class_1 = list(
    set(indeces_dict['EquivDiameter'].to_list())
    | set(indeces_dict['MajorAxisLength'].to_list())
)

indeces_outliers_class_4 = list(
    set(indeces_dict['AspectRation'].to_list())
    | set(indeces_dict['Extent'].to_list())
)

indeces_outliers_class_5 = list(
    set(indeces_dict['ShapeFactor3'].to_list())
    | set(indeces_dict['Eccentricity'].to_list())
)

In [None]:
# Indicator columns (0.0 / 1.0), one per outlier group, marking the training
# rows whose ID appears in the corresponding outlier ID list.
# Membership is tested on df['ID'] and the frame is indexed like df, so the
# flags stay aligned even when ID values differ from df's row labels
# (label-based assignment on a fresh 0..n-1 index would silently misplace or
# append rows in that case).
flags_df = pd.DataFrame(
    {
        'Flag_outliers_1': df['ID'].isin(indeces_outliers_class_1),
        'Flag_outliers_4': df['ID'].isin(indeces_outliers_class_4),
        'Flag_outliers_5': df['ID'].isin(indeces_outliers_class_5),
    },
    index=df.index,
).astype(float)

flags_df

In [None]:
# Attach the outlier flag columns to the training frame (index-aligned).
# Any flag columns left from a previous run of this cell are dropped first;
# otherwise a re-run would produce duplicated `_x` / `_y` columns and the
# plain flag names used below would disappear.
df_new = df.drop(columns=flags_df.columns, errors='ignore')
df_new = df_new.merge(flags_df, right_index = True, left_index = True, how = 'inner')
df = df_new
df

In [None]:
# How many training rows carry each value of Flag_outliers_1.
df['Flag_outliers_1'].value_counts()

In [None]:
# How many training rows carry each value of Flag_outliers_4.
df['Flag_outliers_4'].value_counts()

In [None]:
# How many training rows carry each value of Flag_outliers_5.
df['Flag_outliers_5'].value_counts()

### X and y data

In [None]:
# Target vector: the encoded class column.
y = df['y']
# Feature matrix: every column except the row identifier and the target.
X = df.drop(columns=['ID', 'y'])

### `3.3` Handling Imbalanced Data

In [None]:
# Row count per encoded target class, printed and drawn as a bar chart.
print(df['y'].value_counts())
_ = sns.countplot(x='y', data=df)

In [None]:
# Imbalanced data: oversample classes 0, 1, 2, 4, 5 with SMOTE and randomly
# undersample classes 3 and 6, targeting 1700 rows for each listed class.
from sklearn.ensemble import RandomForestClassifier 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Earlier setting noted by the author: k=10, n=1800.
# NOTE(review): resampling is applied to the full data before the
# train/validation split, so the hold-out set will contain resampled rows and
# validation scores are likely optimistic -- consider resampling the training
# part only. SMOTE also interpolates the 0/1 outlier flag columns.
over = SMOTE(sampling_strategy = {0:1700, 1:1700, 2:1700, 4:1700, 5:1700}, k_neighbors=10, random_state=42)
X_res, y_res = over.fit_resample(X ,y)

under = RandomUnderSampler(sampling_strategy = {3:1700, 6:1700}, random_state=42)
X_res, y_res = under.fit_resample(X_res, y_res)

# Class counts after resampling. y_res is passed on its own: df has a
# different number of rows and is unrelated to the resampled target.
_ = sns.countplot(x=y_res)

### `3.4` Feature Importance

In [None]:
# Fit an extra-trees ensemble on the resampled data to rank the features.
from sklearn.ensemble import ExtraTreesClassifier
# Seeded so the importances (and the plot built from them) are reproducible
# across runs; 42 matches the seed used for resampling and splitting.
model = ExtraTreesClassifier(random_state=42)
model.fit(X_res, y_res)
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the fitted model's feature importances, labelled by
# column name; nlargest(35) keeps at most the 35 largest values.
feat_imp = pd.Series(model.feature_importances_, index=X_res.columns)
feat_imp.nlargest(35).plot(kind='barh')

In [None]:
# X_res = X_res.drop(columns = ['Solidity'])

# `04` Model Selection

In [None]:
# Hold out 10% of the resampled data (X_res, y_res) as a validation split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.1, random_state = 42)

In [None]:
# #  Lazy Predictor for best model:

# from lazypredict.Supervised import LazyClassifier

# from sklearn.utils import shuffle

# # from sklearn import datasets


# cls_lazy = LazyClassifier(random_state = 42)

# models, predictions = cls_lazy.fit(X_train, X_test, y_train, y_test)



# print(models)


# `05` Hyper-Parameters Tuning 

In [None]:
'''
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer



params = {'depth':[3,1,2,6,4,5,7,8,9,10],
              'iterations':[250,100,500,1000],
              'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3],
              'l2_leaf_reg':[3,1,5,10,100],
              'border_count':[32,5,10,20,50,100,200],
              'bagging_temperature':[0.03,0.09,0.25,0.75],
              'random_strength':[0.2,0.5,0.8],
              'max_ctr_complexity':[1,2,3,4,5]}



model = CatBoostClassifier(random_state=9, task_type='CPU')
grid_search_result = model.grid_search(params,
                                       X=X_res,
                                       y=y_res,
                                       cv=10,
                                       partition_random_seed=42,
                                       stratified=True)'''

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
       
# from catboost import CatBoostClassifier

In [None]:
# model = CatBoostClassifier(random_state=42, task_type='CPU')
# params = {'depth':[1,2,3],
#           'iterations':[500,1000],
#           'learning_rate':[0.01,0.1,0.2,0.3]}

# randm = RandomizedSearchCV(estimator=model, 
#                            param_distributions = params, 
#                            cv = 10, 
#                            n_iter = 20, 
#                            n_jobs=-1, 
#                            random_state=42,
#                            verbose=3)
# randm.fit(X_res, y_res)

# # Results from Random Search
# print("\n========================================================")
# print(" Results from Random Search " )
# print("========================================================")    

# print("\n The best estimator across ALL searched params:\n",
#       randm.best_estimator_)

# print("\n The best score across ALL searched params:\n",
#       randm.best_score_)


In [None]:
# randm.best_params_

In [None]:
# model.fit(X_train, Y_train)

In [None]:
# # Best hyperparameters from the grid search
# grid_search.best_params_

# `06` Finaaaaally Train Our Model

## `Using CatBoost`

In [None]:
from catboost import CatBoostClassifier 
from sklearn.metrics import f1_score

In [None]:
# # class_wghts = {
# #                0: 1,
# #                1: 1,
# #                2: 1,
# #                3: 0.7,
# #                4: 1,
# #                5: 1,
#             #    6: 0.7}
# class_wghts = {0: 0.9,
#                1: 1.,
#                2: 0.8,
#                3: 0.4,
#                4: 0.7,
#                5: 0.6,
#                6: 0.5}

In [None]:
# 40 47 # iterations=1000, task_type="CPU", learning_rate=0.03, random_state=9, depth=3

# Configure the CatBoost classifier, then fit it silently on the training split.
cat = CatBoostClassifier(
    iterations=1000,
    task_type="CPU",
    learning_rate=0.1,
    random_state=9,
    depth=3,
)
cat.fit(X_train, y_train, verbose=False)

# Predictions on the held-out split.
y_pred_cat = cat.predict(X_test)

### Train Data Evaluation

In [None]:
# Per-class precision / recall / F1 on the training split, shown as a table
# with one row per class.
train_preds = cat.predict(X_train)
report_cb = classification_report(y_train, train_preds, output_dict=True)
report = pd.DataFrame(report_cb).T
report
# Class codes: ({0: 'BARBUNYA', 1: 'BOMBAY', 2: 'CALI', 3: 'DERMASON', 4: 'HOROZ', 5: 'SEKER', 6: 'SIRA'})

In [None]:
# Confusion matrix of the fitted CatBoost model on the training split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ConfusionMatrixDisplay.from_estimator replaces it) --
# confirm the installed version before relying on this cell.
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(cat, X_train, y_train,ax=ax,cmap='Blues')  
plt.xticks(rotation=30)
plt.title('Catboost Confusion Matrix')
plt.show() 

### Validation Data Evaluation

In [None]:
# Per-class precision / recall / F1 on the held-out validation split.
# The predictions get their own name so the training predictions stored in
# `train_preds` are not silently overwritten.
val_preds = cat.predict(X_test)
report_cb = classification_report(y_test, val_preds, output_dict=True)
report = pd.DataFrame(report_cb).transpose()
report
# Class codes: ({0: 'BARBUNYA', 1: 'BOMBAY', 2: 'CALI', 3: 'DERMASON', 4: 'HOROZ', 5: 'SEKER', 6: 'SIRA'})

In [None]:
# Confusion matrix of the fitted CatBoost model on the validation split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ConfusionMatrixDisplay.from_estimator replaces it) --
# confirm the installed version before relying on this cell.
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix(cat, X_test, y_test,ax=ax,cmap='Blues')  
plt.xticks(rotation=30)
plt.title('Catboost Confusion Matrix')
plt.show() 

# `07` Prepare Test Data

In [None]:
# Load the test CSV, index it by ID while keeping ID as a regular column, and
# clear the index name.
test_df = (
    pd.read_csv('../input/drybeansclassification/test.csv')
    .set_index('ID', drop=False)
    .rename_axis(None)
)

test_df

### `7.1` Handling outliers in test data

In [None]:
# For each flagged feature, record the index labels of the test rows that fall
# outside the 1.5 * IQR fences. The fences are computed from the training
# frame `df`, not from the test data.
indeces_dict_test = {}
for feature in ['EquivDiameter', 'Perimeter', 'MajorAxisLength', 'AspectRation', 'Extent', 'ShapeFactor3', 'Eccentricity']:

    upper_bound, lower_bound = handle_outliers_fit(df, feature, 1.5)

    current_feature = test_df[feature]
    is_outlier = (current_feature > upper_bound) | (current_feature < lower_bound)

    # Index labels of the outlying test rows for this feature
    outliers_test = test_df[is_outlier]
    outliers_id_test = outliers_test.index
    indeces_dict_test[feature] = outliers_id_test



In [None]:
# Extra features for splitting outliers.
# Group the test-set outlier labels with the same feature groups used to build
# the training flags, so each flag means the same thing at train and
# prediction time:
#   class 1: EquivDiameter, MajorAxisLength
#            (Perimeter is left out because the training flag does not use it;
#             including it here would define Flag_outliers_1 differently for
#             the test rows than for the rows the model was fitted on)
#   class 4: AspectRation, Extent
#   class 5: ShapeFactor3, Eccentricity

indeces_outliers_class_1_test = [set(indeces_dict_test['EquivDiameter'].to_list()),
                                 set(indeces_dict_test['MajorAxisLength'].to_list())]

indeces_outliers_class_4_test = [set(indeces_dict_test['AspectRation'].to_list()),
                                 set(indeces_dict_test['Extent'].to_list())]

indeces_outliers_class_5_test = [set(indeces_dict_test['ShapeFactor3'].to_list()),
                                 set(indeces_dict_test['Eccentricity'].to_list())]

# Union of the per-feature label sets within each group
indeces_outliers_class_1_test = list(set.union(*indeces_outliers_class_1_test))
indeces_outliers_class_4_test = list(set.union(*indeces_outliers_class_4_test))
indeces_outliers_class_5_test = list(set.union(*indeces_outliers_class_5_test))

In [None]:
# All-zero flag frame for the test rows, sharing test_df's index (the ID
# values, with no index name).
flags_df_test = pd.DataFrame(
    0.0,
    index=test_df.index,
    columns=['Flag_outliers_1', 'Flag_outliers_4', 'Flag_outliers_5'],
)
flags_df_test

In [None]:
# Switch each flag to 1.0 for the test rows listed in its outlier group.
flags_df_test.loc[indeces_outliers_class_1_test, 'Flag_outliers_1'] = 1.0
flags_df_test.loc[indeces_outliers_class_4_test, 'Flag_outliers_4'] = 1.0
flags_df_test.loc[indeces_outliers_class_5_test, 'Flag_outliers_5'] = 1.0

flags_df_test

In [None]:
# Attach the outlier flag columns to the test frame (index-aligned).
# Any flag columns left from a previous run of this cell are dropped first;
# otherwise a re-run would produce duplicated `_x` / `_y` columns and the
# model would be given unexpected feature names.
df_new_test = test_df.drop(columns=flags_df_test.columns, errors='ignore')
df_new_test = df_new_test.merge(flags_df_test, right_index = True, left_index = True, how = 'inner')

test_df = df_new_test
test_df

In [None]:
# How many test rows carry each value of Flag_outliers_1.
test_df['Flag_outliers_1'].value_counts()

In [None]:
# How many test rows carry each value of Flag_outliers_4.
flags_df_test['Flag_outliers_4'].value_counts()

In [None]:
# How many test rows carry each value of Flag_outliers_5.
flags_df_test['Flag_outliers_5'].value_counts()

### `7.2` Prepare For Submission

In [None]:
# Start the submission frame from the test IDs (it keeps test_df's index).
df_submission = pd.DataFrame({"ID": test_df["ID"]})
df_submission

In [None]:
# Predict on the test features. ID is dropped into a separate frame instead of
# overwriting test_df, so re-running this cell does not fail on a missing
# 'ID' column.
X_submission = test_df.drop(columns = ['ID'])
# Flatten the predictions so a plain 1-D column is assigned even if the model
# returns them as a column vector.
y_pred_submission = np.ravel(cat.predict(X_submission))

df_submission["y"]= y_pred_submission
# Translate the integer class codes back to the bean names.
df_submission["y"] = df_submission["y"].map({0: 'BARBUNYA', 1: 'BOMBAY', 2: 'CALI', 3: 'DERMASON', 4: 'HOROZ', 5: 'SEKER', 6: 'SIRA'})

df_submission

In [None]:
# Write the ID and predicted class columns to submission.csv, without the index.
df_submission[['ID', 'y']].to_csv('submission.csv', index=False)