<img src="https://media.giphy.com/media/PAqjdPkJLDsmBRSYUp/giphy.gif" width=80%>

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(root, file_name))



In [None]:
def reduce_memory_usage(df):
    """Shrink the columns of ``df`` in place to smaller dtypes and return ``df``.

    * ``object`` columns become ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      signed / unsigned integer type whose range contains the column's
      min and max (bounds are inclusive, so -128..127 fits ``int8``).
    * NumPy float columns are cast to ``float16`` or ``float32`` when the
      value range allows it, otherwise they are left alone.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'object':
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Anything else
        # (bool, datetime, extension dtypes, ...) must not reach the numeric
        # comparisons below, and an empty column has no min/max to inspect.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue
        if len(df[col]) == 0:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            # Plain Python ints keep the range checks free of NumPy promotion.
            lowest, highest = int(c_min), int(c_max)
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): only the value *range* is checked, so a cast to
            # float16/float32 can round values; confirm the loss of precision
            # is acceptable for float features.
            # An all-NaN column yields NaN bounds; both comparisons are then
            # False and the column keeps its dtype.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    return df

In [None]:
# Directory holding the competition files.
DATA_DIR = "../input/tabular-playground-series-dec-2021"

# reduce_memory_usage shrinks the frame in place and returns that same frame.
# Binding the result keeps the data flow explicit and stops the cell from
# echoing an entire DataFrame as its output.
df = reduce_memory_usage(pd.read_csv(f"{DATA_DIR}/train.csv"))
test = reduce_memory_usage(pd.read_csv(f"{DATA_DIR}/test.csv"))

In [None]:
# Preview the first rows of the training frame to sanity-check the load.
df.head()

<img src="https://media.giphy.com/media/l4RKhOL0xiBdbgglFi/giphy.gif" width=50%>

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

# Checking for NULLs in the data

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Checking for data types in the DataFrame

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Report how many distinct values each column holds (a missing value, if any,
# counts as one distinct value).
for col, n_distinct in df.nunique(dropna=False).items():
    print(f"The total unique values in {col} are {n_distinct}")

**Soil_Type7** and **Soil_Type15** each contain only a single unique value, so they carry no information and are removed from the data frame.

In [None]:
# Drop the two soil-type columns that hold a single value. Rebinding the result
# (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns=["Soil_Type7", "Soil_Type15"], errors="ignore")

<img src="https://media.giphy.com/media/XfnuZsoKN5VCjyynHn/giphy.gif">

# Data Description
- Elevation - Elevation in meters
- Aspect - Aspect in degrees azimuth
- Slope - Slope in degrees
- Horizontal_Distance_To_Hydrology - Horz Dist to nearest surface water features
- Vertical_Distance_To_Hydrology - Vert Dist to nearest surface water features
- Horizontal_Distance_To_Roadways - Horz Dist to nearest roadway
- Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice
- Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice
- Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice
- Horizontal_Distance_To_Fire_Points - Horz Dist to nearest wildfire ignition points
- Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation
- Soil_Type (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation
- Cover_Type (7 types, integers 1 to 7) - Forest Cover Type designation

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's built-in 'ggplot' style sheet to the figures drawn from here on.
plt.style.use('ggplot')

In [None]:
# Class balance of the target. The column is passed by keyword: recent seaborn
# releases treat the first positional argument of countplot as `data`, not `x`,
# so a positional Series is no longer counted per category.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x="Cover_Type", data=df, ax=ax)
ax.set_title("Number of rows per Cover_Type")
# plt.show() renders the figure (the original plt.plot() only added an empty line plot).
plt.show()

As the plot shows, the dataset is imbalanced across the cover types.

In [None]:
# Exact number of rows per cover type, most frequent first.
df['Cover_Type'].value_counts(ascending=False)

# EDA

In [None]:
# Kernel-density estimate of columns 1-10 of df, one per panel of a 2x5 grid.
# Pairing panels and columns with zip() replaces the manual row/column counters
# and the bare `except`, which would have hidden any plotting error.
fig, axes = plt.subplots(2, 5, figsize=(30, 15))
for ax, col in zip(axes.flat, df.columns[1:11]):
    sns.kdeplot(df[col], ax=ax)
    # Title the panel that was just drawn: plt.gca() returns pyplot's current
    # axes, which need not be the axes passed through `ax=`.
    ax.set_title(f"{col}")
plt.show()

# Getting Outliers

<img src="https://media.giphy.com/media/OSUuEuaz0imBy/giphy.gif">

In [None]:
def outlier_function(df, col_name):
    first_quartile = np.percentile(np.array(df[col_name].tolist()), 25)
    third_quartile = np.percentile(np.array(df[col_name].tolist()), 75)
    IQR = third_quartile - first_quartile
    
    upper_limit = third_quartile+(3*IQR)
    lower_limit = first_quartile-(3*IQR)
    outlier_count = 0
    
    for value in df[col_name].tolist():
        if (value < lower_limit) | (value > upper_limit):
            outlier_count += 1
    return lower_limit, upper_limit, outlier_count

In [None]:
# Inspect columns 1-10, the same slice the KDE and box-plot cells use
# (range(1, 11)). The previous [:10] slice included column 0 (presumably the
# row Id — TODO confirm) and left out the tenth feature.
for col in df.columns[1:11]:
    _, _, n_outliers = outlier_function(df, col)
    if n_outliers > 0:
        print(f"There are {n_outliers} outliers in {col}")

In [None]:
# Box plot of columns 1-10 of df against the last column of df, one per panel.
# zip() replaces the manual row/column counters and the bare `except`, which
# would have hidden any plotting error.
fig_out, axes_out = plt.subplots(2, 5, figsize=(30, 15))
group_col = df.columns[-1]
for ax, col in zip(axes_out.flat, df.columns[1:11]):
    sns.boxplot(y=col, x=group_col, data=df, ax=ax)
    # Title the panel that was just drawn: plt.gca() returns pyplot's current
    # axes, which need not be the axes passed through `ax=`.
    ax.set_title(f"{col}")
plt.show()

In [None]:
# Heatmap of the pairwise correlation matrix of df's columns.
sns.heatmap(df.corr())

# Model Training

In [None]:
# Keyword arguments that are later unpacked into CatBoostClassifier(**cb_params).
cb_params = dict(
    iterations=10000,
    learning_rate=0.218904169525507,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    l2_leaf_reg=1.6163189485316596,
    bagging_temperature=0.14353551008899088,
    random_strength=1.29,
    depth=10,
    grow_policy='SymmetricTree',
    leaf_estimation_method='Gradient',
    od_type='Iter',
    early_stopping_rounds=300,
    border_count=254,
    use_best_model=True,
    min_data_in_leaf=150,
    task_type='GPU',
    random_seed=42,
)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import pickle
from sklearn import model_selection
from catboost import CatBoostClassifier

# Creating Data for Minority Classes

In [None]:
# Append six extra copies of every Cover_Type 5 row, then six extra copies of
# every Cover_Type 4 row, and renumber the index from 0.
# NOTE(review): the copies are created before the cross-validation split, so
# identical rows can end up in both the training and the validation part of a
# fold — confirm this is acceptable for the reported scores.
minority_copies = [
    df[df["Cover_Type"] == cover_type]
    for cover_type in (5, 4)
    for _ in range(6)
]
df = pd.concat([df, *minority_copies], ignore_index=True)

In [None]:
# Drop the same two soil-type columns from the test frame. Rebinding the result
# (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
test = test.drop(columns=["Soil_Type7", "Soil_Type15"], errors="ignore")

## Thanks to this [Discussion](https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/293373) for providing such useful insights into the data

In [None]:
# Wrap Aspect: add 360 to negative values and subtract 360 from values above 359.
# A single .loc[mask, column] assignment replaces the chained form
# frame["Aspect"][mask] += ..., which writes to an intermediate object and is
# not guaranteed to update the frame (it never does under pandas Copy-on-Write).
for frame in (df, test):
    frame.loc[frame["Aspect"] < 0, "Aspect"] += 360
    frame.loc[frame["Aspect"] > 359, "Aspect"] -= 360

In [None]:
# Clamp the three hillshade columns of both frames to the 0..255 range:
# values below 0 become 0, values above 255 become 255.
hillshade_cols = ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm")
for frame in (df, test):
    for shade_col in hillshade_cols:
        frame.loc[frame[shade_col] < 0, shade_col] = 0
        frame.loc[frame[shade_col] > 255, shade_col] = 255

In [None]:
# Features are every column except the row identifier and the target.
# Selecting by name instead of by position ([1:-1]) keeps the feature set right
# if the column order ever changes; Index.drop raises KeyError when a named
# column is missing, so a schema change fails loudly.
feature_col = df.columns.drop(["Id", "Cover_Type"])
X = df[feature_col]
y = df["Cover_Type"]

In [None]:
# Take the same feature columns, in the same order, from the test frame.
X_test=test[feature_col]

# 👍 Building the CatBoost Model


<img src="https://media.giphy.com/media/lJNoBCvQYp7nq/giphy.gif">

In [None]:
%%time
# Setting up fold parameters
splits = 5
skf = model_selection.StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Creating an array of zeros for storing "out of fold" predictions
oof_preds = np.zeros((X.shape[0],))
preds = np.zeros((X_test.shape[0],len(np.unique(y))))
model_fi = 0
total_mean_acc = 0

# Generating folds and making training and prediction for each of 10 folds
for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
    y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
    
    model = CatBoostClassifier(**cb_params)
    model.fit(X_train, y_train,
              verbose=False,
              eval_set=(X_valid, y_valid),
              )
    
    # Getting mean test data predictions (i.e. devided by number of splits)
    preds += model.predict_proba(X_test) / splits
    
    # Getting mean feature importances (i.e. devided by number of splits)
    model_fi += model.feature_importances_ / splits
    
    # Getting validation data predictions. Each fold model makes predictions on an unseen data.
    # So in the end it will be completely filled with unseen data predictions.
    # It will be used to evaluate hyperparameters performance only.
    
    oof_preds[valid_idx] = model.predict(X_valid).flatten()
    
    # Getting score for a fold model
    fold_acc = accuracy_score(y_valid, oof_preds[valid_idx])
    
    print(f"Fold {num} accuracy: {fold_acc}")
    print(classification_report(y_valid,oof_preds[valid_idx]))
    
    # Getting mean score of all fold models (i.e. devided by number of splits)
    total_mean_acc += fold_acc / splits
    
print(f"\nOverall ROC AUC: {total_mean_acc}")

In [None]:
# Horizontal bar chart of the fold-averaged feature importances, smallest first.
plt.figure(figsize=(45,30))
# Note: this changes the font size globally, i.e. for any figure drawn afterwards too.
plt.rcParams.update({'font.size': 30})
idxs = np.argsort(model_fi)
plt.title("Feature Importance")
plt.barh(range(len(idxs)),model_fi[idxs],align="center")
plt.yticks(range(len(idxs)),[feature_col[i] for i in idxs])
# The importances come from the CatBoost fold models, not from a random forest.
plt.xlabel("CatBoost Feature Importance")
plt.tight_layout()
plt.show()

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Predict from the fold-averaged probabilities accumulated in `preds` instead of
# from `model`, which is only the classifier of the last fold. argmax gives the
# position of the most likely class; `classes_` maps that position back to the
# Cover_Type label (predict_proba columns follow the order of `classes_`).
result = pd.DataFrame(np.asarray(model.classes_)[np.argmax(preds, axis=1)])

In [None]:
# Put the test Ids and the predictions side by side (aligned on the row index).
submit = pd.concat([test[["Id"]], result], axis=1)

In [None]:
# Label both columns; the prediction column arrives with a default integer name.
submit.columns=["Id","Cover_Type"]

In [None]:
# Preview the first rows of the submission frame before writing it out.
submit.head()

In [None]:
# Write the submission to submission.csv without the row index.
submit.to_csv("submission.csv",index=False)

# Please Upvote if you Liked what you saw!! Helps a lot😁

<img src="https://media.giphy.com/media/4LM3elgbccSje/giphy.gif">