## Imports

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
# from google.colab import files
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from scipy import sparse
# import kaggle
sns.set_style("darkgrid")
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# files.upload() 
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/kaggle.json
# !chmod 600 /root/.kaggle/kaggle.json
# !kaggle competitions download tabular-playground-series-dec-2021

# Intro

reduce memory usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to a narrower dtype where possible.

    Signed-integer columns (int16/int32/int64) are cast to the smallest of
    int8, int16, int32, int64 whose range contains the column's min and max
    (bounds inclusive). Float columns are cast to float32 when their min and
    max lie strictly inside the float32 range, otherwise to float64. Columns
    of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            # Inclusive bounds: a column whose max is exactly 127 still fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if limits.min < c_min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read each competition CSV and shrink its dtypes straight away.
train = pd.read_csv("train.csv").pipe(reduce_mem_usage)
test = pd.read_csv("test.csv").pipe(reduce_mem_usage)

train.head()

In [None]:
# Target column: Cover_Type.
# 54 feature columns, not counting Id.
# Print column names, dtypes and non-null counts of the training frame.
train.info()

# Some geographical notations:

* **Altitude** (**slope**)

The altitude is the angle (slope) of the light source above the horizon, from 0 degrees (on the horizon) to 90 degrees (overhead). The default is 45 degrees; please see the figure below.

 * **Hillshades**


Shaded relief, or hillshading, is a technique where a lighting effect is added to a map based on elevation variations within the landscape. It provides a clearer picture of the topography by mimicking the sun’s effects (illumination, shading and shadows) on hills and canyons.

https://earthquake.usgs.gov/education/geologicmaps/images/hillshades.jpg

* **Elevation** 

Elevation is the distance above sea level. Elevations are usually measured in meters or feet. They can be shown on maps by contour lines, which connect points with the same elevation.

In [None]:
# Class balance of the target: number of training rows per Cover_Type.
viz = (
    train.groupby(['Cover_Type'])
         .agg({'Cover_Type': 'count'})
         .rename(columns={'Cover_Type': 'cnt'})
         .reset_index()
)
print(viz)
# Plot as its own statement (not as part of a tuple) so the cell does not
# echo an "(Axes, None)" tuple as its output.
sns.barplot(data=viz, x='Cover_Type', y='cnt');
# very unbalanced sample

# Numerical Features

In [None]:
# Numerical features: every column whose name contains none of the markers
# for the soil / wilderness indicators, the row Id, or the target.
col = train.columns
not_soil_col = [
    name for name in col
    if not any(marker in name for marker in ('Soil', 'Id', 'Cover_Type', 'Wilder'))
]
not_soil_col

In [None]:
# Summary statistics of the numerical features only.
train[not_soil_col].describe()

Distribution of numerical features for different cover type

In [None]:
# Categorical copy of the target, used as the hue variable in the plots.
train['target_cat'] = train.Cover_Type.astype('category')

# Plot on a subsample of at most 100000 rows. The generator is seeded so the
# same rows are drawn on every run, and the size is capped at the number of
# rows so a small frame does not make random.sample raise ValueError.
rand_row = random.Random(12).sample(range(len(train)), min(100000, len(train)))
sample_for_viz = train.iloc[rand_row, :]

for feature in not_soil_col:
    # Skip the hue column itself in case it ended up in the feature list.
    if feature == 'target_cat':
        continue
    sns.displot(data=sample_for_viz, x=feature, hue='target_cat', kind='kde')
    plt.show()

Correlation matrix for target and numerical features

In [None]:
# Columns entering the correlation matrix: the numerical features plus the target.
corr_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Cover_Type',
]
plt.figure(figsize=(8, 8), dpi=150)
corr_matrix = train[corr_columns].corr().round(3)
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='Reds_r',
    annot=True,
    linewidths=0.5,
)

In [None]:
# Three side-by-side scatter plots of distance features, coloured by class.
plt.figure(figsize=(5, 5), dpi=150)
scatter_pairs = [
    ('Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'),
    ('Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points'),
    ('Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points'),
]
for position, (x_name, y_name) in enumerate(scatter_pairs, start=131):
    plt.subplot(position)
    sns.scatterplot(data=sample_for_viz, x=x_name, y=y_name, hue='target_cat')
# right=3 stretches the subplot area beyond the nominal figure width.
plt.subplots_adjust(left=None, bottom=None, right=3, top=None, wspace=1, hspace=None)

# Categorical features

Wilderness Area

In [None]:
# Wilderness-area indicator columns. The derived 'Wilderness_Area_cnt' column
# is excluded so that re-running this cell after it has been added to `train`
# still yields only the indicators.
wild_area_t = [i for i in train.columns if 'Wilder' in i and i != 'Wilderness_Area_cnt']
print(wild_area_t)
for feature in wild_area_t:
    # The label now matches what is printed: the distinct values, not their count.
    print('Unique values of {}: '.format(feature), np.unique(train[feature]))
train.loc[:, wild_area_t].head()

In [None]:
# Number of rows flagged for each wilderness area.
viz = train.agg({'Wilderness_Area1': 'sum', 'Wilderness_Area2': 'sum',
                 'Wilderness_Area3': 'sum', 'Wilderness_Area4': 'sum'})
viz.plot(kind='bar', color='aquamarine', edgecolor='black')
# Number of wilderness areas flagged per row. The sum must run across columns
# (axis=1): a column-wise aggregate is indexed by column name, so assigning it
# to a new column aligns on the row index and fills the column with NaN.
# The derived column itself is left out of the sum in case the cell is re-run.
train['Wilderness_Area_cnt'] = (
    train.loc[:, [c for c in wild_area_t if c != 'Wilderness_Area_cnt']].sum(axis=1)
)

In [None]:
# Per cover type, how many rows fall in each wilderness area.
viz = (
    train.groupby(['target_cat'])
         .agg({name: 'sum' for name in ('Wilderness_Area1', 'Wilderness_Area2',
                                        'Wilderness_Area3', 'Wilderness_Area4')})
         .reset_index()
)
viz.set_index('target_cat').transpose().plot(kind='bar')

In [None]:
# Same counts as a table: wilderness areas as rows, cover types as columns.
viz.set_index('target_cat').T

Soil Type

In [None]:
# Soil-type indicator columns and the number of rows flagged for each one.
soil_t = [name for name in train.columns if 'Soil_Type' in name]
soil_data = train[soil_t].sum().sort_values()

plt.figure(figsize=(5, 6), dpi=150)
soil_data.plot(kind='barh', color='orange', edgecolor='black')
# NOTE(review): the font size is changed after the plot call, so it presumably
# only applies to figures drawn later; confirm whether it was meant for this one.
plt.rcParams.update({'font.size': 8})

In [None]:
# Per cover type, how many rows are flagged for each soil type.
# No figure is opened here: the cell only displays a table, and the plot in the
# following cell creates its own figure.
aggregating = dict.fromkeys(soil_t, 'sum')
soil_data_by_target = train.groupby(['target_cat']).agg(aggregating)
soil_data_by_target

In [None]:
# Grouped bars: one group per soil type, one bar per cover type.
soil_data_by_target.T.plot(
    kind='bar',
    figsize=(24, 12),
    width=1,
    edgecolor='black',
)

# Models

In [None]:
# Drop the rows of cover type 5 before modelling.
train = train[train['Cover_Type'] != 5]

# Feature matrix: everything except the target (both encodings), the row Id and
# two soil indicators; the target vector is Cover_Type.
X = train.drop(columns=['target_cat', 'Cover_Type', 'Id', 'Soil_Type8', 'Soil_Type15'])
y = train['Cover_Type']

# 60 / 40 split with a fixed seed; X_test / y_test serve as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=12
)

In [None]:
# Class shares in the training and validation splits. Each split is normalised
# by its own length (the train counts used to be divided by len(y_test)).
for split_name, split in (('train', y_train), ('test', y_test)):
    split_share = pd.DataFrame(split).groupby('Cover_Type').agg({'Cover_Type': 'count'}) / len(split)
    split_share.plot(kind='bar', title=split_name)

In [None]:
# Absolute number of rows per cover type in the training split.
pd.DataFrame(y_train).groupby('Cover_Type').agg({'Cover_Type': 'count'})

In [None]:
# Absolute number of rows per cover type in the validation split.
pd.DataFrame(y_test).groupby('Cover_Type').agg({'Cover_Type': 'count'})

Resampling -- trying to reduce the size of the dataset and correct the class imbalance without losing generality

In [None]:
from sklearn.utils import resample

# Classes 1 and 2 dominate the training split, so only they are downsampled;
# every other class is kept in full.
dominating_classes = pd.DataFrame(y_train).query('Cover_Type == 1 | Cover_Type == 2')  # more than half of train set is the class 1 or 2
minority_classes = pd.DataFrame(y_train).query('Cover_Type != 1 & Cover_Type != 2')
resampled_domin_classes = resample(
    dominating_classes,
    replace=False,
    # Sampling without replacement cannot draw more rows than exist.
    n_samples=min(400000, len(dominating_classes)),
    # Fixed seed (same value as the train/test split) so the downsampled set,
    # and every model fitted on it, is reproducible.
    random_state=12,
)
resampled_domin_classes

In [None]:
# Downsampled training target = sampled majority rows + all minority rows;
# the matching feature rows are picked by index label.
y_train_down = pd.concat([resampled_domin_classes, minority_classes])
X_train_down = X_train.loc[y_train_down.index]

# Class counts after downsampling, as a chart and as a table.
down_counts = y_train_down.groupby('Cover_Type').agg({'Cover_Type': 'count'})
down_counts.plot(kind='bar')
plt.show()
down_counts

In [None]:
# X_train.iloc[y_train_down.index,:]


In [None]:
# Rows per cover type in the downsampled training target.
y_train_down.groupby('Cover_Type').agg({'Cover_Type': 'count'})


In [None]:
# classes = np.unique(y)
# for c in classes:
#   y_train_c = y_train.apply(lambda x: 1 if x == c else 0)
#   log_reg = LogisticRegression(C = 0.01)
#   log_reg.fit(X_train, y_train_c)
#   accuracy = metrics.accuracy_score(log_reg.predict(X_train), y_train)
#   print('train accuracy = ', accuracy)

In [None]:
# classes = np.unique(y)
# for c in classes:
#   y_train_c = y_train.apply(lambda x: 1 if x == c else 0)
#   log_reg = SGDClassifier(loss='log', penalty='l2', learning_rate = 'optimal', alpha = 0.0001, n_jobs = -1)
#   log_reg.fit(X_train, y_train_c)
#   accuracy = metrics.accuracy_score(log_reg.predict(X_train), y_train)
#   print('train accuracy = ', accuracy)

In [None]:
# Map each cover type to its share of the downsampled training set.
# NOTE(review): these weights grow with class frequency, so the larger classes
# get the larger weights - confirm this is intended rather than the inverse.
class_weights = dict(
    y_train_down.groupby('Cover_Type')
                .agg({'Cover_Type': 'count'})
                .Cover_Type
                .div(len(y_train_down))
)
class_weights

## Trees and Random Forest

In [None]:
# from sklearn.model_selection import GridSearchCV

# hyperparam_rf = {'n_estimators': [i for i in range(50,120,20)],
#               'max_depth': [i for i in range(20,41,10)],
#               'min_samples_leaf': [i for i in range(1,3)]
#               }
# hyperparam_rf              

In [None]:
# Scratch check: candidate values 50, 60, ..., 190.
list(range(50, 200, 10))

In [None]:
# Random-forest grid search. The cells below (classification reports and the
# first submission) use `clf`, so it has to be defined for the notebook to run
# top to bottom. The grid is spelled out here so the cell is self-contained.
# NOTE(review): this refits the forest for every grid point and fold and is
# expensive on the full downsampled set.
hyperparam_rf = {'n_estimators': list(range(50, 120, 20)),
                 'max_depth': list(range(20, 41, 10)),
                 'min_samples_leaf': list(range(1, 3))}
rf = RandomForestClassifier(warm_start=True, n_jobs=-1, class_weight=class_weights)
clf = GridSearchCV(rf, hyperparam_rf, cv=3, verbose=3)
clf.fit(X_train_down, y_train_down.Cover_Type)

In [None]:
# print('GSCV best score: ',clf.best_score_)
# clf.best_params_

In [None]:
# Per-class precision / recall of the RF search on the data it was fitted on.
print(
    classification_report(
        y_true=y_train_down,
        y_pred=clf.predict(X_train_down),
    )
)

In [None]:
# Per-class precision / recall of the RF search on the held-out split.
print(
    classification_report(
        y_true=y_test,
        y_pred=clf.predict(X_test),
    )
)

Submit 1

In [None]:
# Build the test features the same way as the training matrix: add the per-row
# wilderness-area count that X_train contains, then drop the columns that were
# dropped from X. (The dangling `test =` line was a syntax error.)
test_features = (
    test.assign(Wilderness_Area_cnt=test.loc[:, wild_area_t].sum(axis=1))
        .drop(['Id', 'Soil_Type8', 'Soil_Type15'], axis=1)
)
submis = pd.DataFrame(clf.predict(test_features)).rename(columns={0: 'Cover_Type'})

In [None]:
# Pair every test Id with its predicted class and write the submission file.
(
    pd.concat([test['Id'], submis], axis=1)
      .to_csv('grid_search_RF_10_12_2021_19_22.csv', index=False)
)

In [None]:
# Single random forest with fixed hyper-parameters, fitted on the downsampled set.
rf = RandomForestClassifier(
    n_estimators=110,
    max_depth=40,
    min_samples_leaf=1,
    class_weight=class_weights,
    warm_start=True,
    n_jobs=-1,
)
rf.fit(X_train_down, y_train_down['Cover_Type'])

In [None]:
# Eleven most important features of the random forest, as a horizontal bar chart.
plt.figure(figsize=(10, 10), dpi=70)
fi_rf = (
    pd.DataFrame({'FeatureImportance': rf.feature_importances_}, index=X_train.columns)
      .sort_values(by='FeatureImportance', ascending=False)
      .head(11)
)
sns.barplot(data=fi_rf.T, orient='h')

## Boosting

In [None]:
import lightgbm

In [None]:
# Baseline LightGBM classifier (depth capped at 10) on the downsampled set.
lgb = lightgbm.LGBMClassifier(
    class_weight=class_weights,
    max_depth=10,
)
lgb.fit(X_train_down, y_train_down['Cover_Type'])

In [None]:
# Baseline LightGBM on the downsampled data it was fitted on.
print(
    classification_report(
        y_true=y_train_down,
        y_pred=lgb.predict(X_train_down),
    )
)

In [None]:
# Baseline LightGBM on the full (not downsampled) training split.
print(
    classification_report(
        y_true=y_train,
        y_pred=lgb.predict(X_train),
    )
)

In [None]:
# Baseline LightGBM on the held-out split.
print(
    classification_report(
        y_true=y_test,
        y_pred=lgb.predict(X_test),
    )
)

In [None]:
# Search grid for LightGBM: 2 x 2 x 4 combinations.
hyperparam_lgmb = dict(
    n_estimators=list(range(300, 320, 10)),
    max_depth=list(range(60, 64, 3)),
    num_leaves=list(range(100, 140, 10)),
)
hyperparam_lgmb

In [None]:
# 3-fold grid search over the LightGBM grid, fitted on the downsampled set.
lgbm = lightgbm.LGBMClassifier(n_jobs=-1, class_weight=class_weights)
clf_lgbm = GridSearchCV(estimator=lgbm, param_grid=hyperparam_lgmb, cv=3, verbose=3)
clf_lgbm.fit(X_train_down, y_train_down['Cover_Type'])

In [None]:
# Best score found by the LightGBM grid search, followed by the parameter
# combination that produced it.
print('GSCV LGBM best score: ',clf_lgbm.best_score_)
clf_lgbm.best_params_

In [None]:
# Tuned LightGBM on the held-out split.
print(
    classification_report(
        y_true=y_test,
        y_pred=clf_lgbm.predict(X_test),
    )
)

In [None]:
# The per-row wilderness-area count used to be computed and thrown away; it is
# now attached to the test features so they have the same columns as X_train.
test_features = (
    test.assign(Wilderness_Area_cnt=test.loc[:, wild_area_t].sum(axis=1))
        .drop(['Id', 'Soil_Type8', 'Soil_Type15'], axis=1)
)
submis = pd.DataFrame(clf_lgbm.predict(test_features)).rename(columns={0: 'Cover_Type'})
pd.concat([test['Id'], submis], axis=1).to_csv('grid_search_LGBM_14_12_2021_00_50.csv', index=False)

In [None]:
# Second LightGBM submission file. The per-row wilderness-area count is
# attached to the test features (it used to be computed and discarded) so the
# columns match X_train.
test_features = (
    test.assign(Wilderness_Area_cnt=test.loc[:, wild_area_t].sum(axis=1))
        .drop(['Id', 'Soil_Type8', 'Soil_Type15'], axis=1)
)
submis = pd.DataFrame(clf_lgbm.predict(test_features)).rename(columns={0: 'Cover_Type'})
pd.concat([test['Id'], submis], axis=1).to_csv('grid_search_LGBM_14_12_2021_18_50.csv', index=False)

In [None]:
# Column names of the training frame, including the columns added above.
train.columns

In [None]:
# LightGBM with hand-picked parameters (depth 39, 220 trees, 80 leaves).
# NOTE(review): the next cell rebinds `lgbm_2` to a different configuration,
# so this fit is discarded when the notebook is run top to bottom.
lgbm_2 = lightgbm.LGBMClassifier(
    n_estimators=220,
    max_depth=39,
    num_leaves=80,
    class_weight=class_weights,
    n_jobs=-1,
)
lgbm_2.fit(X_train_down, y_train_down['Cover_Type'])

In [None]:
# LightGBM with depth 60, 310 trees and 130 leaves; this is the `lgbm_2`
# that the feature-importance cell below reads.
lgbm_2 = lightgbm.LGBMClassifier(
    n_estimators=310,
    max_depth=60,
    num_leaves=130,
    class_weight=class_weights,
    n_jobs=-1,
)
lgbm_2.fit(X_train_down, y_train_down['Cover_Type'])

In [None]:
def _top_importances(model):
    """Return the 11 largest feature importances of ``model`` as a one-column frame."""
    table = pd.DataFrame(dict(zip(X_train.columns, list(model.feature_importances_))),
                         index=['FeatureImportance'])
    return table.transpose().sort_values(by='FeatureImportance', ascending=0).iloc[:11,]


# Side-by-side comparison of the most important features of LightGBM and RF.
fi_lgbm = _top_importances(lgbm_2)
fi_rf = _top_importances(rf)

plt.figure(figsize=(10, 10), dpi=70)
for position, panel_title, table in ((121, 'LGBM', fi_lgbm), (122, 'RF', fi_rf)):
    plt.subplot(position)
    sns.barplot(data=table.transpose(), orient='h')
    plt.title(panel_title)
plt.subplots_adjust(left=1, bottom=None, right=2, top=None, wspace=1, hspace=None)

# Logistic Regression

In [None]:
# Regularisation strengths to try for the SGD-based logistic regression.
hyperparam_log_reg = dict(alpha=[1, 0.01, 0.001, 0.0005, 0.0001, 0.00001])
hyperparam_log_reg

In [None]:
from sklearn.preprocessing import StandardScaler


def _standardise(frame, scaler, numeric_cols):
    """Return ``frame[numeric_cols]`` transformed by the already-fitted ``scaler``."""
    return pd.DataFrame(scaler.transform(frame.loc[:, numeric_cols]),
                        index=frame.index, columns=numeric_cols)


# The scaler is fitted once, on the numerical columns of the data the linear
# model is trained on, and the same fitted transform is applied to every other
# split. Re-fitting per split would standardise each one with its own mean and
# variance, so the validation and test features would not be on the scale the
# model was trained on.
ss = StandardScaler()
ss.fit(X_train_down.loc[:, not_soil_col])

scaled_numerical = _standardise(X_train, ss, not_soil_col)
scaled_numerical_down = _standardise(X_train_down, ss, not_soil_col)

X_train_new = pd.concat([scaled_numerical, X_train.drop(columns=not_soil_col)], axis=1)
X_train_new_down = pd.concat([scaled_numerical_down, X_train_down.drop(columns=not_soil_col)], axis=1)

# The test frame needs the same engineered column as the training features.
test['Wilderness_Area_cnt'] = test.loc[:, wild_area_t].sum(axis=1)
scaled_numerical_test = _standardise(test, ss, not_soil_col)
# NOTE(review): test_new still carries every non-numerical column of `test`
# (this cell drops none of them) - align it with the training columns before
# predicting with it.
test_new = pd.concat([scaled_numerical_test, test.drop(columns=not_soil_col)], axis=1)

scaled_numerical_valid = _standardise(X_test, ss, not_soil_col)
X_test_new = pd.concat([scaled_numerical_valid, X_test.drop(columns=not_soil_col)], axis=1)

In [None]:
# Grid search over the SGD-based logistic regression on the scaled, downsampled
# features. The cells below use `clf_log_reg`, so it has to be defined for the
# notebook to run top to bottom.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' -
# confirm against the installed version.
log_reg = SGDClassifier(loss='log', n_jobs=-1, class_weight=class_weights, warm_start=True, verbose=1)
clf_log_reg = GridSearchCV(log_reg, hyperparam_log_reg, cv=3, verbose=3)
clf_log_reg.fit(X_train_new_down, y_train_down.Cover_Type)

In [None]:
# Best score found by the logistic-regression grid search, followed by the
# parameter value that produced it.
print('GSCV best score: ',clf_log_reg.best_score_)
clf_log_reg.best_params_

In [None]:
# Logistic regression on the scaled held-out split.
print(
    classification_report(
        y_true=y_test,
        y_pred=clf_log_reg.predict(X_test_new),
    )
)

In [None]:
# !pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost multi-class model; every column that is not a numerical feature is
# passed to it as a categorical feature.
cat_mod = CatBoostClassifier(
    iterations=10000,
    loss_function='MultiClass',
    class_weights=class_weights,
)
cat_mod.fit(
    X_train_down,
    y_train_down['Cover_Type'],
    cat_features=list(X_train_down.drop(columns=not_soil_col).columns),
    plot=True,
)

In [None]:
# CatBoost on the held-out split.
print(
    classification_report(
        y_true=y_test,
        y_pred=cat_mod.predict(X_test),
    )
)

In [None]:
# CatBoost submission. The per-row wilderness-area count is attached to the
# test features explicitly (it used to be computed and discarded) so the
# columns match the training matrix regardless of which cells ran before.
test_features = (
    test.assign(Wilderness_Area_cnt=test.loc[:, wild_area_t].sum(axis=1))
        .drop(['Id', 'Soil_Type8', 'Soil_Type15'], axis=1)
)
submis = pd.DataFrame(cat_mod.predict(test_features)).rename(columns={0: 'Cover_Type'})
pd.concat([test['Id'], submis], axis=1).to_csv('cat_boost_15_12_2021_14_18.csv', index=False)