In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Dataset-1: Telco Customer Churn 

In [2]:
url = 'https://raw.githubusercontent.com/Shankar0x/MultiStackClassifier/main/Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(url)

In [3]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


## Preprocessing dataset 1

In [4]:
df.drop(['customerID'],axis=1,inplace=True)

In [5]:
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
df['TotalCharges'] = pd.to_numeric(df.TotalCharges,errors='coerce')
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [7]:
df[np.isnan(df['TotalCharges'])]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [8]:
# dropping all tenure rows with 0 values
df.drop(labels=df[df['tenure']==0].index,axis=0,inplace=True)
df[df['tenure']==0].index

Int64Index([], dtype='int64')

In [9]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Data Preprocessing

In [13]:
from sklearn import preprocessing
LE = preprocessing.LabelBinarizer()
df['gender'] = LE.fit_transform(list(df['gender']))
df['SeniorCitizen'] = LE.fit_transform(list(df['SeniorCitizen']))
df['Partner'] = LE.fit_transform(list(df['Partner']))
df['Dependents'] = LE.fit_transform(list(df['Dependents']))
df['PhoneService'] = LE.fit_transform(list(df['PhoneService']))
df['PaperlessBilling'] = LE.fit_transform(list(df['PaperlessBilling']))
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,1,0,0,0,34,1,No,DSL,Yes,No,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,1,0,0,0,45,0,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,0,0,0,0,2,1,No,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [14]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
OE=OrdinalEncoder()
df['MultipleLines']= OE.fit_transform(df.MultipleLines.values.reshape(-1,1))
df['InternetService'] = OE.fit_transform(df.InternetService.values.reshape(-1,1))
df['OnlineSecurity'] = OE.fit_transform(df.OnlineSecurity.values.reshape(-1,1))
df['OnlineBackup']=OE.fit_transform(df.OnlineBackup.values.reshape(-1,1))
df['DeviceProtection'] = OE.fit_transform(df.DeviceProtection.values.reshape(-1,1))
df['TechSupport'] = OE.fit_transform(df.TechSupport.values.reshape(-1,1))
df['StreamingTV']= OE.fit_transform(df.StreamingTV.values.reshape(-1,1))
df['StreamingMovies'] = OE.fit_transform(df.StreamingMovies.values.reshape(-1,1))
df['Contract']=OE.fit_transform(df.Contract.values.reshape(-1,1))
df['PaymentMethod']= OE.fit_transform(df.PaymentMethod.values.reshape(-1,1))
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1,2.0,29.85,29.85,0
1,1,0,0,0,34,1,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,1.0,0,3.0,56.95,1889.50,0
2,1,0,0,0,2,1,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1,3.0,53.85,108.15,1
3,1,0,0,0,45,0,1.0,0.0,2.0,0.0,2.0,2.0,0.0,0.0,1.0,0,0.0,42.30,1840.75,0
4,0,0,0,0,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2.0,0.0,2.0,0.0,2.0,2.0,2.0,2.0,1.0,1,3.0,84.80,1990.50,0
7039,0,0,1,1,72,1,2.0,1.0,0.0,2.0,2.0,0.0,2.0,2.0,1.0,1,1.0,103.20,7362.90,0
7040,0,0,1,1,11,0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0,29.60,346.45,0
7041,1,1,1,0,4,1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,3.0,74.40,306.60,1


In [15]:
# Check the data types of all values in each column to determine if any conversion have to be made.
print(df.dtypes)

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines       float64
InternetService     float64
OnlineSecurity      float64
OnlineBackup        float64
DeviceProtection    float64
TechSupport         float64
StreamingTV         float64
StreamingMovies     float64
Contract            float64
PaperlessBilling      int64
PaymentMethod       float64
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object


## Label encoding churn dataset

In [16]:
from sklearn import preprocessing
LE = preprocessing.LabelBinarizer()
df['Churn'] = LE.fit_transform(list(df['Churn']))

In [17]:
categorical = ['gender','SeniorCitizen','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod']
continuous = ['tenure','MonthlyCharges','TotalCharges']

## Converting data types to appropriate types

In [18]:
ii8 = np.iinfo(np.int8)
ii16 = np.iinfo(np.int16)
ii32 = np.iinfo(np.int32)
ii64 = np.iinfo(np.int64)

print(ii8.max)
for category in categorical:
    max_value = df[category].max()
    min_value = df[category].min()

    if max_value < ii8.max:
        print("Converting data types of ", category, " to int8 as max value is only : ", max_value)
        df[category] = df[category].astype(np.int8)

    elif max_value < ii16.max:
        print("Converting data types of ", category, " to int16 as max value is only : ", max_value)
        df[category] = df[category].astype(np.int16)

    elif max_value < ii32.max:
        print("Converting data types of ", category, " to int32 as max value is only : ", max_value)
        df[category] = df[category].astype(np.int32)

    elif max_value < ii64.max:
        print("Converting data types of ", category, " to int64 as max value is only : ", max_value)
        df[category] = df[category].astype(np.int64)

print("Data types after performing conversions:")
df.dtypes

127
Converting data types of  gender  to int8 as max value is only :  1
Converting data types of  SeniorCitizen  to int8 as max value is only :  1
Converting data types of  Partner  to int8 as max value is only :  1
Converting data types of  Dependents  to int8 as max value is only :  1
Converting data types of  PhoneService  to int8 as max value is only :  1
Converting data types of  MultipleLines  to int8 as max value is only :  2.0
Converting data types of  InternetService  to int8 as max value is only :  2.0
Converting data types of  OnlineSecurity  to int8 as max value is only :  2.0
Converting data types of  OnlineBackup  to int8 as max value is only :  2.0
Converting data types of  DeviceProtection  to int8 as max value is only :  2.0
Converting data types of  TechSupport  to int8 as max value is only :  2.0
Converting data types of  StreamingTV  to int8 as max value is only :  2.0
Converting data types of  StreamingMovies  to int8 as max value is only :  2.0
Converting data typ

gender                 int8
SeniorCitizen          int8
Partner                int8
Dependents             int8
tenure                int64
PhoneService           int8
MultipleLines          int8
InternetService        int8
OnlineSecurity         int8
OnlineBackup           int8
DeviceProtection       int8
TechSupport            int8
StreamingTV            int8
StreamingMovies        int8
Contract               int8
PaperlessBilling       int8
PaymentMethod          int8
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [19]:
f16 = np.finfo(np.float16)
f32 = np.finfo(np.float32)
f64 = np.finfo(np.float64)

category = 'TotalCharges'

max_value = df[category].max()
min_value = df[category].min()


if max_value < f16.max:
    print("Converting data types of ", category, " to f16 as max value is only : ", max_value)
    df[category] = df[category].astype(np.float16)

elif max_value < f32.max:
    print("Converting data types of ", category, " to f32 as max value is only : ", max_value)
    df[category] = df[category].astype(np.float32)

elif max_value < f64.max:
    print("Converting data types of ", category, " to f64 as max value is only : ", max_value)
    df[category] = df[category].astype(np.float64)

print("New data type of ", category, " : ", df[category].dtype)

Converting data types of  TotalCharges  to f16 as max value is only :  8684.8
New data type of  TotalCharges  :  float16


## Train test split

In [33]:
X = df.iloc[:,:-1]
y = df['Churn']

In [34]:
from sklearn.model_selection import train_test_split as tts 
x_train, x_test, y_train, y_test = tts(X,y,test_size=0.2)

# Simple stacking classifier

A simple way to achieve this is to split your training set in half. Use the first half of your training data to train the level one classifiers. Then use the trained level one classifiers to make predictions on the second half of the training data. These predictions should then be used to train meta-classifier.
src: https://towardsdatascience.com/stacking-classifiers-for-higher-predictive-performance-566f963e4840

In [3]:
def simple_stacker(x_train,y_train, x_test, y_test):
  # Making validation set
  xtraining,xvalid,ytraining,yvalid = tts(x_train,y_train,test_size=0.5)

  from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  import xgboost as xgb
  from sklearn import tree
  from sklearn.neighbors import KNeighborsClassifier

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import StandardScaler
  from sklearn.svm import SVC

  from sklearn.naive_bayes import GaussianNB
  from sklearn.neural_network import MLPClassifier
  from sklearn.ensemble import AdaBoostClassifier

  from sklearn.metrics import accuracy_score

  #specifying the initial learners
  model1 = RandomForestClassifier()
  model2 = xgb.XGBClassifier()
  model3 = AdaBoostClassifier(n_estimators=100, random_state=0)
  # model4 = tree.DecisionTreeClassifier()
  # model5 = KNeighborsClassifier(n_neighbors=3)
  # model6 = make_pipeline(StandardScaler(), SVC(gamma='auto'))
  # model7 = GaussianNB()
  # model8 = MLPClassifier(random_state=1, max_iter=300)
  # model9 = AdaBoostClassifier(n_estimators=100, random_state=0)
  
  #training the initial learners
  model1.fit(xtraining,ytraining)
  model2.fit(xtraining,ytraining)
  model3.fit(xtraining,ytraining)
  
  #making predictions for the validation data
  preds1 = model1.predict(xvalid)
  preds2 = model2.predict(xvalid)
  preds3 = model3.predict(xvalid)

  #making predictions for the test data
  test_preds1 = model1.predict(x_test)
  test_preds2 = model2.predict(x_test)
  test_preds3 = model3.predict(x_test)

  train_stack = np.column_stack((preds1,preds2,preds3))
  test_stack = np.column_stack((test_preds1,test_preds2,test_preds3))

  from sklearn.tree import ExtraTreeClassifier
  final_model = ExtraTreeClassifier(random_state=0).fit(train_stack,yvalid)
  test_preds = final_model.predict(test_stack)
  print(round(accuracy_score(test_preds, y_test)*100,3))


In [70]:
simple_stacker(x_train, y_train, x_test, y_test)

79.815


## Multi stack classifier

In [4]:
def multistack(x, y, K):

  from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  import xgboost as xgb
  from sklearn import tree
  from sklearn.neighbors import KNeighborsClassifier

  from sklearn.pipeline import make_pipeline
  from sklearn.preprocessing import StandardScaler
  from sklearn.svm import SVC

  from sklearn.naive_bayes import GaussianNB
  from sklearn.neural_network import MLPClassifier
  from sklearn.ensemble import AdaBoostClassifier
  import lightgbm as lgb

  from sklearn.metrics import accuracy_score
  multi_x_train = x.copy()
  multi_y_train = y.copy()

  ################################################################################
  #                     TRAINING ALL CANDIDATE MODELS                            #
  ################################################################################


  # DT
  clf = tree.DecisionTreeClassifier()
  clf = clf.fit(multi_x_train, multi_y_train)
  y_pred_DT = clf.predict(multi_x_train)

  #RF
  random_forest = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 0)
  random_forest.fit(multi_x_train, multi_y_train)
  y_pred_RF = random_forest.predict(multi_x_train)

  #XGB
  from xgboost import XGBClassifier
  model = XGBClassifier()
  model.fit(multi_x_train, multi_y_train)
  y_pred_XGB = model.predict(multi_x_train)

  #LGBM
  clf = lgb.LGBMClassifier()
  clf.fit(multi_x_train, multi_y_train)
  y_pred_LGBM = clf.predict(multi_x_train)

  #SVC
  clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
  clf.fit(multi_x_train, multi_y_train)
  y_pred_SVC = clf.predict(multi_x_train)

  #LOGREG
  from sklearn.linear_model import LogisticRegression
  clf = LogisticRegression(random_state=0).fit(multi_x_train, multi_y_train)
  y_pred_LOGREG = clf.predict(multi_x_train)

  #KNN
  model = KNeighborsClassifier(n_neighbors=3)   
  model.fit(multi_x_train, multi_y_train)
  y_pred_KNN = model.predict(multi_x_train)

  # AdaBoost
  from sklearn.ensemble import AdaBoostClassifier
  clf = AdaBoostClassifier(n_estimators=100, random_state=0)
  clf.fit(multi_x_train, multi_y_train)
  y_pred_ADA = clf.predict(multi_x_train)

  multi_x_train = pd.DataFrame()
  multi_x_train['DT'] = y_pred_DT 
  multi_x_train['RF'] = y_pred_RF
  multi_x_train['XGB'] = y_pred_XGB
  multi_x_train['LGBM'] = y_pred_LGBM
  multi_x_train['SVC'] = y_pred_SVC 
  multi_x_train['KNN'] = y_pred_KNN
  multi_x_train['LOGREG'] = y_pred_LOGREG
  multi_x_train['ADABOOST'] = y_pred_ADA

  ################################################################################
  #                     ITERATING OVER META LEARNERS                             #
  ################################################################################


  for i in range(K):
    # DT
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(multi_x_train, multi_y_train)
    y_pred_DT = clf.predict(multi_x_train)

    #RF
    random_forest = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 0)
    random_forest.fit(multi_x_train, multi_y_train)
    y_pred_RF = random_forest.predict(multi_x_train)

    #XGB
    model = XGBClassifier()
    model.fit(multi_x_train, multi_y_train)
    y_pred_XGB = model.predict(multi_x_train)

    #LGBM
    clf = lgb.LGBMClassifier()
    clf.fit(multi_x_train, multi_y_train)
    y_pred_LGBM = clf.predict(multi_x_train)

    #SVC
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(multi_x_train, multi_y_train)
    y_pred_SVC = clf.predict(multi_x_train)

    #LOGREG
    clf = LogisticRegression(random_state=0).fit(multi_x_train, multi_y_train)
    y_pred_LOGREG = clf.predict(multi_x_train)

    #KNN
    model = KNeighborsClassifier(n_neighbors=3)   
    model.fit(multi_x_train, multi_y_train)
    y_pred_KNN = model.predict(multi_x_train)

    # AdaBoost
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf.fit(multi_x_train, multi_y_train)
    y_pred_ADA = clf.predict(multi_x_train)

    multi_x_train['DT'] = y_pred_DT 
    multi_x_train['RF'] = y_pred_RF
    multi_x_train['XGB'] = y_pred_XGB
    multi_x_train['LGBM'] = y_pred_LGBM
    multi_x_train['SVC'] = y_pred_SVC 
    multi_x_train['KNN'] = y_pred_KNN
    multi_x_train['LOGREG'] = y_pred_LOGREG
    multi_x_train['ADABOOST'] = y_pred_ADA



  import scipy.stats as st
  from sklearn.metrics import accuracy_score
  # max voting
  y_pred_MAX = np.zeros(multi_x_train.shape[0])
  for i in range(multi_x_train.shape[0]):
      y_pred_MAX[i] = st.mode(multi_x_train.iloc[i,:]).mode
  return y_pred_MAX
      

In [15]:
train_preds = multistack(x_train, y_train, 100)
print(train_preds.shape)
print(y_train.shape)
print("Accuracy is: ", round(accuracy_score(y_train, train_preds)*100,2))

NameError: ignored

In [143]:
test_preds = multistack(x_test, y_test, 5)
print(test_preds.shape)
print(y_test.shape)
print("Accuracy is: ", round(accuracy_score(y_train, train_preds)*100,2))

(1407,)
(1407,)
Accuracy is:  99.75


In [3]:
url_cov = 'https://raw.githubusercontent.com/Shankar0x/MultiStackClassifier/main/Dataset/covtype.csv'
covdf = pd.read_csv(url_cov)
covdf

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [7]:
covdf.isnull().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0
Soil_Type11                           0


In [8]:
covdf.isna().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0
Soil_Type11                           0


In [9]:
covdf.dtypes

Elevation                             int64
Aspect                                int64
Slope                                 int64
Horizontal_Distance_To_Hydrology      int64
Vertical_Distance_To_Hydrology        int64
Horizontal_Distance_To_Roadways       int64
Hillshade_9am                         int64
Hillshade_Noon                        int64
Hillshade_3pm                         int64
Horizontal_Distance_To_Fire_Points    int64
Wilderness_Area1                      int64
Wilderness_Area2                      int64
Wilderness_Area3                      int64
Wilderness_Area4                      int64
Soil_Type1                            int64
Soil_Type2                            int64
Soil_Type3                            int64
Soil_Type4                            int64
Soil_Type5                            int64
Soil_Type6                            int64
Soil_Type7                            int64
Soil_Type8                            int64
Soil_Type9                      

In [10]:
X = covdf.iloc[:,:-1]
y = covdf['Cover_Type']

In [17]:
y = y-1
y.unique()

array([4, 1, 0, 6, 2, 5, 3], dtype=int64)

In [18]:
from sklearn.model_selection import train_test_split as tts 
covx_train, covx_test, covy_train, covy_test = tts(X,y,test_size=0.2)

In [10]:
simple_stacker(covx_train, covy_train, covx_test, covy_test)

93.772


In [19]:
train_preds = multistack(covx_train, covy_train, 0)
print(train_preds.shape)
print(covy_train.shape)
print("Accuracy is: ", round(accuracy_score(covy_train, train_preds)*100,2))

In [None]:
test_preds = multistack(covx_test, covy_test, 1)
print(test_preds.shape)
print(covy_test.shape)
print("Accuracy is: ", round(accuracy_score(covy_train, train_preds)*100,2))

## Dataset 3 Mobile