In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import xgboost as xgb
from sklearn.metrics import accuracy_score

from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the dataset from df2.csv (path is relative to the notebook's working
# directory) and display the column index as the cell output.
df = pd.read_csv('df2.csv')
df.columns

Index(['Unnamed: 0', 'Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1',
       'Color2', 'Color3', 'Single_Colored', 'Double_Colored',
       'Triple_Colored', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Group', 'Fee', 'State', 'VideoAmt',
       'Length_Description', 'PhotoAmt', 'Meaningless_Name', 'Has_Name',
       'RescuerID_count', 'state_gdp', 'state_population', 'AdoptionSpeed'],
      dtype='object')

In [3]:
# Tag the coded columns with the pandas 'category' dtype; binarize_categories
# later selects the columns to encode by that dtype.
for column in ['Gender', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Type']:
    df[column] = pd.Categorical(df[column])

In [4]:
def binarize_categories(data):
    """Replace every 'category'-dtype column of ``data`` with 0/1 indicator columns.

    * A column with exactly two distinct values is replaced in place (same name,
      same position) by a single int64 indicator that is 1 for the first
      category level and 0 otherwise.
    * Any other categorical column is expanded with ``pd.get_dummies`` into
      ``<column>_<level>`` indicator columns (first level dropped), which are
      appended after the remaining columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a copy is encoded and returned.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    # Work on a copy so the caller's frame is never mutated by the
    # column assignment in the two-level branch.
    data = data.copy()
    for column in data.select_dtypes('category'):
        if len(data[column].unique()) == 2:
            # get_dummies returns one indicator column per level. Assigning that
            # whole frame to a single column raises ValueError on current pandas,
            # so keep only the first level's indicator explicitly.
            data[column] = pd.get_dummies(data[column], dtype='int64').iloc[:, 0]
        else:
            # Explicit dtype keeps the indicators numeric (newer pandas defaults
            # to bool) and consistent with the two-level branch above.
            data = pd.get_dummies(data, prefix=column, columns=[column],
                                  drop_first=True, dtype='int64')
    return data
# Encode the categorical columns as 0/1 indicators; df is rebound to the
# frame returned by binarize_categories.
df = binarize_categories(df)

In [5]:
# Display the column index after encoding to check the new indicator columns.
df.columns

Index(['Unnamed: 0', 'Type', 'Age', 'Breed1', 'Breed2', 'Color1', 'Color2',
       'Color3', 'Single_Colored', 'Double_Colored', 'Triple_Colored',
       'MaturitySize', 'FurLength', 'Quantity', 'Group', 'Fee', 'State',
       'VideoAmt', 'Length_Description', 'PhotoAmt', 'Meaningless_Name',
       'Has_Name', 'RescuerID_count', 'state_gdp', 'state_population',
       'AdoptionSpeed', 'Gender_2', 'Gender_3', 'Vaccinated_2', 'Vaccinated_3',
       'Dewormed_2', 'Dewormed_3', 'Sterilized_2', 'Sterilized_3', 'Health_2',
       'Health_3'],
      dtype='object')

In [6]:
# Keep only the modelling columns ('Unnamed: 0' is left out) and place the
# target, 'AdoptionSpeed', last so it can be split off by position.
df = df.loc[:, [
    # predictors
    'Type', 'Age', 'Breed1', 'Breed2',
    'Color1', 'Color2', 'Color3',
    'Single_Colored', 'Double_Colored', 'Triple_Colored',
    'MaturitySize', 'FurLength', 'Quantity', 'Group', 'Fee', 'State',
    'VideoAmt', 'Length_Description', 'PhotoAmt',
    'Meaningless_Name', 'Has_Name', 'RescuerID_count',
    'state_gdp', 'state_population',
    'Gender_2', 'Gender_3',
    'Vaccinated_2', 'Vaccinated_3',
    'Dewormed_2', 'Dewormed_3',
    'Sterilized_2', 'Sterilized_3',
    'Health_2', 'Health_3',
    # target
    'AdoptionSpeed',
]]

In [7]:
# Split the frame's values by position: every column except the last is a
# predictor, the last column is the target.
array = df.values
X_df, y_df = array[:, :-1], array[:, -1]

In [8]:
def evaluate_classifiers(X, y, n_splits=4, shuffle=True, random_state=0):
    """Cross-validate a fixed set of classifiers on ``X``/``y``.

    Every model is scored with ``cross_val_score`` (each estimator's default
    scorer) on the same ``KFold`` splitter, and one row is recorded per fold.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    n_splits : int, default 4
        Number of cross-validation folds.
    shuffle : bool, default True
        Whether the splitter shuffles the rows before splitting.
    random_state : int, default 0
        Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model``, ``FeatureScaling``, ``Combined``
        (``<Model>_<FeatureScaling>``) and ``Result`` (one fold score per row).

    Notes
    -----
    'DecisionTreeClassifier' is created without a seed while 'DecisionTree' is
    seeded, so only the former's scores can differ between runs.
    """
    num_trees = 100
    num_features = 3
    seed = 0

    # Only the unscaled features are evaluated; the label ends up in the
    # 'FeatureScaling' column of the result.
    features = [('Normal', X)]

    models = [('LogisticRegression',         LogisticRegression(solver="liblinear")),
              ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
              ('KNeighborsClassifier',       KNeighborsClassifier()),
              ('Naive Bayes',                GaussianNB()),
              ('DecisionTreeClassifier',     DecisionTreeClassifier()),
              ('SupportVectorMachine',       SVC(gamma="scale")),
              ('DecisionTree',               DecisionTreeClassifier(random_state=seed)),
              ('RandomForest',               RandomForestClassifier(n_estimators=num_trees, max_features=num_features, random_state=seed)),
              ('XGBoost',                    xgb.XGBClassifier())]

    # Recent scikit-learn raises ValueError when random_state is given with
    # shuffle=False, so only forward the seed when it is actually used.
    kfold = KFold(n_splits=n_splits, shuffle=shuffle,
                  random_state=random_state if shuffle else None)

    results = []
    for model_name, estimator in models:
        for feature_name, feature_matrix in features:
            fold_scores = cross_val_score(estimator, feature_matrix, y, cv=kfold)
            results.extend(
                (model_name, feature_name, model_name + '_' + feature_name, score)
                for score in fold_scores
            )

    return pd.DataFrame(results, columns=['Model', 'FeatureScaling', 'Combined', 'Result'])

In [9]:
results = evaluate_classifiers(X_df, y_df)
# Mean fold score per model/feature combination, in evaluation order.
# 'Result' is selected explicitly: averaging the whole frame would also try to
# average the string columns ('Model', 'FeatureScaling'), which raises
# TypeError on pandas >= 2.0.
accuracy = results.groupby('Combined', sort=False)['Result'].mean().reset_index()
accuracy



Unnamed: 0,Combined,Result
0,LogisticRegression_Normal,0.380361
1,LinearDiscriminantAnalysis_Normal,0.376226
2,KNeighborsClassifier_Normal,0.336736
3,Naive Bayes_Normal,0.348742
4,DecisionTreeClassifier_Normal,0.370023
5,SupportVectorMachine_Normal,0.331532
6,DecisionTree_Normal,0.368155
7,RandomForest_Normal,0.444467
8,XGBoost_Normal,0.431792


In [10]:
from sklearn.ensemble import VotingClassifier

# Hyper-parameters shared by the models in this cell.
seed=0
num_trees=100
num_features=3

# Voting ensemble (VotingClassifier defaults) over XGBoost and a random forest.
estimators=[]
model1=xgb.XGBClassifier()
estimators.append(("xgb", model1))

model2=RandomForestClassifier(n_estimators=num_trees, max_features=num_features, random_state=seed)
estimators.append(("rf", model2))

model=VotingClassifier(estimators)

# Unshuffled 10-fold split. random_state is deliberately not passed: it has no
# effect without shuffle=True, and recent scikit-learn raises ValueError for
# that combination. Pass shuffle=True together with random_state=seed if
# shuffled folds are wanted.
kfold=KFold(n_splits=10)

results=cross_val_score(model, X_df, y_df, cv=kfold)

# ':.3f' (three decimals) for both numbers; the original ':3f' on the std was a
# width spec and printed six decimals.
print(f'Voting Ensemble (xgb,rf) - Accuracy {results.mean()*100:.3f}% std {results.std()*100:.3f}')

# One row per fold, labelled with the ensemble type, collected into resall.
res_w1=pd.DataFrame({"Res": results, "Type": "Voting"})

resall=pd.DataFrame()
resall=pd.concat([resall,res_w1], ignore_index=True)

Voting Ensemble (xgb,rf) - Accuracy 43.980% std 1.526681


In [11]:
# Fit a random forest and an XGBoost model on the full data set so their
# feature importances can be compared.
model_rf=RandomForestClassifier(n_estimators=num_trees, max_features=num_features, random_state=seed)
model_rf.fit(X_df,y_df)
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_df,y_df)

# Build the table in one step instead of appending row by row
# (DataFrame.append was removed in pandas 2.0). The feature names are the
# leading df columns that were sliced into X_df, i.e. everything but the target.
feat_imp = pd.DataFrame({
    'name': df.columns[:X_df.shape[1]],
    'xgb_importance': xgb_model.feature_importances_,
    'rf_importance': model_rf.feature_importances_,
})

# Combined score = sum of both models' importances; rank once, highest first.
feat_imp['total'] = feat_imp['xgb_importance'] + feat_imp['rf_importance']
feat_imp = feat_imp.sort_values('total', ascending=False).reset_index(drop=True)

feat_imp

Unnamed: 0,name,xgb_importance,rf_importance,total
0,RescuerID_count,0.079727,0.099058,0.178785
1,Age,0.079699,0.093425,0.173123
2,Length_Description,0.019004,0.128663,0.147667
3,Sterilized_2,0.121728,0.017631,0.139359
4,PhotoAmt,0.040571,0.090126,0.130698
5,Breed1,0.074952,0.055641,0.130593
6,Type,0.074339,0.01557,0.089909
7,Color1,0.020288,0.053012,0.0733
8,state_population,0.043912,0.022003,0.065915
9,FurLength,0.029747,0.031967,0.061714
