In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingClassifier 
import xgboost as xgb

### First Let's load our data.
Starting with the smallest dataset for simplicity

In [52]:
x = np.memmap('data/binary_data_20', mode='r', shape = (150000,20))
# x = np.load('data/gray_data_20')
y = np.memmap('data/image_target.npy', mode='r', shape = (150000,))
x_test = np.memmap('data/test_binary_data_20.npy', mode='r', shape = (50000,20))
y_test = np.memmap('data/test_target.npy', mode='r', shape = (50000))
print(x.shape)
print(y.shape)
x = pd.DataFrame(x)
y = pd.DataFrame(y)
# y = y == 2
# y = y.astype(int)
# y_test = y_test == 2
# y_test = y_test.astype(int)
x = x/255
x_test = x_test/255

(150000, 20)
(150000,)


"target_data": {
"Inco": 2, 
"Teac": 1, 
"Cons": 0, 
"Publ": 4, 
"Econ": 3}}

In [24]:
X_train, X_val, y_train, y_val = train_test_split(x, y[0], test_size=0.2, stratify=y)
print(X_train.shape)
print(y_val.shape)

(120000, 20)
(30000,)


## KNN
K nearest neighbors

~~~
Accuracy:  0.218
Precision:	0.21878291972161956
Recall:	0.21799999999999997
F1:	0.2158672632176471
array([[1658, 1364, 1110,  993,  875],
       [1541, 1545, 1138,  965,  811],
       [1557, 1340, 1286,  961,  856],
       [1567, 1341, 1192, 1059,  841],
       [1590, 1370, 1090,  958,  992]])
~~~


In [54]:
knn = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
knn.fit(X_train, y_train)
y_pred = knn.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.1996
Precision:	0.19931270319214384
Recall:	0.1996
F1:	0.1965280228228304


array([[2776, 2239, 1888, 1696, 1401],
       [2754, 2291, 1925, 1636, 1394],
       [2748, 2378, 1924, 1596, 1354],
       [2747, 2320, 1919, 1632, 1382],
       [2700, 2357, 1985, 1601, 1357]])

## Logistic Regression

~~~
Accuracy:	0.20534
Precision:	0.20556761017314962
Recall:	0.20534
F1:	0.20329858294811992
array([[1883, 2415, 2303, 2030, 1369],
       [1716, 2481, 2369, 2067, 1367],
       [1721, 2372, 2430, 2124, 1353],
       [1661, 2522, 2348, 2085, 1384],
       [1718, 2400, 2401, 2093, 1388]])
~~~


In [53]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.20528
Precision:	0.20550506520053274
Recall:	0.20528
F1:	0.20321277846670044


array([[1880, 2417, 2309, 2028, 1366],
       [1714, 2484, 2373, 2065, 1364],
       [1723, 2372, 2433, 2122, 1350],
       [1657, 2527, 2352, 2082, 1382],
       [1718, 2402, 2407, 2088, 1385]])

In [62]:
grid={"C":np.logspace(-3,3,7), "penalty":["l2"], "solver":['liblinear','newton-cg']}
lr=LogisticRegression()
lr_cv=GridSearchCV(lr,grid,cv=3,scoring='f1_macro')
lr_cv.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l2'],
                         'solver': ['liblinear', 'newton-cg']},
             scoring='f1_macro')

In [63]:
res = pd.DataFrame(lr_cv.cv_results_)
res.sort_values(by='rank_test_score', ascending=True).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,1.112077,0.023904,0.020377,0.002034,1.0,l2,newton-cg,"{'C': 1.0, 'penalty': 'l2', 'solver': 'newton-...",0.211422,0.210367,0.211398,0.211063,0.000492,1
9,1.244832,0.12889,0.022958,0.001139,10.0,l2,newton-cg,"{'C': 10.0, 'penalty': 'l2', 'solver': 'newton...",0.211422,0.210367,0.211375,0.211055,0.000487,2
11,1.018942,0.099774,0.019082,0.001488,100.0,l2,newton-cg,"{'C': 100.0, 'penalty': 'l2', 'solver': 'newto...",0.211422,0.210367,0.211375,0.211055,0.000487,2
13,1.055137,0.02063,0.020475,0.001419,1000.0,l2,newton-cg,"{'C': 1000.0, 'penalty': 'l2', 'solver': 'newt...",0.211422,0.210367,0.211375,0.211055,0.000487,2
4,1.603447,0.067098,0.019443,0.001734,0.1,l2,liblinear,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.211364,0.210387,0.211387,0.211046,0.000466,5


In [65]:
lr = LogisticRegression(C=1,penalty='l2',solver='newton-cg')
lr.fit(X_train, y_train)
y_pred = lr.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.20534
Precision:	0.20556761017314962
Recall:	0.20534
F1:	0.20329858294811992


array([[1883, 2415, 2303, 2030, 1369],
       [1716, 2481, 2369, 2067, 1367],
       [1721, 2372, 2430, 2124, 1353],
       [1661, 2522, 2348, 2085, 1384],
       [1718, 2400, 2401, 2093, 1388]])

## Decision Tree

~~~
Accuracy:	0.20046
Precision:	0.20043461791314954
Recall:	0.20046
F1:	0.20025358642806665
array([[2236, 2066, 2003, 1891, 1804],
       [2183, 2051, 1982, 1969, 1815],
       [2319, 1968, 1953, 1921, 1839],
       [2110, 2065, 2062, 1937, 1826],
       [2149, 2070, 2005, 1930, 1846]])
~~~


In [68]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.19902
Precision:	0.199024020898689
Recall:	0.19902000000000003
F1:	0.19901975195168667


array([[1976, 2006, 1985, 2014, 2019],
       [1995, 1974, 2008, 2058, 1965],
       [1966, 2011, 2043, 1990, 1990],
       [1998, 2006, 1917, 1988, 2091],
       [1985, 1930, 2068, 2047, 1970]])

In [74]:
grid={"criterion":["gini",'entropy'],'min_samples_split':[2,6,14,32,64], 'random_state':[42, None] }
dt=DecisionTreeClassifier()
dt_cv=GridSearchCV(dt,grid,cv=3,scoring='f1_macro')
dt_cv.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'min_samples_split': [2, 6, 14, 32, 64],
                         'random_state': [42, None]},
             scoring='f1_macro')

In [75]:
res = pd.DataFrame(dt_cv.cv_results_)
res.sort_values(by='rank_test_score', ascending=True).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_min_samples_split,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2.550681,0.081907,0.029533,0.002559,gini,2,42.0,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.214716,0.216214,0.217456,0.216128,0.00112,1
1,2.721425,0.208265,0.032089,0.003546,gini,2,,"{'criterion': 'gini', 'min_samples_split': 2, ...",0.217047,0.21396,0.217119,0.216042,0.001472,2
12,3.708585,0.046757,0.031218,0.002523,entropy,6,42.0,"{'criterion': 'entropy', 'min_samples_split': ...",0.217093,0.212089,0.218641,0.215941,0.002796,3
15,3.489973,0.109457,0.030265,0.000409,entropy,14,,"{'criterion': 'entropy', 'min_samples_split': ...",0.214798,0.213864,0.21896,0.215874,0.002215,4
14,3.453665,0.101807,0.029545,0.002062,entropy,14,42.0,"{'criterion': 'entropy', 'min_samples_split': ...",0.215472,0.213849,0.218251,0.215857,0.001817,5


In [83]:
dt = DecisionTreeClassifier(criterion='entropy', min_samples_split=64, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.20046
Precision:	0.20043461791314954
Recall:	0.20046
F1:	0.20025358642806665


array([[2236, 2066, 2003, 1891, 1804],
       [2183, 2051, 1982, 1969, 1815],
       [2319, 1968, 1953, 1921, 1839],
       [2110, 2065, 2062, 1937, 1826],
       [2149, 2070, 2005, 1930, 1846]])

## Random Forest

~~~
Accuracy:	0.20516
Precision:	0.2051212463423299
Recall:	0.20515999999999995
F1:	0.20510088356551712
array([[2072, 2056, 2049, 1907, 1916],
       [2066, 2011, 2039, 1894, 1990],
       [2003, 1998, 2069, 1880, 2050],
       [1978, 2028, 2048, 1920, 2026],
       [2046, 1988, 1930, 1850, 2186]])
~~~


In [85]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.20338
Precision:	0.20334535629899847
Recall:	0.20338000000000003
F1:	0.20292564218634665


array([[2301, 2121, 2052, 1786, 1740],
       [2214, 2205, 1956, 1845, 1780],
       [2244, 2110, 2014, 1845, 1787],
       [2292, 2177, 1977, 1793, 1761],
       [2266, 2106, 1987, 1785, 1856]])

In [94]:
rf = RandomForestClassifier(criterion='entropy', min_samples_split=64, random_state=42, max_features=4, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.2027
Precision:	0.20268895335050835
Recall:	0.20270000000000002
F1:	0.202622174130087


array([[2083, 2024, 2019, 1873, 2001],
       [2058, 2068, 1977, 1889, 2008],
       [2100, 2035, 1959, 1889, 2017],
       [2013, 2153, 1972, 1890, 1972],
       [2099, 2041, 1928, 1797, 2135]])

In [96]:
rf = RandomForestClassifier(min_samples_split=32, max_features=4, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.20516
Precision:	0.2051212463423299
Recall:	0.20515999999999995
F1:	0.20510088356551712


array([[2072, 2056, 2049, 1907, 1916],
       [2066, 2011, 2039, 1894, 1990],
       [2003, 1998, 2069, 1880, 2050],
       [1978, 2028, 2048, 1920, 2026],
       [2046, 1988, 1930, 1850, 2186]])

## XGBoost

~~~
Accuracy:	0.2025
Precision:	0.2025738072573832
Recall:	0.20249999999999999
F1:	0.20229736964345638
array([[2208, 2007, 1772, 2025, 1988],
       [2150, 2057, 1745, 2094, 1954],
       [2183, 1989, 1815, 2045, 1968],
       [2207, 2052, 1764, 2063, 1914],
       [2219, 1989, 1775, 2035, 1982]])
~~~


In [119]:
gbm = xgb.XGBClassifier( 
                        n_estimators=30000,
                        max_depth=4,
                        objective='multi:softmax', #new objective
                        learning_rate=.05, 
                        subsample=.8,
                        min_child_weight=3,
                        colsample_bytree=.8,
                        use_label_encoder=False
                       )

eval_set=[(X_train,y_train),(X_val,y_val)]
fit_model = gbm.fit( 
                    X_train, y_train, 
                    eval_set=eval_set,
                    eval_metric='merror',
                    early_stopping_rounds=50,
                    verbose=False
                   )

y_pred = gbm.predict(x_test)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Accuracy:	0.2025
Precision:	0.2025738072573832
Recall:	0.20249999999999999
F1:	0.20229736964345638


array([[2208, 2007, 1772, 2025, 1988],
       [2150, 2057, 1745, 2094, 1954],
       [2183, 1989, 1815, 2045, 1968],
       [2207, 2052, 1764, 2063, 1914],
       [2219, 1989, 1775, 2035, 1982]])

In [121]:
# gbm = xgb.XGBClassifier(use_label_encoder=False)
# # grid={'max_depth': [2, 6, 12],'n_estimators': [50, 100, 200], 'learning_rate':[0.3,.05],
# #         'objective':['multi:softmax', 'multi:softprob'],'colsample_bytree':[0.2,0.5,0.8]}
# grid={'max_depth': [2, 12],'n_estimators': [50, 200],
#         'objective':['multi:softmax'],'colsample_bytree':[0.8]}
# gbm_cv=GridSearchCV(gbm,grid,cv=3,scoring='f1_macro', n_jobs=-1)
# gbm_cv.fit(X_train, y_train)

In [None]:
# res = pd.DataFrame(gbm_cv.cv_results_)
# res.sort_values(by='rank_test_score', ascending=True).head(10)

In [122]:
gbm = xgb.XGBClassifier( 
                        n_estimators=200,
                        max_depth=12,
                        objective='multi:softmax', #new objective
                        learning_rate=.03, 
                        min_child_weight=3,
                        colsample_bytree=.4,
                        use_label_encoder=False
                       )

eval_set=[(X_train,y_train),(X_val,y_val)]
fit_model = gbm.fit( 
                    X_train, y_train, 
                    eval_set=eval_set,
                    eval_metric='merror',
                    early_stopping_rounds=50,
                    verbose=False
                   )

y_pred = gbm.predict(x_test)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Accuracy:	0.20294
Precision:	0.20297739832653744
Recall:	0.20293999999999998
F1:	0.20288853644668597


array([[2018, 2166, 1941, 1913, 1962],
       [2039, 2060, 1925, 1933, 2043],
       [2146, 2073, 1923, 1900, 1958],
       [2052, 2059, 1884, 1959, 2046],
       [2040, 2044, 1828, 1901, 2187]])

## Naive Bayes

~~~
Accuracy:	0.2025
Precision:	0.2025738072573832
Recall:	0.20249999999999999
F1:	0.20229736964345638
array([[2208, 2007, 1772, 2025, 1988],
       [2150, 2057, 1745, 2094, 1954],
       [2183, 1989, 1815, 2045, 1968],
       [2207, 2052, 1764, 2063, 1914],
       [2219, 1989, 1775, 2035, 1982]])
~~~

In [129]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(x_test,)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

Accuracy:	0.2017
Precision:	0.20149455721524698
Recall:	0.2017
F1:	0.20024009019402036


array([[1796, 2323, 2294, 2032, 1555],
       [1713, 2410, 2272, 2075, 1530],
       [1735, 2294, 2333, 2095, 1543],
       [1682, 2480, 2279, 2050, 1509],
       [1747, 2342, 2306, 2109, 1496]])

## Ensemble of Models

~~~
Accuracy:	0.2025
Precision:	0.2025738072573832
Recall:	0.20249999999999999
F1:	0.20229736964345638
array([[2208, 2007, 1772, 2025, 1988],
       [2150, 2057, 1745, 2094, 1954],
       [2183, 1989, 1815, 2045, 1968],
       [2207, 2052, 1764, 2063, 1914],
       [2219, 1989, 1775, 2035, 1982]])
~~~

In [126]:
# create voting classifier
model_list = [
    ('knn', knn),
    ('lr', lr),
    ('dt', dt),
    ('rf', rf),
]
voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='hard', #<-- sklearn calls this hard voting
                                    n_jobs=-1)
voting_classifer.fit(X_train, y_train)
y_pred = voting_classifer.predict(x_test)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)


Accuracy:	0.2022
Precision:	0.20107199737095044
Recall:	0.2022
F1:	0.18465419725423415


array([[4069, 2612, 1520, 1049,  750],
       [4029, 2697, 1487, 1026,  761],
       [3941, 2667, 1585, 1036,  771],
       [3946, 2756, 1537, 1001,  760],
       [3948, 2696, 1595, 1003,  758]])

In [127]:
model_list = [
    ('knn', knn),
    ('lr', lr),
    ('dt', dt),
    ('rf', rf),
]
voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='soft', #<-- sklearn calls this hard voting
                                    n_jobs=-1)
voting_classifer.fit(X_train, y_train)
y_pred = voting_classifer.predict(x_test)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)


Accuracy:	0.20114
Precision:	0.20110788416198888
Recall:	0.20114
F1:	0.20111479120585632


array([[2040, 1987, 1952, 2046, 1975],
       [1991, 2077, 1929, 1959, 2044],
       [2045, 1998, 1923, 2030, 2004],
       [1969, 2066, 1889, 2048, 2028],
       [2005, 2017, 2051, 1958, 1969]])

In [134]:
model_list = [knn,lr,dt,rf]
stacked = StackingClassifier(
    classifiers=model_list, meta_classifier=LogisticRegression(n_jobs=-1), use_probas=False)
stacked.fit(X_train, y_train)
y_pred = stacked.predict(x_test)
print(f'Accuracy:\t{accuracy_score(y_test, y_pred)}')
print(f'Precision:\t{precision_score(y_test, y_pred, average="macro")}')
print(f'Recall:\t{recall_score(y_test, y_pred, average="macro")}')
print(f'F1:\t{f1_score(y_test, y_pred, average="macro")}')
confusion_matrix(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:	0.20006
Precision:	0.20007959484982396
Recall:	0.20006000000000004
F1:	0.20005786370219533


array([[2051, 2070, 1981, 1925, 1973],
       [2049, 2009, 2004, 1948, 1990],
       [2078, 1977, 1980, 1944, 2021],
       [1934, 2076, 1947, 1987, 2056],
       [2047, 2023, 2006, 1948, 1976]])