## Import bear necessities

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import matplotlib.pyplot as plt
import seaborn as sns

from pyod.models.cof import COF

In [2]:
def get_excel_columns(n_cols):
    col_list = []
    n = n_cols
    alpha_list = [chr(c) for c in range(ord('A'), ord('Z') + 1)]
    alpha_list.insert(0,'dummy')
    for i in range(1,n+1):
        col_i = ""
        index_list=[]
        p=[]
        while i>26:
            k = i%26
            i = i//26
            if k==0:
                index_list.append(26)
                i = i-1
            else:
                index_list.append(k)
        index_list.append(int(i))
        index_list.reverse()
        for i in index_list:
            p.append(alpha_list[i])
            col_i = "".join(p)
        col_list.append(col_i)
    return col_list

## Importing Datasets

In [3]:
df_shuttle = pd.read_csv('../datasets/shuttle.csv', header = None)
df_shuttle.columns = get_excel_columns(len(df_shuttle.columns))
df_shuttle.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,108.0,3.0,109.0,0.0,72.0,7.0,1.0,36.0,36.0,o
1,81.0,0.0,84.0,0.0,-14.0,-2.0,4.0,100.0,96.0,o
2,81.0,0.0,84.0,0.0,-20.0,16.0,4.0,105.0,102.0,o
3,76.0,-1.0,81.0,0.0,-42.0,-3.0,5.0,125.0,120.0,o
4,105.0,0.0,107.0,2.0,70.0,0.0,1.0,37.0,36.0,o


In [4]:
df_bc = pd.read_csv('../datasets/winconsin_breast_data.csv', header = None)
df_bc.columns = get_excel_columns(len(df_bc.columns))
df_bc.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,V,W,X,Y,Z,AA,AB,AC,AD,AE
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,o
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,o
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,o
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,o
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,o


In [5]:
df_thyroid = pd.read_csv('../datasets/thyroid.csv', header = None)
df_thyroid.columns = get_excel_columns(len(df_thyroid.columns))
df_thyroid.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,M,N,O,P,Q,R,S,T,U,V
0,0.45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.061,0.006,0.023,0.087,0.026,o
1,0.61,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.029,0.015,0.061,0.096,0.064,o
2,0.16,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.029,0.019,0.058,0.103,0.056,o
3,0.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.114,0.003,0.024,0.061,0.039,o
4,0.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.049,0.003,0.005,0.116,0.004,o


In [6]:
df_requests = pd.read_csv('../datasets/HBOS_requests.csv', header = None)
df_requests.columns = get_excel_columns(len(df_requests.columns))
df_requests.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,U,V,W,X,Y,Z,AA,AB,AC,AD
0,0.0,215.0,45076.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,n
1,0.0,162.0,4528.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,n
2,0.0,236.0,1228.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,n
3,0.0,233.0,2032.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,n
4,0.0,239.0,486.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,n


## Separating features(& standardising them) & targets

In [7]:
X_shuttle = df_shuttle.iloc[:, 0:-1].values.astype(float)
X_shuttle = StandardScaler().fit(X_shuttle).transform(X_shuttle.astype(float))
X_shuttle[0:5]

array([[ 7.22292697e+00,  8.39601067e-02,  2.76780064e+00,
        -7.48433629e-03,  2.09700256e+00,  2.65343969e-02,
        -3.56177703e+00, -5.97012096e-01,  1.97925322e+00],
       [ 4.13837948e+00,  1.61137132e-02, -9.55332010e-02,
        -7.48433629e-03, -3.31100839e+00, -2.41651309e-02,
        -3.28828740e+00,  3.15615103e+00,  5.97772949e+00],
       [ 4.13837948e+00,  1.61137132e-02, -9.55332010e-02,
        -7.48433629e-03, -3.68831148e+00,  7.72339247e-02,
        -3.28828740e+00,  3.44936690e+00,  6.37757712e+00],
       [ 3.56716698e+00, -6.50175136e-03, -4.39133262e-01,
        -7.48433629e-03, -5.07175614e+00, -2.97984118e-02,
        -3.19712419e+00,  4.62223037e+00,  7.57712000e+00],
       [ 6.88019947e+00,  1.61137132e-02,  2.53873393e+00,
         4.61108246e-02,  1.97123487e+00, -1.28985692e-02,
        -3.56177703e+00, -5.38368922e-01,  1.97925322e+00]])

In [8]:
y_shuttle = df_shuttle.iloc[:,-1]
y_shuttle = LabelEncoder().fit_transform(y_shuttle)
y_shuttle

array([1, 1, 1, ..., 0, 0, 0])

In [9]:
X_bc = df_bc.iloc[:, 0:-1].values.astype(float)
X_bc = StandardScaler().fit(X_bc).transform(X_bc.astype(float))
X_bc[0:5]

array([[ 2.94459131, -1.89399111,  3.371523  ,  3.36963993,  1.81249678,
         5.04621217,  5.08769495,  6.05073727,  2.61040516,  2.19541343,
         6.38173199, -0.5300153 ,  7.46062666, 10.2208802 , -0.26269255,
         1.62634574,  0.83574339,  1.02830733,  1.29585726,  0.85369735,
         5.10341829, -1.13070628,  6.03326554,  6.66959237,  1.72441348,
         4.14201805,  3.44009576,  4.48857786,  3.78330533,  2.34529106],
       [ 4.26836286, -0.04130767,  4.14618628,  5.44299467, -0.59621146,
        -0.10861027,  0.75565908,  2.15545499,  0.23459583, -0.90539638,
         2.00085916, -0.82392687,  1.53572553,  4.03886939, -0.64979315,
        -0.52902448, -0.23994536,  0.59573852, -0.93584561, -0.05286392,
         4.93513036, -0.03143574,  4.41810873,  6.37804259, -0.10348328,
        -0.04354059,  0.42000932,  2.58638752,  0.03048366,  0.52029929],
       [ 3.81684388,  0.83113325,  3.92375821,  4.65830964,  1.18276914,
         1.99674192,  3.00092157,  5.07856231,  1

In [10]:
y_bc = df_bc.iloc[:,-1]
y_bc = LabelEncoder().fit_transform(y_bc)
y_bc

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
X_thyroid = df_thyroid.iloc[:, 0:-1].values.astype(float)
X_thyroid = StandardScaler().fit(X_thyroid).transform(X_thyroid.astype(float))
X_thyroid[0:5]

array([[-0.37381409,  1.50050924, -0.39440532, -0.12595114, -0.11547005,
        -0.1996087 , -0.10680283, -0.12050941, -0.13060943, -0.25875595,
        -0.27309428, -0.11482544, -0.09275961, -0.16159443, -0.01202552,
        -0.22878653,  3.53097966, -1.89580666, -2.49798316, -0.55943924,
        -2.52374039],
       [ 0.47207276, -0.66644042, -0.39440532, -0.12595114, -0.11547005,
         5.00980171, -0.10680283, -0.12050941, -0.13060943, -0.25875595,
        -0.27309428, -0.11482544, -0.09275961, -0.16159443, -0.01202552,
        -0.22878653,  1.56865643, -0.69296527, -1.41857341, -0.08886888,
        -1.44467614],
       [-1.90698403, -0.66644042,  2.53546276, -0.12595114, -0.11547005,
        -0.1996087 , -0.10680283, -0.12050941, -0.13060943, -0.25875595,
        -0.27309428, -0.11482544, -0.09275961,  6.18833213, -0.01202552,
        -0.22878653,  1.56865643, -0.15836909, -1.50378997,  0.27713029,
        -1.67184756],
       [ 1.74090305, -0.66644042, -0.39440532, -0.12595114

In [12]:
y_thyroid = df_thyroid.iloc[:,-1]
y_thyroid = LabelEncoder().fit_transform(y_thyroid)
y_thyroid

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
X_requests = df_requests.iloc[:, 0:-1].values.astype(float)
X_requests = StandardScaler().fit(X_requests).transform(X_requests.astype(float))
X_requests[0:5]

array([[-1.16294469e-02, -4.32973323e-02,  1.72942298e+00,
        -5.10028721e-02,  3.09800674e-01, -2.07349032e-02,
        -1.78267406e-02, -3.59184535e-03, -3.11062430e-03,
        -7.37444027e-02, -8.02055070e-01, -9.31926855e-01,
        -6.56562448e-02, -7.81214827e-02, -3.09239344e-01,
        -3.12640389e-01,  3.13775971e-02, -2.41359319e-02,
        -5.13582335e-01, -1.20586913e+00, -6.04285618e+00,
        -3.00792173e+01, -8.36026309e-02, -4.02140729e-01,
        -6.27757451e-01, -6.25505504e-02, -5.25355892e-02,
        -3.18988541e-01, -3.22972022e-01],
       [-1.16294469e-02, -7.83989779e-02,  1.80933613e-02,
        -5.10028721e-02,  3.09800674e-01, -2.07349032e-02,
        -1.78267406e-02, -3.59184535e-03, -3.11062430e-03,
        -7.37444027e-02, -7.02303161e-01, -8.46679267e-01,
        -6.56562448e-02, -7.81214827e-02, -3.09239344e-01,
        -3.12640389e-01,  3.13775971e-02, -2.41359319e-02,
        -5.13582335e-01, -1.19642600e+00, -6.01813955e+00,
         9.82

In [14]:
y_requests = df_requests.iloc[:,-1]
y_requests = LabelEncoder().fit_transform(y_requests)
y_requests

array([0, 0, 0, ..., 1, 1, 1])

## Splitting datasets into Training and Testing sets

In [15]:
X_train_shuttle, X_test_shuttle, y_train_shuttle, y_test_shuttle = train_test_split(X_shuttle, y_shuttle, test_size = 0.3)
print("Train Data: ", X_train_shuttle.shape, y_train_shuttle.shape)
print("Test Data: ", X_test_shuttle.shape, y_test_shuttle.shape)

Train Data:  (32524, 9) (32524,)
Test Data:  (13940, 9) (13940,)


In [16]:
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_bc, y_bc, test_size = 0.3)
print("Train Data: ", X_train_bc.shape, y_train_bc.shape)
print("Test Data: ", X_test_bc.shape, y_test_bc.shape)

Train Data:  (256, 30) (256,)
Test Data:  (111, 30) (111,)


In [17]:
X_train_thyroid, X_test_thyroid, y_train_thyroid, y_test_thyroid = train_test_split(X_thyroid, y_thyroid, test_size = 0.3)
print("Train Data: ", X_train_thyroid.shape, y_train_thyroid.shape)
print("Test Data: ", X_test_thyroid.shape, y_test_thyroid.shape)

Train Data:  (4841, 21) (4841,)
Test Data:  (2075, 21) (2075,)


In [18]:
X_train_requests, X_test_requests, y_train_requests, y_test_requests = train_test_split(X_requests, y_requests, test_size = 0.3)
print("Train Data: ", X_train_requests.shape, y_train_requests.shape)
print("Test Data: ", X_test_requests.shape, y_test_requests.shape)

Train Data:  (434068, 29) (434068,)
Test Data:  (186030, 29) (186030,)


## Applying COF on datasets

In [19]:
cof = COF(n_neighbors = 20)
cof.fit(X_train_shuttle)
y_pred = cof.predict(X_test_shuttle)
y_pred

KeyboardInterrupt: 

In [20]:
cof2 = COF(n_neighbors = 20)
cof2.fit(X_train_bc)
y_pred_2 = cof2.predict(X_test_bc)
y_pred_2

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])

In [21]:
cof3 = COF(n_neighbors = 20)
cof3.fit(X_train_thyroid)
y_pred_3 = cof3.predict(X_test_thyroid)
y_pred_3

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
cof4 = COF(n_neighbors = 5)
cof4.fit(X_train_requests)
y_pred_4 = cof4.predict(X_test_requests)
y_pred_4

MemoryError: Unable to allocate 1.37 TiB for an array with shape (434068, 434068) and data type float64