# Vector classification using Logistic Regression

## Import libraries

In [1]:
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
import time
import seaborn as sns
from sklearn.linear_model import LogisticRegression

  import pandas.util.testing as tm


In [2]:
def tic():
    """Start the stopwatch read by ``toc`` (homemade version of MATLAB's tic).

    Stores the current reading of a monotonic clock in the module-level
    variable ``startTime_for_tictoc``; calling it again restarts the timer.
    """
    global startTime_for_tictoc
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    startTime_for_tictoc = time.perf_counter()


def toc():
    """Print and return the seconds elapsed since the last ``tic`` call.

    Returns:
        float: elapsed seconds, or None (after printing a notice) when
        ``tic`` has not been called yet.
    """
    if 'startTime_for_tictoc' not in globals():
        print("Toc: start time not set")
        return None
    # Read the clock once so the printed and the returned value are identical.
    elapsed = time.perf_counter() - startTime_for_tictoc
    print("Elapsed time is " + str(elapsed) + " seconds.")
    return elapsed

## Load and prepare data

In [3]:
# Load the measurement table. header=None makes pandas label the columns
# 0..N-1 instead of consuming the first row of the file as column names.
df = pd.read_csv(filepath_or_buffer="s_HI_13scales_normalized.csv", header=None)

In [4]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1,1,1,0.31583,0.20943,0.17625,0.14788,0.12238,0.1111,0.098948,...,0.057811,0.056715,0.053919,0.053366,0.049985,0.054881,0.05024,0.046054,0.04695,0.044561
1,1,2,1,0.30812,0.2044,0.18002,0.14001,0.12228,0.108,0.096978,...,0.054249,0.05426,0.053069,0.050743,0.047353,0.052188,0.04709,0.043959,0.043808,0.040299
2,1,3,1,0.29635,0.19654,0.17522,0.14207,0.12308,0.10462,0.098686,...,0.054432,0.054299,0.050173,0.050105,0.047089,0.05056,0.046734,0.041622,0.042196,0.038283
3,1,4,1,0.29028,0.19594,0.17229,0.14116,0.12218,0.10527,0.095403,...,0.054054,0.055146,0.051311,0.048996,0.0462,0.047931,0.045692,0.040316,0.042458,0.037729
4,1,5,1,0.28365,0.19459,0.17194,0.13973,0.12011,0.10491,0.094665,...,0.053039,0.053249,0.050804,0.04852,0.04455,0.047441,0.043189,0.039803,0.040343,0.036347


In [5]:
# Name the 28 columns: the ID label, the SCALE value, then R1 through R26.
df.columns = ["ID", "SCALE"] + ["R{}".format(k) for k in range(1, 27)]

In [6]:
# Check that the new column names landed on the expected columns.
df.head(n=5)

Unnamed: 0,ID,SCALE,R1,R2,R3,R4,R5,R6,R7,R8,...,R17,R18,R19,R20,R21,R22,R23,R24,R25,R26
0,1,1,1,0.31583,0.20943,0.17625,0.14788,0.12238,0.1111,0.098948,...,0.057811,0.056715,0.053919,0.053366,0.049985,0.054881,0.05024,0.046054,0.04695,0.044561
1,1,2,1,0.30812,0.2044,0.18002,0.14001,0.12228,0.108,0.096978,...,0.054249,0.05426,0.053069,0.050743,0.047353,0.052188,0.04709,0.043959,0.043808,0.040299
2,1,3,1,0.29635,0.19654,0.17522,0.14207,0.12308,0.10462,0.098686,...,0.054432,0.054299,0.050173,0.050105,0.047089,0.05056,0.046734,0.041622,0.042196,0.038283
3,1,4,1,0.29028,0.19594,0.17229,0.14116,0.12218,0.10527,0.095403,...,0.054054,0.055146,0.051311,0.048996,0.0462,0.047931,0.045692,0.040316,0.042458,0.037729
4,1,5,1,0.28365,0.19459,0.17194,0.13973,0.12011,0.10491,0.094665,...,0.053039,0.053249,0.050804,0.04852,0.04455,0.047441,0.043189,0.039803,0.040343,0.036347


In [7]:
# Inspect the last five rows of the table as well.
df.tail(n=5)

Unnamed: 0,ID,SCALE,R1,R2,R3,R4,R5,R6,R7,R8,...,R17,R18,R19,R20,R21,R22,R23,R24,R25,R26
28075,30,9,1,0.4399,0.29554,0.23412,0.16709,0.1338,0.11299,0.10826,...,0.058722,0.054724,0.052576,0.05118,0.049695,0.047987,0.043126,0.040244,0.041498,0.036793
28076,30,10,1,0.43032,0.28677,0.22401,0.16719,0.12478,0.11639,0.10173,...,0.054631,0.051904,0.051171,0.04669,0.046495,0.043388,0.037132,0.03546,0.033051,0.029259
28077,30,11,1,0.44071,0.29662,0.22525,0.17544,0.11855,0.11613,0.1006,...,0.052226,0.04799,0.044911,0.043624,0.04232,0.042163,0.036941,0.033307,0.030796,0.02565
28078,30,12,1,0.4394,0.2892,0.23686,0.17896,0.12442,0.11862,0.11386,...,0.048043,0.045294,0.041897,0.038104,0.037381,0.037832,0.032798,0.030807,0.029861,0.024496
28079,30,13,1,0.45613,0.29311,0.24234,0.18717,0.12958,0.12006,0.10768,...,0.049819,0.045881,0.040959,0.036512,0.036471,0.034589,0.028139,0.027397,0.026689,0.023868


## Scale filtering

In [8]:
# Keep only the rows whose SCALE value is in the selected set (1 through 13).
scale = list(range(1, 14))
df = df.loc[df["SCALE"].isin(scale)]
df.head(n=20)

Unnamed: 0,ID,SCALE,R1,R2,R3,R4,R5,R6,R7,R8,...,R17,R18,R19,R20,R21,R22,R23,R24,R25,R26
0,1,1,1,0.31583,0.20943,0.17625,0.14788,0.12238,0.1111,0.098948,...,0.057811,0.056715,0.053919,0.053366,0.049985,0.054881,0.05024,0.046054,0.04695,0.044561
1,1,2,1,0.30812,0.2044,0.18002,0.14001,0.12228,0.108,0.096978,...,0.054249,0.05426,0.053069,0.050743,0.047353,0.052188,0.04709,0.043959,0.043808,0.040299
2,1,3,1,0.29635,0.19654,0.17522,0.14207,0.12308,0.10462,0.098686,...,0.054432,0.054299,0.050173,0.050105,0.047089,0.05056,0.046734,0.041622,0.042196,0.038283
3,1,4,1,0.29028,0.19594,0.17229,0.14116,0.12218,0.10527,0.095403,...,0.054054,0.055146,0.051311,0.048996,0.0462,0.047931,0.045692,0.040316,0.042458,0.037729
4,1,5,1,0.28365,0.19459,0.17194,0.13973,0.12011,0.10491,0.094665,...,0.053039,0.053249,0.050804,0.04852,0.04455,0.047441,0.043189,0.039803,0.040343,0.036347
5,1,6,1,0.28065,0.1912,0.17078,0.13862,0.12077,0.1067,0.094913,...,0.052538,0.053326,0.049746,0.047611,0.045336,0.047187,0.043181,0.038736,0.039525,0.035651
6,1,7,1,0.27945,0.19185,0.16732,0.14154,0.12069,0.10407,0.094039,...,0.052411,0.05135,0.049588,0.047345,0.044377,0.046587,0.042738,0.03797,0.037899,0.033692
7,1,8,1,0.27827,0.19067,0.16833,0.14191,0.12274,0.10475,0.09457,...,0.052436,0.05103,0.048793,0.047178,0.043379,0.044982,0.040683,0.037142,0.037503,0.032947
8,1,9,1,0.28043,0.19064,0.17071,0.14294,0.12234,0.10661,0.095791,...,0.052372,0.050954,0.048299,0.046549,0.043276,0.04371,0.040016,0.035953,0.035527,0.031849
9,1,10,1,0.28509,0.19145,0.17124,0.14599,0.12478,0.10803,0.096502,...,0.051701,0.050539,0.048318,0.046166,0.042055,0.042789,0.040011,0.034023,0.034156,0.030203


## Feature and target columns

In [9]:
# Target: the ID column. Features: the 26 columns R1 through R26.
Y = df.loc[:, "ID"]
X = df[["R{}".format(k) for k in range(1, 27)]]

In [10]:
# Preview the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10,...,R17,R18,R19,R20,R21,R22,R23,R24,R25,R26
0,1,0.31583,0.20943,0.17625,0.14788,0.12238,0.1111,0.098948,0.092868,0.087328,...,0.057811,0.056715,0.053919,0.053366,0.049985,0.054881,0.05024,0.046054,0.04695,0.044561
1,1,0.30812,0.2044,0.18002,0.14001,0.12228,0.108,0.096978,0.090908,0.082559,...,0.054249,0.05426,0.053069,0.050743,0.047353,0.052188,0.04709,0.043959,0.043808,0.040299
2,1,0.29635,0.19654,0.17522,0.14207,0.12308,0.10462,0.098686,0.090505,0.084994,...,0.054432,0.054299,0.050173,0.050105,0.047089,0.05056,0.046734,0.041622,0.042196,0.038283
3,1,0.29028,0.19594,0.17229,0.14116,0.12218,0.10527,0.095403,0.090389,0.082678,...,0.054054,0.055146,0.051311,0.048996,0.0462,0.047931,0.045692,0.040316,0.042458,0.037729
4,1,0.28365,0.19459,0.17194,0.13973,0.12011,0.10491,0.094665,0.090668,0.082958,...,0.053039,0.053249,0.050804,0.04852,0.04455,0.047441,0.043189,0.039803,0.040343,0.036347


In [11]:
# Preview the first five target labels.
Y.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: ID, dtype: int64

## Logistic regression classifier


In [12]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# partition, and therefore the accuracy computed from it, reproducible
# across "Restart & Run All".
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [13]:
# Only the settings that matter are spelled out: C=20.0 (weaker L2
# regularisation than the library default of 1.0), the lbfgs solver and a
# 1000-iteration budget. The other arguments of the earlier call repeated
# library defaults; multi_class in particular is deprecated in recent
# scikit-learn releases, so it is no longer passed.
clf = LogisticRegression(C=20.0, solver='lbfgs', max_iter=1000)

In [14]:
# Fit the classifier on the training split.
clf.fit(X=X_train, y=Y_train)

LogisticRegression(C=20.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# Fraction of held-out rows whose predicted ID equals the true ID.
accuracy = accuracy_score(Y_test, clf.predict(X_test))
accuracy

0.9821937321937322

## Test

In [16]:
# Take one row of the feature table by position and convert it to a 1-D array.
# NOTE: the row comes from the full table X, so it may have been part of the
# training split rather than the held-out data.
index_test = 8500
sample_measure = np.array(X.iloc[index_test, :])

In [17]:
# Turn the 1-D sample into a single-row 2-D array, the shape passed to predict().
sample_measure = np.reshape(sample_measure, (1, -1))

In [18]:
# The model was fitted on a DataFrame, so hand it the sample with the same
# column names: scikit-learn >= 1.0 warns when a model trained with feature
# names is later given a bare array.
predict = clf.predict(pd.DataFrame(sample_measure, columns=X.columns))

In [19]:
# Report the label predicted for the single sample.
print(f"The input has the ID: {predict[0]!s}")

The input has the ID: 10


## Compute mean accuracy

In [20]:
# Repeat the split / fit / score cycle to see how much the accuracy varies
# with the train/test partition.
accuracy = []
for i in range(5):
    # Seed each split with the cycle number: the five partitions differ from
    # one another but are identical on every re-run of the notebook.
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=0.3, random_state=i
    )
    # Non-default settings only (C, solver, iteration budget); the deprecated
    # multi_class argument is not passed.
    clf = LogisticRegression(C=20.0, solver='lbfgs', max_iter=10000)
    clf.fit(X_train, Y_train)
    # Score the held-out split once and reuse the value for the list and the log.
    cycle_accuracy = clf.score(X_test, Y_test)
    accuracy.append(cycle_accuracy)
    print('Cycle: ' + str(i) + ' | Accuracy: ' + str(cycle_accuracy))

Cycle: 0 | Accuracy: 0.9843304843304843
Cycle: 1 | Accuracy: 0.9811253561253561
Cycle: 2 | Accuracy: 0.9853988603988604
Cycle: 3 | Accuracy: 0.9816001899335233
Cycle: 4 | Accuracy: 0.9801756885090218


In [21]:
# Average of the per-cycle accuracies, shown as a percentage with two decimals.
print("Mean accuracy is: %s%%" % round(np.mean(accuracy) * 100, 2))

Mean accuracy is: 98.25%
