In [6]:
import numpy as np
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import math
import random
import warnings
warnings.filterwarnings('ignore')

In [7]:
def read_data(path='transfusion.data', seed=15):
    """Load a headered, all-integer CSV and return shuffled features and labels.

    The first line of the file is treated as a header and discarded. Every
    remaining non-blank row is parsed as 32-bit integers. Rows are shuffled
    with a fixed-seed permutation so the result is reproducible, then the
    last column is split off as the label vector.

    Parameters
    ----------
    path : str
        Location of the CSV file. Defaults to 'transfusion.data' in the
        current working directory.
    seed : int
        Seed for the row permutation. The default (15) reproduces the
        original shuffle order.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, n_columns - 1), dtype int32
        All columns except the last.
    y : numpy.ndarray, shape (n_rows,), dtype int32
        The last column.

    Raises
    ------
    ValueError
        If the file contains no data rows after the header.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for file objects passed to csv.reader.
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # discard the header line (if any)
        # csv.reader yields [] for blank lines; skip them so a trailing
        # newline at the end of the file does not produce a ragged array.
        rows = [row for row in reader if row]

    if not rows:
        raise ValueError("no data rows found in %r" % (path,))

    data = np.array(rows, dtype=np.int32)

    # Shuffle the rows before splitting into independent and dependent
    # variables so that contiguous folds taken later are not order-biased.
    order = np.random.RandomState(seed=seed).permutation(data.shape[0])
    data = data[order]

    X = data[:, :-1]
    y = data[:, -1]
    return X, y

    
# Load the shuffled feature matrix X and label vector y; the cells below slice
# these into cross-validation folds.
X, y = read_data()

In [8]:
# Number of outer cross-validation folds. Both the fold size and the length of
# the results array depend on it, so it is defined once here.
N_FOLDS = 5
records = X.shape[0]  # Total number of rows
temp = math.ceil(records / N_FOLDS)  # Number of rows in each fold (the last fold may hold fewer)
C = [0.1, 1, 10, 100]  # Candidate values for LogisticRegression's C (regularization) hyperparameter
final_f1_value = np.zeros(N_FOLDS)  # Final F1 score for each outer test fold

In [9]:
def _split_fold(features, labels, fold, fold_size, n_folds):
    """Hold out one contiguous fold and return the remaining rows for training.

    Parameters
    ----------
    features, labels : numpy.ndarray
        Arrays sliced in lockstep along their first axis.
    fold : int
        Zero-based index of the fold to hold out.
    fold_size : int
        Number of rows in every fold except possibly the last.
    n_folds : int
        Total number of folds. The last fold takes every remaining row, so it
        may hold fewer (or more) than ``fold_size`` rows.

    Returns
    -------
    tuple
        ``(train_features, train_labels, held_features, held_labels)``.
    """
    start = fold * fold_size
    stop = len(features) if fold == n_folds - 1 else start + fold_size
    held_features, held_labels = features[start:stop], labels[start:stop]
    train_features = np.concatenate((features[:start], features[stop:]), 0)
    train_labels = np.concatenate((labels[:start], labels[stop:]))
    return train_features, train_labels, held_features, held_labels


# Nested cross-validation. The outer loop holds out one test fold; the inner
# loop splits the remaining rows into validation folds (reusing the outer fold
# size `temp`) to pick the best C, which is then refit on the full outer
# training set and scored on the held-out test fold.
n_outer_folds = len(final_f1_value)
n_inner_folds = n_outer_folds - 1  # the outer training set spans the other folds

for i in range(n_outer_folds):  # Start of test loop
    X_train, y_train, X_test, y_test = _split_fold(X, y, i, temp, n_outer_folds)

    # One row per candidate C, one column per validation fold.
    f1_matrix = np.zeros((len(C), n_inner_folds))
    for j in range(n_inner_folds):  # Start of the validation loop
        X_inner_train, y_inner_train, X_val, y_val = _split_fold(
            X_train, y_train, j, temp, n_inner_folds
        )
        for row, k in enumerate(C):  # Iterate through all parameters
            model = LogisticRegression(C=k)
            model.fit(X_inner_train, y_inner_train)
            y_pred = model.predict(X_val)
            f1_matrix[row, j] = f1_score(y_val, y_pred)

    print("F1 Score of parameters: rows -> parameters , columns -> validation set")
    print(f1_matrix)

    # Summing across validation folds ranks the parameters exactly like the
    # mean would. argmax returns the first maximum, so ties go to the
    # earliest entry in C.
    best_C = C[int(np.argmax(f1_matrix.sum(axis=1)))]
    print("Best Parameter value for the test set: ", best_C)

    # Refit on the whole outer training set with the selected parameter and
    # record the F1 score on the untouched test fold.
    model = LogisticRegression(C=best_C)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    final_f1_value[i] = f1_score(y_test, y_pred)
    print("F1 score on the test set : ", final_f1_value[i])
    print("\n")

F1 Score of parameters: rows -> parameters , columns -> validation set
[[0.15789474 0.13953488 0.23809524 0.17391304]
 [0.15789474 0.14285714 0.15       0.17391304]
 [0.15789474 0.14285714 0.15       0.17391304]
 [0.15789474 0.14285714 0.15       0.17391304]]
Best Parameter value for the test set:  0.1
F1 score on the test set :  0.30000000000000004


F1 Score of parameters: rows -> parameters , columns -> validation set
[[0.21621622 0.18604651 0.23809524 0.20833333]
 [0.21621622 0.0952381  0.15       0.20833333]
 [0.21621622 0.0952381  0.15       0.20833333]
 [0.21621622 0.0952381  0.15       0.20833333]]
Best Parameter value for the test set:  0.1
F1 score on the test set :  0.2439024390243902


F1 Score of parameters: rows -> parameters , columns -> validation set
[[0.3        0.23255814 0.30434783 0.24      ]
 [0.3        0.23255814 0.27272727 0.20408163]
 [0.3        0.23255814 0.27272727 0.20408163]
 [0.3        0.23255814 0.27272727 0.20408163]]
Best Parameter value for the test

In [10]:
# Report the per-fold test F1 scores, followed by their mean and standard
# deviation across the outer folds (one value per line).
for summary in (final_f1_value, np.mean(final_f1_value), np.std(final_f1_value)):
    print(summary)

[0.3        0.24390244 0.2173913  0.23809524 0.20833333]
0.24154446296015758
0.03200269118667001
