# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd

# A. Compute performance metrics for the given data '5_a.csv'

In [2]:
# Load the part-A data; the preview shows the 'y' and 'proba' columns used below.
dataset_a=pd.read_csv('5_a.csv')
dataset_a.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [3]:
# Row count per distinct value of 'y' (shows how the two classes are balanced).
k = dataset_a['y'].value_counts()
k

1.0    10000
0.0      100
Name: y, dtype: int64

## Compute confusion matrix & F1 score

In [4]:
#reference :
#https://www.kaggle.com/code/paulrohan2020/performance-metrics-without-sklearn
#https://datascience.stackexchange.com/questions/28493/confusion-matrix-get-items-fp-fn-tp-tn-python
#https://stackoverflow.com/questions/67409178/calculating-tp-fp-tn-fn-values
def calculate_tp_fp_tn_fn(dataset, threshold=0.5):
    """Threshold the scores in ``dataset`` and count the confusion-matrix cells.

    A row is predicted ``0.0`` when its ``proba`` is strictly below
    ``threshold`` and ``1.0`` otherwise.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``y`` column (labels 0.0 / 1.0) and a ``proba`` column.
        A ``y_predict`` column with the thresholded predictions is added (or
        overwritten) in place; the accuracy cell further down reads it.
    threshold : float, default 0.5
        Decision threshold applied to ``proba``.

    Returns
    -------
    tuple
        ``(TP, TN, FN, FP)`` -- note the order: FN comes before FP.
    """
    # One vectorised comparison instead of a Python loop over every row.
    dataset['y_predict'] = np.where(dataset['proba'] < threshold, 0.0, 1.0)

    actual_pos = dataset['y'] == 1.0
    actual_neg = dataset['y'] == 0.0
    predicted_pos = dataset['y_predict'] == 1.0
    predicted_neg = dataset['y_predict'] == 0.0

    TN = (actual_neg & predicted_neg).sum()
    TP = (actual_pos & predicted_pos).sum()
    FP = (actual_neg & predicted_pos).sum()
    FN = (actual_pos & predicted_neg).sum()

    return (TP,TN,FN,FP)

In [5]:
def compute_f1_score(TP,TN,FN,FP):
    """Return ``(precision, recall, F1)`` computed from confusion-matrix counts.

    Parameters are taken in the order ``(TP, TN, FN, FP)``; ``TN`` is accepted
    for signature symmetry but is not used by any of the three metrics.

    A metric whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as ``0.0`` instead of
    raising ``ZeroDivisionError`` or producing NaN.
    """
    predicted_positive = FP + TP
    actual_positive = TP + FN

    precision = TP / predicted_positive if predicted_positive else 0.0
    Recall = TP / actual_positive if actual_positive else 0.0

    pr_sum = precision + Recall
    # Harmonic mean of precision and recall.
    F1_Score = (2 * (precision * Recall) / pr_sum) if pr_sum else 0.0

    return (precision,Recall,F1_Score)

def confusion_matrix(TP,TN,FP,FN):
    """Pack the four counts into a 2x2 array laid out as ``[[TN, FP], [FN, TP]]``.

    Arguments are taken in the order ``(TP, TN, FP, FN)``.
    """
    first_row = [TN, FP]
    second_row = [FN, TP]
    return np.array([first_row, second_row])

In [6]:
# Count the confusion-matrix cells for dataset_a, then derive F1 and the 2x2 matrix.
# Mind the argument orders: the counting helper returns (TP, TN, FN, FP), while
# confusion_matrix() takes (TP, TN, FP, FN).
TP,TN,FN,FP = calculate_tp_fp_tn_fn(dataset_a)
precision,Recall,F1_Score = compute_f1_score(TP,TN,FN,FP)
a =confusion_matrix(TP,TN,FP,FN)

# Print the two matrix rows on separate lines, the second one tab-indented.
print('Confusion_Matrix:',a[0],'\n','\t'*2,a[1])
print('F1_Score:', F1_Score)

Confusion_Matrix: [  0 100] 
 		 [    0 10000]
F1_Score: 0.9950248756218906


## Compute AUC score

In [7]:
#reference : 
# https://stackoverflow.com/questions/65748968/how-to-compute-auc-score-manually-without-using-sklearn   
from tqdm import tqdm

# ROC sweep: every distinct score is used as a decision threshold (highest
# first) and the true/false positive rates obtained at each one are recorded.
tpr=[]
fpr=[]
dataset = pd.read_csv('5_a.csv')
uniq_data = list(dataset.proba.unique())
uniq_data.sort(reverse = True)

# Pull the columns out once as NumPy arrays so each threshold costs a handful
# of vectorised comparisons instead of a Python loop over every row.
scores = dataset['proba'].to_numpy()
is_positive = dataset['y'].to_numpy() == 1
is_negative = dataset['y'].to_numpy() == 0

for tow in tqdm(uniq_data):
    # Score strictly below the threshold -> 0, otherwise 1.
    y_predict_new = np.where(scores < tow, 0, 1)
    predicted_positive = y_predict_new == 1
    predicted_negative = y_predict_new == 0

    TN = (is_negative & predicted_negative).sum()
    TP = (is_positive & predicted_positive).sum()
    FP = (is_negative & predicted_positive).sum()
    FN = (is_positive & predicted_negative).sum()

    tpr.append(TP/(TP+FN))
    fpr.append(FP/(FP+TN))

# Leave the frame holding the predictions of the last (lowest) threshold.
if uniq_data:
    dataset['y_predict'] = y_predict_new

# The ROC curve starts at (0, 0) (a threshold above every score), so that point
# is prepended before integrating.
# np.trapz takes the integrand first and the sample points second, so despite
# the names `x` holds the TPR values (curve height) and `y` the FPR values.
# (From NumPy 2.0 on the same function is also available as np.trapezoid.)
x = [0.0] + sorted(tpr)
y = [0.0] + sorted(fpr)
auc = np.trapz(x,y)

100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [03:35<00:00, 46.76it/s]


In [8]:
# TP/TN/FP/FN printed here are the values left by the last iteration of the
# threshold loop above (the smallest score), not the counts at the 0.5 threshold.
print('AUC Score :',auc)
print('TP :', TP)
print('TN :', TN)
print('FP :', FP)
print('FN :', FN)

AUC Score : 0.48829900000000004
TP : 10000
TN : 0
FP : 100
FN : 0


## Compute accuracy score

In [9]:
def check_equality(dataset_a):
    """Count the rows whose ``y`` value equals their ``y_predict`` value.

    Parameters
    ----------
    dataset_a : pandas.DataFrame
        Frame with ``y`` and ``y_predict`` columns.

    Returns
    -------
    int
        Number of rows where the two columns agree.
    """
    # Element-wise comparison works for any index; the former `col[i]` lookup
    # was label-based and only valid for a default 0..n-1 index.
    matches = dataset_a['y'] == dataset_a['y_predict']
    return int(matches.sum())
# Accuracy = rows where 'y' equals 'y_predict' / total rows.
# dataset_a['y_predict'] was written earlier by calculate_tp_fp_tn_fn(dataset_a).
# NOTE: despite its name, `auc_score` holds the accuracy, not an AUC.
total_Positive = check_equality(dataset_a)
q = len((dataset_a))
auc_score = total_Positive/q
print('Accuracy Score:', auc_score)

Accuracy Score: 0.9900990099009901


# B. Compute performance metrics for the given data '5_b.csv'

In [10]:
# Load the part-B data; the preview shows the 'y' and 'proba' columns used below.
dataset_b=pd.read_csv('5_b.csv')
dataset_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [11]:
# Row count per distinct value of 'y' (shows how the two classes are balanced).
k = dataset_b['y'].value_counts()
k

0.0    10000
1.0      100
Name: y, dtype: int64

## Compute confusion matrix & F1 score

In [12]:
def calculate_tp_fp_tn_fn(dataset2, threshold=0.5):
    """Threshold the scores in ``dataset2`` and count the confusion-matrix cells.

    This re-declares the helper of the same name from part A and shadows it
    from this cell onward.

    A row is predicted ``0.0`` when its ``proba`` is strictly below
    ``threshold`` and ``1.0`` otherwise.

    Parameters
    ----------
    dataset2 : pandas.DataFrame
        Must contain a ``y`` column (labels 0.0 / 1.0) and a ``proba`` column.
        A ``y_predict`` column with the thresholded predictions is added (or
        overwritten) in place; the accuracy cell further down reads it.
    threshold : float, default 0.5
        Decision threshold applied to ``proba``.

    Returns
    -------
    tuple
        ``(TP, TN, FN, FP)`` -- note the order: FN comes before FP.
    """
    # One vectorised comparison instead of a Python loop over every row.
    dataset2['y_predict'] = np.where(dataset2['proba'] < threshold, 0.0, 1.0)

    actual_pos = dataset2['y'] == 1.0
    actual_neg = dataset2['y'] == 0.0
    predicted_pos = dataset2['y_predict'] == 1.0
    predicted_neg = dataset2['y_predict'] == 0.0

    TN = (actual_neg & predicted_neg).sum()
    TP = (actual_pos & predicted_pos).sum()
    FP = (actual_neg & predicted_pos).sum()
    FN = (actual_pos & predicted_neg).sum()

    return (TP,TN,FN,FP)

In [13]:
def compute_f1_score(TP,TN,FN,FP):
    """Return ``(precision, recall, F1)`` computed from confusion-matrix counts.

    This re-declares the helper of the same name from part A and shadows it
    from this cell onward.

    Parameters are taken in the order ``(TP, TN, FN, FP)``; ``TN`` is accepted
    for signature symmetry but is not used by any of the three metrics.

    A metric whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as ``0.0`` instead of
    raising ``ZeroDivisionError`` or producing NaN.
    """
    predicted_positive = FP + TP
    actual_positive = TP + FN

    precision = TP / predicted_positive if predicted_positive else 0.0
    Recall = TP / actual_positive if actual_positive else 0.0

    pr_sum = precision + Recall
    # Harmonic mean of precision and recall.
    F1_Score = (2 * (precision * Recall) / pr_sum) if pr_sum else 0.0

    return (precision,Recall,F1_Score)

def confusion_matrix(TP,TN,FP,FN):
    """Pack the four counts into a 2x2 array laid out as ``[[TN, FP], [FN, TP]]``.

    Arguments are taken in the order ``(TP, TN, FP, FN)``.
    """
    first_row = [TN, FP]
    second_row = [FN, TP]
    return np.array([first_row, second_row])

In [14]:
# Count the confusion-matrix cells for dataset_b, then derive F1 and the 2x2 matrix.
# Mind the argument orders: the counting helper returns (TP, TN, FN, FP), while
# confusion_matrix() takes (TP, TN, FP, FN).
TP,TN,FN,FP = calculate_tp_fp_tn_fn(dataset_b)
precision,Recall,F1_Score = compute_f1_score(TP,TN,FN,FP)
a =confusion_matrix(TP,TN,FP,FN)

# Print the two matrix rows on separate lines, the second one tab-indented.
print('Confusion_Matrix:',a[0],'\n','\t'*2,a[1])
print('F1_Score:', F1_Score)

Confusion_Matrix: [9761  239] 
 		 [45 55]
F1_Score: 0.2791878172588833


## compute AUC score

In [15]:
from tqdm import tqdm

# ROC sweep: every distinct score is used as a decision threshold (highest
# first) and the true/false positive rates obtained at each one are recorded.
tpr=[]
fpr=[]
dataset2 = pd.read_csv('5_b.csv')
uniq_data = list(dataset2.proba.unique())
uniq_data.sort(reverse = True)

# Pull the columns out once as NumPy arrays so each threshold costs a handful
# of vectorised comparisons instead of a Python loop over every row.
scores = dataset2['proba'].to_numpy()
is_positive = dataset2['y'].to_numpy() == 1
is_negative = dataset2['y'].to_numpy() == 0

for tow in tqdm(uniq_data):
    # Score strictly below the threshold -> 0, otherwise 1.
    y_predict_new = np.where(scores < tow, 0, 1)
    predicted_positive = y_predict_new == 1
    predicted_negative = y_predict_new == 0

    TN = (is_negative & predicted_negative).sum()
    TP = (is_positive & predicted_positive).sum()
    FP = (is_negative & predicted_positive).sum()
    FN = (is_positive & predicted_negative).sum()

    tpr.append(TP/(TP+FN))
    fpr.append(FP/(FP+TN))

# Leave the frame holding the predictions of the last (lowest) threshold.
if uniq_data:
    dataset2['y_predict'] = y_predict_new

# The ROC curve starts at (0, 0) (a threshold above every score), so that point
# is prepended before integrating.
# np.trapz takes the integrand first and the sample points second, so despite
# the names `x` holds the TPR values (curve height) and `y` the FPR values.
# (From NumPy 2.0 on the same function is also available as np.trapezoid.)
x = [0.0] + sorted(tpr)
y = [0.0] + sorted(fpr)
auc = np.trapz(x,y)

100%|████████████████████████████████████████████████████████████████████████████| 10100/10100 [03:32<00:00, 47.44it/s]


In [16]:
# TP/TN/FP/FN printed here are the values left by the last iteration of the
# threshold loop above (the smallest score), not the counts at the 0.5 threshold.
print('AUC Score :',auc)
print('TP :', TP)
print('TN :', TN)
print('FP :', FP)
print('FN :', FN)

AUC Score : 0.9377570000000001
TP : 100
TN : 0
FP : 10000
FN : 0


## compute accuracy score

In [17]:
def check_equality(dataset_a):
    """Count the rows whose ``y`` value equals their ``y_predict`` value.

    The count is taken from the frame passed in as ``dataset_a``; the function
    no longer reaches for the global ``dataset_b`` regardless of its argument.

    Parameters
    ----------
    dataset_a : pandas.DataFrame
        Frame with ``y`` and ``y_predict`` columns.

    Returns
    -------
    int
        Number of rows where the two columns agree.
    """
    # Element-wise comparison works for any index; the former `col[i]` lookup
    # was label-based and only valid for a default 0..n-1 index.
    matches = dataset_a['y'] == dataset_a['y_predict']
    return int(matches.sum())
# Accuracy = rows where 'y' equals 'y_predict' / total rows.
# dataset_b['y_predict'] was written earlier by calculate_tp_fp_tn_fn(dataset_b).
# NOTE: despite its name, `auc_score` holds the accuracy, not an AUC.
total_Positive = check_equality(dataset_b)
q = len((dataset_b))
auc_score = total_Positive/q
print('Accuracy Score:', auc_score)

Accuracy Score: 0.9718811881188119


# C. Compute the best probability threshold (similarly to the ROC curve computation) that gives the lowest value of metric <b>A</b> for the given data '5_c.csv', where A = 500 × (number of false negatives) + 100 × (number of false positives)

In [18]:
# Load the part-C data; note the score column is named 'prob' here (not 'proba').
dataset_c=pd.read_csv('5_c.csv')
dataset_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [19]:
#reference :
#https://albertuskelvin.github.io/posts/2019/12/best-threshold-maximize-accuracy-from-roc-pr-curve/
# Cost metric: A = 500 * (false negatives) + 100 * (false positives).
# Every row's score is tried as a threshold, in row order, and the cost of the
# resulting predictions is recorded in `A` (one entry per row).
#
# Prediction rule used here: prob <= threshold -> 0, otherwise 1.
# NOTE(review): parts A and B use the strict `<` comparison; the `<=` rule is
# kept unchanged here -- confirm which convention is intended.
probabilities = dataset_c['prob'].to_numpy()
is_positive = dataset_c['y'].to_numpy() == 1
is_negative = dataset_c['y'].to_numpy() == 0

A = []
for threshold in tqdm(probabilities):
    # Vectorised over all rows instead of a nested Python loop.
    y_predict = np.where(probabilities <= threshold, 0, 1)

    FP = (is_negative & (y_predict == 1)).sum()
    FN = (is_positive & (y_predict == 0)).sum()
    A.append((500 * FN) + (100 * FP))

# Leave the frame holding the predictions of the last threshold tried.
if A:
    dataset_c['y_predict1'] = y_predict

# `min_index` is the smallest cost itself; `threshold_index` is the position of
# the first row whose score achieves it.
min_index = min(A)
threshold_index = A.index(min_index)
# Positional lookup: `A` was filled in dataset_c row order.
print('Threshold Value: ',dataset_c['prob'].iloc[threshold_index])
print('Minimum value of A:', min_index)

100%|█████████████████████████████████████████████████████████████████████████████| 2852/2852 [00:13<00:00, 216.76it/s]

Threshold Value:  0.2298716443615991
Minimum value of A: 141000





# D. Compute performance metrics (for regression) for the given data '5_d.csv'

In [20]:
# Load the part-D regression data; the preview shows the 'y' and 'pred' columns.
dataset_d=pd.read_csv('5_d.csv')
dataset_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [21]:
#reference :
#https://www.kaggle.com/code/paulrohan2020/performance-metrics-without-sklearn
def regression_metrics(dataset4):
    """Compute MSE, a MAPE-style ratio and R-squared for a regression result.

    Parameters
    ----------
    dataset4 : pandas.DataFrame
        Frame with a ``y`` column (actual values) and a ``pred`` column
        (predicted values). The frame is only read; no helper columns are
        added to it.

    Returns
    -------
    tuple
        ``(mse, mape, rsquared)`` where

        * ``mse`` is the mean of the squared errors,
        * ``mape`` is ``sum(|y - pred|) / sum(y)`` as a fraction (the caller
          multiplies by 100 to print a percentage),
        * ``rsquared`` is ``1 - SS_res / SS_tot``.
    """
    n = len(dataset4)

    # Whole-column arithmetic replaces the former row-by-row `apply` calls.
    abs_error = (dataset4['y'] - dataset4['pred']).abs()
    squared_error = abs_error * abs_error

    ssres = squared_error.sum()
    mse = ssres / n
    mape = abs_error.sum() / dataset4['y'].sum()

    mean = dataset4['y'].sum() / n
    centered = dataset4['y'] - mean
    sstotal = (centered * centered).sum()

    rsquared = 1 - (ssres / sstotal)
    return mse,mape,rsquared

In [22]:
# Compute the three regression metrics for dataset_d and report them.
mse,mape,rsquared=regression_metrics(dataset_d)
print('MEAN SQUARED ERROR :',mse)
# `mape` comes back as a fraction, so scale it to a percentage for display.
print('MEAN ABSOLUTE PERCENTAGE ERROR :',mape*100)
print('R SQUARED :',rsquared)

MEAN SQUARED ERROR : 177.16569974554707
MEAN ABSOLUTE PERCENTAGE ERROR : 12.91202994009687
R SQUARED : 0.9563582786990937
