# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two we should not import any other packages to calculate performance metrics

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data we can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> we need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, we need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
a5 = pd.read_csv('5_a.csv')
a5.head(10)


Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199
5,1.0,0.6016
6,1.0,0.666323
7,1.0,0.567012
8,1.0,0.65023
9,1.0,0.829346


In [3]:
a5['y_pred'] = a5['proba'].apply(lambda x : 0 if x < 0.5 else 1)

# knowing how the data is distributed
print(a5['y'].value_counts())
print(a5['y_pred'].value_counts())

1.0    10000
0.0      100
Name: y, dtype: int64
1    10100
Name: y_pred, dtype: int64


## Confusion Matrix

In [13]:
def confusion_matrix(df):
    TN = len(df[(df['y'] == 0) & (df['y_pred'] == 0)])
    FN = len(df[(df['y'] == 1) & (df['y_pred'] == 0)])
    FP = len(df[(df['y'] == 0) & (df['y_pred'] == 1)])
    TP = len(df[(df['y'] == 1) & (df['y_pred'] == 1)])
    return TN,TP,FN,FP


TN,TP,FN,FP = confusion_matrix(a5)
print("Confusion Matrix =")
print(np.array([[TN,FN],[FP,TP]]))

Confusion Matrix =
[[    0     0]
 [  100 10000]]


## F1 score

In [7]:
def f1(df):
    TN,TP,FN,FP = confusion_matrix(df)
    precision = TP/(TP+FP)
    recall = TP/(FN+TP)
    return (2*((precision*recall)/(precision+recall)))

print("F1 score =",f1(a5))

F1 score = 0.9950248756218906


## Area Under Curve

In [49]:
from tqdm import tqdm
def auc(df):
    TPR_list = []
    FPR_list = []
    
    #Creating new Dataframe which is sorted by proba values
    sorted_df = df.sort_values(by='proba', ascending = False)
    
    #list of threshold values decending
    thresholds = sorted_df['proba']
    
    #loop to calculate TPR and FPR for each threshold values and returning AUC
    for threshold in tqdm(thresholds):
        sorted_df['y_pred'] = sorted_df['proba'].apply(lambda x : 0 if x < threshold else 1)
        TN,TP,FN,FP = confusion_matrix(sorted_df)
        TPR = TP/(FN+TP)
        FPR = FP/(FP+TN)
        TPR_list.append(TPR)
        FPR_list.append(FPR)
    
    return np.trapz(TPR_list, FPR_list)
print("Area Under Curve =",auc(a5))

100%|███████████████████████████████████████████████████████████████████████████| 10100/10100 [01:08<00:00, 147.63it/s]

Area Under Curve = 0.48829900000000004





## Accuracy 

In [50]:
def accuracy(df):
    TN,TP,FN,FP = confusion_matrix(df)
    return (TN+TP)/(TN+TP+FN+FP)
print("Accuracy = ", accuracy(a5))

Accuracy =  0.9900990099009901


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data we can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> we need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, we need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [18]:
b5 = pd.read_csv('5_b.csv')
b5.head(10)

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648
5,0.0,0.19026
6,0.0,0.320328
7,0.0,0.435013
8,0.0,0.284849
9,0.0,0.427919


In [19]:
b5['y_pred'] = b5['proba'].apply(lambda x : 0 if x < 0.5 else 1)

print(b5['y'].value_counts())
print(b5['y_pred'].value_counts())

0.0    10000
1.0      100
Name: y, dtype: int64
0    9806
1     294
Name: y_pred, dtype: int64


## Confusion Matrix

In [20]:
TN,TP,FN,FP = confusion_matrix(b5)
print("Confusion Matrix =")
print(np.array([[TN,FN],[FP,TP]]))

Confusion Matrix =
[[9761   45]
 [ 239   55]]


## F1 score

In [21]:
print("F1 score = ",f1(b5))

F1 score =  0.2791878172588833


## Area Under Curve

In [22]:
print("Area Under Curve =", auc(b5))

100%|███████████████████████████████████████████████████████████████████████████| 10100/10100 [01:12<00:00, 139.48it/s]

Area Under Curve = 0.9377570000000001





## Accuracy 

In [51]:
accuracy(b5)

0.9718811881188119

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

we will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data we can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [23]:
c5 = pd.read_csv('5_c.csv')
c5.head(10)

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579
5,0,0.595387
6,0,0.370288
7,0,0.299273
8,0,0.297
9,0,0.266479


In [24]:
c5['y_pred'] = c5['prob'].apply(lambda x : 0 if x < 0.5 else 1)

print(c5['y'].value_counts())
print(c5['y_pred'].value_counts())


0    1805
1    1047
Name: y, dtype: int64
0    2099
1     753
Name: y_pred, dtype: int64


## Best Threshold Value 

In [47]:
from tqdm import tqdm
def best_threshold(df):
    
    # Empty dictonary for storing threshold and its metric value
    a_dict = {} 
    
    sorted_df = df.sort_values(by='prob', ascending = False)
    thresholds = set(sorted_df['prob'])
    
    for threshold in tqdm(thresholds):
        sorted_df['y_pred'] = sorted_df['prob'].apply(lambda x : 0 if x < threshold else 1)
        TN,TP,FN,FP = confusion_matrix(sorted_df)
        a_metric = (500*FN) + (100*FP)
        
        # Adding threshold and its metric value to dict
        a_dict[threshold] = a_metric
    
    # min from values
    minimum = min(a_dict.values())
    
    # equivalent threshold of min value
    for k,v in a_dict.items():
        if v == minimum:
            return k
    
print("Best Threshold Value = ", best_threshold(c5))

100%|█████████████████████████████████████████████████████████████████████████████| 2791/2791 [00:12<00:00, 224.23it/s]

Best Threshold Value =  0.2300390278970873





<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> we will be having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [25]:
d5 = pd.read_csv('5_d.csv')
d5.head(10)

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0
5,133.0,153.0
6,148.0,139.0
7,172.0,145.0
8,153.0,162.0
9,162.0,154.0


In [26]:
len(d5)

157200

## Mean Square Error

In [59]:
total = 0
for i in range(len(d5)):
    diff = (d5['y'][i] - d5['pred'][i]) * (d5['y'][i] - d5['pred'][i])
    total = total + diff
mse = total/len(d5)
print("Mean Square Error = ", mse)

Mean Square Error =  177.16569974554707


In [56]:
#Using Numpy, found that it takes very less time to compute
data = np.loadtxt('5_d.csv', delimiter=',', skiprows=1)
y = data[:, 0]
pred = data[:, 1]

n = len(data)
mse = np.sum((y - pred)**2) / len(data)
print("Mean Square Error = ", mse)

Mean Square Error =  177.16569974554707


## MAPE

In [57]:
MAPE = (np.sum(np.absolute(y - pred)) / np.sum(y))*100
print("MAPE = ", MAPE)

MAPE =  12.91202994009687


## R^2 ( Coeffecient of Determination )

In [58]:
y_mean = np.mean(np.absolute(y))
SS_total = np.sum((y - y_mean)**2)
SS_res = np.sum((y - pred)**2)
r2 = 1 - (SS_res / SS_total)
print("Coeffecient of Determination = ",r2)

Coeffecient of Determination =  0.9563582786990937
