# Compute performance metrics for the given Y and Y_score without sklearn

In [3]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [14]:
df_a=pd.read_csv('5_a.csv')
threshold = 0.5

In [15]:
def Y_pred(df_a,threshold):
    y_pred = []
    for index in range(len(df_a)):
        if  df_a['proba'][index] < threshold:
            y_pred.append('0')
        else:
            y_pred.append('1')
    df_a['y_pred'] = y_pred
    return(df_a)

In [16]:
df_a

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199
...,...,...
10095,1.0,0.665371
10096,1.0,0.607961
10097,1.0,0.777724
10098,1.0,0.846036


In [17]:
def confusion_matix(df_a,threshold):
    df_a = Y_pred(df_a,threshold)
    TN = 0
    FN = 0
    TP = 0
    FP = 0
    for y in range(len(df_a)):
        if((df_a['y'][y]== 0) and (df_a['y_pred'][y]== '0')):
            TN += 1
        elif((df_a['y'][y]== 1) and (df_a['y_pred'][y]=='0')):
            FN += 1
        elif((df_a['y'][y]== 0) and (df_a['y_pred'][y]== '1')):
            FP += 1
        elif((df_a['y'][y]== 1) and (df_a['y_pred'][y]== '1')):
            TP += 1
    return({"confusion_matix":[{"TN":TN,"FN":FN,"FP":FP,"TP":TP}]})

In [18]:
confusion_matix(df_a,threshold)

{'confusion_matix': [{'TN': 0, 'FN': 0, 'FP': 100, 'TP': 10000}]}

In [19]:
def F_1_score(df_a,threshold):
    result = confusion_matix(df_a,threshold)
    precision = result['confusion_matix'][0]['TP']/(result['confusion_matix'][0]['TP']+result['confusion_matix'][0]['FP'])
    value_y = df_a.y.value_counts()
    recall = result['confusion_matix'][0]['TP'] / value_y[1]
    F_1_Score = 2*(precision * recall)/(precision + recall)
    return({"F_1_score":F_1_Score,"precision":precision,"recall":recall})

In [20]:
from tqdm import tqdm
def auc(df_a):
    df_a = df_a.sort_values(by='proba',ascending=False)
    df_a.reset_index(drop=True, inplace=True)
    value_y = df_a.y.value_counts()
    P_ve = value_y[1]
    N_ve = value_y[0]
    TPR = []
    FPR = []
    for index in tqdm(range(len(df_a))):
        threshold = df_a['proba'][index]
        result = confusion_matix(df_a,threshold)
        TPR.append(result['confusion_matix'][0]['TP'] / P_ve)
        FPR.append(result['confusion_matix'][0]['FP'] / N_ve)
        df_a.drop(columns=['y_pred'])   
    AUC = np.trapz(TPR,FPR)
    return({"AUC":AUC})
    

In [21]:
def accuracy_score(df_a,threshold):
    result = confusion_matix(df_a,threshold)
    Accuracy_Score = (result['confusion_matix'][0]['TP']+result['confusion_matix'][0]['TN'])/len(df_a)
    return({"Accuracy_Score":Accuracy_Score})



## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [None]:
df_b=pd.read_csv('5_b.csv')
threshold = 0.5

In [None]:
confusion_matix(df_b,threshold)

In [None]:
F_1_score(df_b,threshold)

In [None]:
auc(df_b)

In [None]:
accuracy_score(df_b,threshold)

### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data 
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [None]:
# write your code here for task A

In [22]:
df_c=pd.read_csv('5_c.csv')
df_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [26]:
df_c = df_c.rename({'prob': 'proba'}, axis=1)

In [37]:
from tqdm import tqdm
def matrix(df_a):
    df_a = df_a.sort_values(by='proba',ascending=False)
    df_a.reset_index(drop=True, inplace=True)
    value_y = df_a.y.value_counts()
    P_ve = value_y[1]
    N_ve = value_y[0]
    MA = {}
    for index in tqdm(range(len(df_a))):
        threshold = df_a['proba'][index]
        result = confusion_matix(df_a,threshold)
        m_value = (500*result['confusion_matix'][0]['FN'])+(100*result['confusion_matix'][0]['FP'])
        MA[threshold] = m_value
    df_a.drop(columns=['y_pred'])   
    return({"Matric":MA})
    

In [38]:
values_min = matrix(df_c)
min_value = min(value_min.values())
for y,x in value_min.items():
    if value == x:
        print(y,x)

100%|██████████████████████████████████████████████████████████████████████████████| 2852/2852 [04:44<00:00, 10.01it/s]

0.2300390278970873 141000





In [None]:
 # write your code for task C


## D.</b></font> Compute performance metrics(for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [45]:
df_d=pd.read_csv('5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [47]:
data_d['error']=error(data_d,'y','pred')
data_d['abs_error']=absolute_error(data_d,'error')

In [48]:
data_d['error']

0          1.0
1         20.0
2         18.0
3         39.0
4          2.0
          ... 
157195     4.0
157196    11.0
157197    13.0
157198     4.0
157199   -23.0
Name: error, Length: 157200, dtype: float64

In [None]:
 # write your code for task 5d