# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
# Read the part A data (true label plus predicted score) and preview the first rows.
df_a = pd.read_csv(r'D:\Datasets\Assignment5\5_a.csv')
df_a.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [3]:
def Predict(lst,num=0.5):
    """Convert scores into hard 0/1 labels.

    Each score that is greater than or equal to ``num`` becomes 1, every
    other score becomes 0. The result is a plain list in the same order
    as ``lst``.
    """
    return [1 if score >= num else 0 for score in lst]

def TruePositive(d):
    """Count the rows of ``d`` whose actual label ``y`` is 1 and whose ``y_pred`` is 1."""
    hits = (d['y'] == 1.0) & (d['y_pred'] == 1)
    return int(hits.sum())

def TrueNegative(d):
    """Count the rows of ``d`` whose actual label ``y`` is 0 and whose ``y_pred`` is 0."""
    correct_rejections = (d['y'] == 0.0) & (d['y_pred'] == 0)
    return int(correct_rejections.sum())

def FalsePositive(d):
    """Count the rows of ``d`` whose actual label ``y`` is 0 but whose ``y_pred`` is 1."""
    false_alarms = (d['y'] == 0.0) & (d['y_pred'] == 1)
    return int(false_alarms.sum())

def FalseNegative(d):
    """Count the rows of ``d`` whose actual label ``y`` is 1 but whose ``y_pred`` is 0."""
    misses = (d['y'] == 1.0) & (d['y_pred'] == 0)
    return int(misses.sum())

def f1_score(ConfMat):
    """Return the F1 score for a 2x2 confusion matrix.

    Parameters
    ----------
    ConfMat : array-like of shape (2, 2)
        Counts laid out as ``[[TP, FP], [FN, TN]]``.

    Returns
    -------
    float
        Harmonic mean of precision ``TP / (TP + FP)`` and recall
        ``TP / (TP + FN)``. When there are no true positives the score is
        defined as 0.0 instead of evaluating 0/0.
    """
    cm = np.asarray(ConfMat)
    tp, fp, fn = cm[0, 0], cm[0, 1], cm[1, 0]
    # With TP == 0 either precision, recall or their sum is 0/0; F1 is 0 there.
    if tp == 0:
        return 0.0
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    return 2 * (precision * recall) / (precision + recall)

def auc(df):
    """Area under the ROC curve, using every distinct score as a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``y`` column holding the true labels (1 = positive,
        0 = negative) and the scores in its second column (position 1).
        The frame is not modified and does not need to be pre-sorted.

    Returns
    -------
    float
        Trapezoidal area under the (FPR, TPR) curve. The curve starts at
        (0, 0) and a row counts as predicted positive when its score is
        greater than or equal to the threshold.

    Raises
    ------
    ValueError
        If ``df`` does not contain at least one positive and one negative
        row, because TPR or FPR would be undefined.
    """
    labels = df['y'].to_numpy()
    scores = df.iloc[:, 1].to_numpy()

    n_pos = int(np.count_nonzero(labels == 1))
    n_neg = int(np.count_nonzero(labels == 0))
    if n_pos == 0 or n_neg == 0:
        raise ValueError("AUC needs at least one positive and one negative sample")

    # Walk the rows from the highest score down; the running counts are the
    # TP / FP totals obtained when the threshold equals the current score.
    order = np.argsort(scores, kind="mergesort")[::-1]
    sorted_scores = scores[order]
    sorted_labels = labels[order]
    tp_running = np.cumsum(sorted_labels == 1)
    fp_running = np.cumsum(sorted_labels == 0)

    # Rows with equal scores share a threshold, so keep one ROC point per run
    # of ties: the last row of the run, where all tied rows have been counted.
    run_ends = np.flatnonzero(sorted_scores[1:] != sorted_scores[:-1])
    run_ends = np.append(run_ends, sorted_scores.size - 1)

    # Prepend the (0, 0) point so area below the first threshold is included.
    tpr_arr = np.concatenate(([0.0], tp_running[run_ends] / n_pos))
    fpr_arr = np.concatenate(([0.0], fp_running[run_ends] / n_neg))

    # NumPy 2 renames trapz to trapezoid; use whichever this install provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    return float(integrate(tpr_arr, fpr_arr))
    

In [4]:
# Threshold the score column (position 1) at the default 0.5 and store the
# resulting 0/1 labels next to the scores.
a = df_a.iloc[:, 1].to_numpy()
y_pred = Predict(a)
df_a['y_pred'] = y_pred
df_a.head()

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1


In [5]:
# Tally the four confusion-matrix cells for dataset A.
TP, FP = TruePositive(df_a), FalsePositive(df_a)
TN, FN = TrueNegative(df_a), FalseNegative(df_a)

In [6]:
# Confusion matrix laid out as [[TP, FP], [FN, TN]], followed by the F1 score.
cm1 = np.array([[TP, FP], [FN, TN]])
print('Confusion Matrix:', cm1, sep='\n')
print()
F1_score = f1_score(cm1)
print('F1-Score = ', F1_score)

Confusion Matrix:
[[10000   100]
 [    0     0]]

F1-Score =  0.9950248756218906


In [7]:
# Order the rows from highest to lowest score and leave out the 0.5-threshold
# labels, which are not needed for the AUC computation.
sorted_df_a = df_a.sort_values(by='proba', ascending=False).drop(columns='y_pred')
AUC_score = auc(sorted_df_a)
print('AUC score = ', AUC_score)

AUC score =  0.48829900000000004


In [8]:
# Accuracy = correctly classified rows / all rows.
acc_score = (TP + TN) / (TP + FP + FN + TN)
print('Accuracy score = ', acc_score)

Accuracy score =  0.9900990099009901




## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [9]:
# Read the part B data (true label plus predicted score) and preview the first rows.
df_b = pd.read_csv(r'D:\Datasets\Assignment5\5_b.csv')
df_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [10]:
# Threshold the score column (position 1) at the default 0.5 and store the
# resulting 0/1 labels next to the scores.
a = df_b.iloc[:, 1].to_numpy()
y_pred = Predict(a)
df_b['y_pred'] = y_pred
df_b.head()

Unnamed: 0,y,proba,y_pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0


In [11]:
# Tally the four confusion-matrix cells for dataset B.
TP, FP = TruePositive(df_b), FalsePositive(df_b)
TN, FN = TrueNegative(df_b), FalseNegative(df_b)

In [12]:
# Confusion matrix laid out as [[TP, FP], [FN, TN]], followed by the F1 score.
cm2 = np.array([[TP, FP], [FN, TN]])
print('Confusion Matrix:', cm2, sep='\n')
print()
F1_score = f1_score(cm2)
print('F1-Score = ', F1_score)

Confusion Matrix:
[[  55  239]
 [  45 9761]]

F1-Score =  0.2791878172588833


In [13]:
# Order the rows from highest to lowest score and leave out the 0.5-threshold
# labels, which are not needed for the AUC computation.
sorted_df_b = df_b.sort_values(by='proba', ascending=False).drop(columns='y_pred')
AUC_score = auc(sorted_df_b)
print('AUC score = ', AUC_score)

AUC score =  0.9377570000000001


In [14]:
# Accuracy = correctly classified rows / all rows.
acc_score = (TP + TN) / (TP + FP + FN + TN)
print('Accuracy score = ', acc_score)

Accuracy score =  0.9718811881188119


### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives the lowest value of metric <b>A</b> for the given data
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [15]:
def A(df):
    p = df.iloc[:,1].to_numpy()
    lst = []
    for i in p:
        y_pred = Predict(p,i)
        df['y_pred'] = y_pred
        FN = FalseNegative(df)
        FP = FalsePositive(df)
        a = 500*FN + 100*FP
        lst.append(a)
    return lst

In [16]:
# Evaluate metric A with every row's probability as the threshold, then report
# the probability whose A value is the smallest.
df_c = pd.read_csv(r'D:\Datasets\Assignment5\5_c.csv')
sorted_df_c = df_c.sort_values(by='prob', ascending=False)
A_score = A(sorted_df_c)
sorted_df_c['A_score'] = A_score
i = sorted_df_c['A_score'].idxmin()  # index label of the cheapest threshold
thresh = sorted_df_c.loc[i, 'prob']
print('Best Threshold value = ', thresh)

Best Threshold value =  0.2300390278970873



## D. Compute performance metrics (for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [17]:
# Read the regression data (actual value and prediction) and preview the first rows.
df_d = pd.read_csv(r'D:\Datasets\Assignment5\5_d.csv')
df_d.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [18]:
def MSE(y,y_pred):
    """Mean squared error between actual values and predictions.

    Parameters
    ----------
    y, y_pred : array-like
        Actual and predicted values of equal length. Values are paired by
        position, so a pandas Series works whatever its index labels are.

    Returns
    -------
    float
        ``mean((y - y_pred) ** 2)``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError("y and y_pred must have the same shape")
    if actual.size == 0:
        raise ValueError("MSE is undefined for empty input")
    return float(np.mean((actual - predicted) ** 2))
def MAPE(y,y_pred):
    """Aggregate absolute percentage error: ``sum(|y - y_pred|) / sum(y)``.

    This is the ratio of the total absolute error to the total of the actual
    values, not the mean of per-sample percentage errors, so individual
    zeros in ``y`` do not cause a division by zero.

    Parameters
    ----------
    y, y_pred : array-like
        Actual and predicted values of equal length. Values are paired by
        position, so a pandas Series works whatever its index labels are.

    Returns
    -------
    float
        The error as a fraction (0.1 means 10%).

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or the actual values sum
        to zero.
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError("y and y_pred must have the same shape")
    if actual.size == 0:
        raise ValueError("MAPE is undefined for empty input")
    total_actual = actual.sum()
    if total_actual == 0:
        raise ValueError("MAPE is undefined when the actual values sum to zero")
    return float(np.abs(actual - predicted).sum() / total_actual)
def RSquared(y,y_pred):
    """Coefficient of determination ``1 - SS_residual / SS_total``.

    ``SS_residual`` is the sum of squared differences between ``y`` and
    ``y_pred``; ``SS_total`` is the sum of squared differences between ``y``
    and its mean.

    Parameters
    ----------
    y, y_pred : array-like
        Actual and predicted values of equal length. Values are paired by
        position, so a pandas Series works whatever its index labels are.

    Returns
    -------
    float
        The R^2 value.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or ``y`` is constant
        (``SS_total`` would be zero).
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError("y and y_pred must have the same shape")
    if actual.size == 0:
        raise ValueError("R^2 is undefined for empty input")
    ss_residual = np.sum((actual - predicted) ** 2)
    ss_total = np.sum((actual - actual.mean()) ** 2)
    if ss_total == 0:
        raise ValueError("R^2 is undefined when y has zero variance")
    return float(1 - ss_residual / ss_total)

In [19]:
# Column 0 holds the actual values and column 1 the predictions.
y, y_p = df_d.iloc[:, 0], df_d.iloc[:, 1]
print('Mean Squared Error = ', MSE(y, y_p))

Mean Squared Error =  177.16569974554707


In [20]:
# Report MAPE for the same actual/predicted pair used for the MSE.
print('Mean Absolute Percentage Error = ', MAPE(y, y_p))

Mean Absolute Percentage Error =  0.1291202994009687


In [21]:
# Report R^2 for the same actual/predicted pair used for the MSE.
print('R Squared = ', RSquared(y, y_p))

R Squared =  0.9563582786990964
