In [12]:
import numpy as np
import pandas as pd

## A. Compute performance metrics for the given data '5_a.csv'

**Reading Files**

In [13]:
df_a=pd.read_csv("5_a.csv")
df_b=pd.read_csv("5_b.csv")
df_c=pd.read_csv("5_c.csv")
df_d=pd.read_csv("5_d.csv")

In [42]:
df_a.head(10)

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199
5,1.0,0.6016
6,1.0,0.666323
7,1.0,0.567012
8,1.0,0.65023
9,1.0,0.829346


In [43]:
df_a['y_pred']=df_a['proba'].apply(lambda x: 1 if x>=0.5 else 0)
df_b['y_pred']=df_b['proba'].apply(lambda x:0 if x<=0.5 else 1)

**Confusion matrix**

In [44]:
def confusion_matrix(df):
    tn=len(df[(df['y']==0) & (df['y_pred']==0)])
    tp=len(df[(df['y']==1) & (df['y_pred']==1)])
    fn=len(df[(df['y']==1) & (df['y_pred']==0)])
    fp=len(df[(df['y']==0) & (df['y_pred']==1)])
    
    return tn,tp,fn,fp

confusion_matrix(df_a)       

(0, 10000, 0, 100)

**F1 score**

In [45]:
def f1_score(df):
    tn,tp,fn,fp=confusion_matrix(df)
    precision=tp/(tp+fp)
    recall=tp/(tp+fn)
    f1=2*((precision*recall)/(precision+recall))
    return f1

**Accuracy value**


In [46]:
def accuracy(df):
    tn,tp,fn,fp=confusion_matrix(df)
    acc=((tp+tn)/(tp+fp+fn+tn))
    return acc

**AUC Value**

In [47]:
def auc_score(df):
    tpr=[]
    fpr=[]
    
    df_sort=df.sort_values('proba',ascending=False)
    for i in range(0,len(df_sort)):
        #https://numpy.org/doc/stable/reference/generated/numpy.where.html
        #ith row and [proba] as column
        df_sort['y_pred']=np.where(df_sort['proba']>=df_sort.iloc[i]['proba'],1,0) 
        tn,tp,fn,fp=confusion_matrix(df)
        
        fp_rate=fp/(tn+fp)
        tp_rate=tp/(tp+fn)
        tpr.append(tp_rate)
        fpr.append(fp_rate)
        auc=np.trapz(tpr, fpr) 
    return auc    

## A. Compute performance metrics for the given data '5_a.csv'

 **1.Compute Confusion Matrix**

In [48]:
tn,tp,fn,fp=confusion_matrix(df_a)
print("False Nagative :",fn)
print("False Positive :",fp)
print("True Nagative :",tn)
print("True Positive :",tp)


False Nagative : 0
False Positive : 100
True Nagative : 0
True Positive : 10000


**2.Compute F1 Score**

In [49]:
f1=f1_score(df_a)
print("F1 Score :",f1)

F1 Score : 0.9950248756218906


**3.Compute AUC Score**

In [50]:
auc=auc_score(df_a)
print("AUC Score :",auc)

AUC Score : 0.0


**4.Compute Accuracy Score**

In [51]:
acc=accuracy(df_a)
print("Accuracy Score:",acc)

Accuracy Score: 0.9900990099009901


## B.Compute performance metrics for the given data '5_b.csv'

**1.Compute Confusion Matrix **

In [52]:
tn,tp,fn,fp=confusion_matrix(df_b)
print("False Nagative :",fn)
print("False Positive :",fp)
print("True Nagative :",tn)
print("True Positive :",tp)

False Nagative : 45
False Positive : 239
True Nagative : 9761
True Positive : 55


**2.Compute F1 Score**

In [53]:
f1=f1_score(df_b)
print("F1 Score :",f1)

F1 Score : 0.2791878172588833


**3.Compute AUC Score**

In [54]:
auc=auc_score(df_b)
print("AUC Score :",auc)

AUC Score : 0.0


**4.Compute Accuracy Score**

In [55]:
acc=accuracy(df_b)
print("Accuracy Score:",acc)

Accuracy Score: 0.9718811881188119


## C.Compute the best threshold (similarly to ROC curve computation)<br>of probability which gives lowest values of metric A for the given data

In [56]:
def best_threshold(df):
    uniq_prob=0;
    thres_prob=[]
    A=[]
    
    #sorting data based on probability
    df_sort=df.sort_values("prob",ascending=False)
    for i in range(0,len(df_sort)):
        # checking unique probability
        if uniq_prob==(df_sort.iloc[i]['prob']):
            continue
        uniq_prob=df_sort.iloc[i]['prob']
        thres_prob.append(uniq_prob)
        #Always comparing with last element of thresold-list as it is increasing
        df_sort['y_pred']=np.where(df_sort['prob']>=thres_prob[-1],0,1)
        # calculating confusion matrix for each threshold
        tn,tp,fn,fp=confusion_matrix(df_sort)
        val=500*fn+100*fp
        A.append(val)
        
    idx=A.index(min(A))    
    return thres_prob[idx]    

In [57]:
b=best_threshold(df_c)
print('Best thresold value :',b)

Best thresold value : 0.9577467989277196


## D.Compute performance metrics(for regression) for the given data 5_d.csv

In [46]:
def regression_performance(df):
    n=len(df)
    #calculate ei=y-y^
    df['ei']= df.apply(lambda x: abs(x['y'] - x['pred']), axis=1) #
    #square of ei
    df['mse']= df['ei'].apply(lambda x: x*x) 
    total=df['mse'].sum()
    #final mse
    mse=total/n
    #mape=sum(ei)/sum(actual value y)
    mape=(df['ei'].sum())/(df['y'].sum())
    #simple mean of y
    mean=(df['y'].sum())/n
    #sum(ei^2)
    ssres=df['mse'].sum()
    df['sstotal']= df.apply(lambda x: (x['y'] - mean), axis=1)
    df['sstotal']= df['sstotal'].apply(lambda x: x*x)
    sstotal=df['sstotal'].sum()
    r_squared=1-(ssres/sstotal)
    df.head(10)
    return mse,mape,r_squared   

In [47]:
mse,mape,r_squared=regression_performance(df_d)
print('Mean squared error :',mse)
print('Mean absolute percentage error :',mape*100)
print('R squared :',r_squared)

Mean squared error : 177.16569974554707
Mean absolute percentage error : 12.91202994009687
R squared : 0.9563582786990937
