# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

## 5_a.csv dataset

In [2]:
dataset = pd.read_csv('5_a.csv')
dataset.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


### Confusion matrix, F1 score and Accuracy

In [3]:
def generate_ypred_class_labels(dataset):
    dataset['ypred'] = dataset['proba'].apply(lambda x: 0 if x < 0.5 else 1)
    return dataset

In [4]:
def generate_various_metrices(dataset):    
    TP = np.sum((dataset['y']==1) & (dataset['ypred']==1))
    TN = np.sum((dataset['y']==0) & (dataset['ypred']==0))
    FP = np.sum((dataset['y']==0) & (dataset['ypred']==1))
    FN = np.sum((dataset['y']==1) & (dataset['ypred']==0))
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    print('***** Confustion matrix of 5_a.csv *****')
    print('TP is:',TP)
    print('TN is:',TN)
    print('FP is:',FP)
    print('FN is:',FN)
    print('***** F1 score of 5_a.csv *****')
    print('F1 score:', (2*precision*recall)/(precision+recall))
    print('***** Accuracy score of 5_a.csv *****')
    print('Accuracy is:',accuracy)

In [5]:
generate_ypred_class_labels(dataset)

Unnamed: 0,y,proba,ypred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1
...,...,...,...
10095,1.0,0.665371,1
10096,1.0,0.607961,1
10097,1.0,0.777724,1
10098,1.0,0.846036,1


In [6]:
generate_various_metrices(dataset)

***** Confustion matrix of 5_a.csv *****
TP is: 10000
TN is: 0
FP is: 100
FN is: 0
***** F1 score of 5_a.csv *****
F1 score: 0.9950248756218906
***** Accuracy score of 5_a.csv *****
Accuracy is: 0.9900990099009901


### AUC 

In [7]:
uniquevalues = dataset.proba.unique()
uniquevalues.sort()
print(uniquevalues)

[0.50001859 0.50004734 0.50005801 ... 0.89982485 0.89982831 0.89996535]


In [8]:
def generate_tpr_fpr_auc(dataset, feature):

    Truepositive = []
    Falsepositive = []
    Metrics = {}

    for i in uniquevalues:
        dataset['ypred'] = np.where(dataset[feature] >= i, 1, 0)
        TP = np.sum((dataset['y']==1) & (dataset['ypred']==1))
        FP = np.sum((dataset['y']==0) & (dataset['ypred']==1))
        TN = np.sum((dataset['y']==0) & (dataset['ypred']==0))
        FN = np.sum((dataset['y']==1) & (dataset['ypred']==0))
        TPR = TP/(TP+FN)
        FPR = FP/(FP+TN)
        Truepositive.append(TPR)
        Falsepositive.append(FPR)
        Metrics[i] = ((500 * FN) + (100 * FP))
        #Metrics.append((500 * FN) + (100 * FP))
    
    return Falsepositive, Truepositive, Metrics

In [9]:
Falsepositive, Truepositive, Metrics = generate_tpr_fpr_auc(dataset, 'proba')

In [10]:
print('***** AUC score is:',np.trapz(Falsepositive,Truepositive),'*****')

***** AUC score is: -0.511701 *****


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negatives points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

## 5_b.csv dataset

In [11]:
dataset = pd.read_csv('5_b.csv')
dataset.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [12]:
generate_ypred_class_labels(dataset)

Unnamed: 0,y,proba,ypred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0
...,...,...,...
10095,0.0,0.474401,0
10096,0.0,0.128403,0
10097,0.0,0.499331,0
10098,0.0,0.157616,0


### Confusion matrix, F1 score and Accuracy

In [13]:
generate_various_metrices(dataset)

***** Confustion matrix of 5_a.csv *****
TP is: 55
TN is: 9761
FP is: 239
FN is: 45
***** F1 score of 5_a.csv *****
F1 score: 0.2791878172588833
***** Accuracy score of 5_a.csv *****
Accuracy is: 0.9718811881188119


In [14]:
uniquevalues = dataset.proba.unique()
uniquevalues.sort()
print(uniquevalues)

[0.10000141 0.1001608  0.10016508 ... 0.59219787 0.5948084  0.59529418]


### AUC score

In [15]:
Falsepositive, Truepositive, Metrics = generate_tpr_fpr_auc(dataset,'proba')

In [16]:
print('***** AUC score is:',np.trapz(Falsepositive,Truepositive),'*****')

***** AUC score is: -0.06224300000000001 *****


<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives lowest values of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negative} + 100 \times \text{numebr of false positive}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

## 5_c.csv dataset

In [17]:
dataset = pd.read_csv('5_c.csv')
dataset.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [18]:
uniquevalues = dataset.prob.unique()
uniquevalues.sort()
print(uniquevalues)
Falsepositive, Truepositive, Metrics = generate_tpr_fpr_auc(dataset, 'prob')

[0.02803799 0.02839574 0.02896366 ... 0.94863779 0.95143692 0.9577468 ]


In [19]:
min_metric = min(Metrics.values())
Metrics
for i in Metrics:
    if Metrics[i] == min_metric:
        print('Best threshold is found at the prob {} and the value is {}'.format(i,min_metric))

Best threshold is found at the prob 0.2300390278970873 and the value is 141000


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> will having two columns Y and predicted_Y both are real valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [20]:
dataset = pd.read_csv('5_d.csv')
dataset.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [21]:
def performance_metrics_of_regression(dataset):
    
    dataset['error'] = dataset['y'] - dataset['pred']
    dataset['abs_error'] = abs(dataset['error']/dataset['y'])
    MSE = np.mean(dataset['error'] * dataset['error'])
    MAPE = np.mean(dataset['abs_error'] / dataset['y'])
    Sum_residual = np.sum(dataset['error'] * dataset['error'])
    Mean_observed_data = np.mean(dataset['y'])
    Sum_total = np.sum((dataset['y'] - Mean_observed_data) * (dataset['y'] - Mean_observed_data)) 
    R2_score = 1 - (Sum_residual/Sum_total)

    return MSE, MAPE, R2_score

In [22]:
MSE, MAPE, R2_score = performance_metrics_of_regression(dataset)

In [23]:
print('*****Performance metrics of 5_c.csv*****')
print('MSE is:', MSE)
print('MAPE is:', MAPE)
print('R2_score:', R2_score)

*****Performance metrics of 5_c.csv*****
MSE is: 177.16569974554707
MAPE is: inf
R2_score: 0.9563582786990937
