In [93]:
# Importing just numpy
import numpy as np

In [94]:
# Load the classification results from CSV into a NumPy array; the first
# (header) row is skipped.
# NOTE(review): which column is the actual label and which is the prediction
# is not visible from this file -- confirm against the CSV header.
data = np.loadtxt(
    'Data/Random_classification.csv',
    delimiter=',',
    skiprows=1,
)

**Metrics Explored:**
- TP
- TN
- FP
- FN
- Accuracy
- Precision
- Recall
- F1_Score
- Specificity
- Confusion Matrix
- False Positive Ratio
- Balanced Accuracy
- Matthews_Correlation_Coefficient

# Some important Definitions for Classification

In [95]:
# True Positive
def True_positive(data):
    """Count the instances that are correctly marked as 1.

    A row is counted when its column 0 equals 1 and its column 1 holds the
    same value.

    Parameters
    ----------
    data : sequence of rows
        Each row is read as ``row[0]`` and ``row[1]``.

    Returns
    -------
    int
        Number of matching rows (0 for empty input).
    """
    matches = 0
    for row in data:
        first, second = row[0], row[1]
        if first != 1:
            continue
        if first == second:
            matches += 1
    return matches

# True Negative
def True_Negative(data):
    """Count the instances that are correctly marked as 0.

    A row is counted when its column 0 equals 0 and its column 1 holds the
    same value.

    Parameters
    ----------
    data : sequence of rows
        Each row is read as ``row[0]`` and ``row[1]``.

    Returns
    -------
    int
        Number of matching rows (0 for empty input).
    """
    matches = 0
    for row in data:
        first, second = row[0], row[1]
        if first != 0:
            continue
        if first == second:
            matches += 1
    return matches

# False Positive
def False_positive(data):
    """Count the rows whose column 0 equals 1 while column 1 holds a different value.

    NOTE(review): whether these rows are false positives depends on the column
    order. If column 0 is the predicted label they are false positives; if
    column 0 is the actual label (the comment in ROC_curve treats column 1 as
    the predicted scores) they are instances wrongly marked as 0, i.e. false
    negatives. Confirm the column order against the CSV header.

    Parameters
    ----------
    data : sequence of rows
        Each row is read as ``row[0]`` and ``row[1]``.

    Returns
    -------
    int
        Number of matching rows (0 for empty input).
    """
    return sum(1 for row in data if row[0] == 1 and row[0] != row[1])

# False Negative
def False_Negative(data):
    """Count the rows whose column 0 equals 0 while column 1 holds a different value.

    NOTE(review): whether these rows are false negatives depends on the column
    order. If column 0 is the predicted label they are false negatives; if
    column 0 is the actual label (the comment in ROC_curve treats column 1 as
    the predicted scores) they are instances wrongly marked as 1, i.e. false
    positives. Confirm the column order against the CSV header.

    Parameters
    ----------
    data : sequence of rows
        Each row is read as ``row[0]`` and ``row[1]``.

    Returns
    -------
    int
        Number of matching rows (0 for empty input).
    """
    return sum(1 for row in data if row[0] == 0 and row[0] != row[1])

In [96]:
# Confusion-matrix counts for the loaded dataset; later cells read these
# module-level names (e.g. as fallback inputs for F1_score).
TP, TN, FP, FN = (
    True_positive(data),
    True_Negative(data),
    False_positive(data),
    False_Negative(data),
)

# Checking the model performance

### Accuracy

In [30]:
# Accuracy
def Accuracy(TP,FP,TN,FN):
    """Return the accuracy as a percentage.

    Formula: 100 * (TP + TN) / (TP + FP + TN + FN)
    Range: [0, 100] for non-negative counts (the result is scaled by 100,
    it is not a fraction).
    Measures the share of correct predictions among all predictions.
    Best use: when classes are balanced and the cost of false positives and
    false negatives is similar.
    Not to use: on imbalanced datasets, where a high accuracy may simply
    reflect the majority class and mask poor performance on the minority class.

    The denominator is not guarded: with plain Python numbers a total of 0
    raises ZeroDivisionError.
    """
    correct = TP + TN
    total = TP + FP + TN + FN
    return correct * 100 / total

In [50]:
# Precision
def Precision(TP,FP):
    """Return the precision: TP / (TP + FP).

    Range: [0, 1] for non-negative counts.
    Fraction of the instances predicted as positive that are actually positive.
    Best use: when the cost of false positives is high (e.g. spam detection,
    or medical diagnosis where a false alarm leads to unnecessary treatment).
    Not to use: on its own, because it ignores false negatives; a model can
    keep FP low simply by predicting very few positives.

    The denominator is not guarded: with plain Python numbers TP + FP == 0
    raises ZeroDivisionError.
    """
    predicted_positive = TP + FP
    return TP / predicted_positive

In [51]:
# Recall
def Recall(TP, FN):
    """Return the recall: TP / (TP + FN), also known as true positive rate or sensitivity.

    Range: [0, 1] for non-negative counts.
    Fraction of the actually positive instances that were predicted as positive.
    Best use: when missing a positive instance is costly (e.g. disease
    screening, where missing a true case is dangerous).
    Not to use: on its own, because maximising recall tends to increase false
    positives, which hurts where false alarms are a problem.

    The denominator is not guarded: with plain Python numbers TP + FN == 0
    raises ZeroDivisionError.
    """
    actual_positive = TP + FN
    return TP / actual_positive

In [52]:
# F1_score
def F1_score(precision=None, recall=None):
    """Return the F1 score: 2 * (precision * recall) / (precision + recall).

    Range: [0, 1]
    Best use: when a balance between precision and recall is needed,
    particularly with an uneven class distribution.
    Not to use: when true negatives matter, since F1 ignores them and may not
    reflect overall model performance.

    Parameters
    ----------
    precision : float, optional
        Precision of the model. When omitted it is computed from the
        module-level TP and FP counts at call time.
    recall : float, optional
        Recall of the model. When omitted it is computed from the
        module-level TP and FN counts at call time.

    Returns
    -------
    float
        The harmonic mean of precision and recall; 0.0 when both are 0.
    """
    # Resolve the fallbacks at call time. Computing them in the signature
    # freezes the values at definition time, so they go stale when the counts
    # change and a zero denominator would make the ``def`` itself fail.
    if precision is None:
        precision = Precision(TP, FP)
    if recall is None:
        recall = Recall(TP, FN)

    total = precision + recall
    if total == 0:
        # No true positives at all: F1 is conventionally 0 rather than an error.
        return 0.0
    return 2 * (precision * recall) / total

In [54]:
# Specificity
def Specificity(TN, FP):
    """Return the specificity (true negative rate): TN / (TN + FP).

    Range: [0, 1] for non-negative counts.
    Fraction of the actually negative instances that were predicted as negative.
    Best use: when correctly identifying negatives is important (e.g.
    confirming that a healthy patient is truly healthy).
    Not to use: on its own with many negatives, where specificity can look
    high even if the model misses many positives.

    The denominator is not guarded: with plain Python numbers TN + FP == 0
    raises ZeroDivisionError.
    """
    actual_negative = TN + FP
    return TN / actual_negative
    

In [56]:
# Confusion_matrix
def Confusion_matrix(TP, FP, TN, FN):
    """Arrange the four counts into a 2x2 confusion matrix.

    Layout:
        [[TP, FN],
         [FP, TN]]

    The first row holds the counts for actual positives and the second row
    the counts for actual negatives; the first column is "predicted positive".
    Best use: to inspect all four outcome counts at once instead of a single
    summary number.
    Not to use: as a single score for ranking models; derive a metric from it.

    Returns
    -------
    numpy.ndarray
        Array of shape (2, 2).
    """
    actual_positive_row = [TP, FN]
    actual_negative_row = [FP, TN]
    return np.array([actual_positive_row, actual_negative_row])

In [58]:
# False Positive Ratio
def False_Positive_Ratio(FP, TN):
    """Return the false positive ratio: FP / (FP + TN).

    Range: [0, 1] for non-negative counts.
    Fraction of the actually negative instances that were predicted as positive.

    The denominator is not guarded: with plain Python numbers FP + TN == 0
    raises ZeroDivisionError.
    """
    actual_negative = FP + TN
    return FP / actual_negative

In [60]:
# ROC Curve and AUC (Area Under the Curve)
import matplotlib.pyplot as plt
def ROC_curve(dataset=None):
    """
    Compute the points of the ROC curve.

    The ROC curve plots the true positive rate (TPR) against the false
    positive rate (FPR) at various threshold settings. An instance is treated
    as predicted positive when its score is >= the threshold.

    Parameters
    ----------
    dataset : 2-D array-like, optional
        Rows with the label in column 0 and the predicted score in column 1.
        When omitted, the module-level ``data`` array is used.
        NOTE(review): assumes column 0 holds the actual 0/1 label -- confirm
        against the CSV header.

    Returns
    -------
    tuple of numpy.ndarray
        ``(fpr, tpr, thresholds)``, all of the same length, with thresholds in
        descending order. ``tpr`` is nan when there are no positive labels and
        ``fpr`` is nan when there are no negative labels.

    Plotting is left to the caller, e.g. ``plt.plot(fpr, tpr)``.
    """
    if dataset is None:
        dataset = data  # fall back to the array loaded at the top of the notebook
    table = np.asarray(dataset, dtype=float)
    is_positive = table[:, 0] == 1
    scores = table[:, 1]

    # Get the unique thresholds from the predicted scores in descending order
    # (np.unique already returns them sorted ascending).
    thresholds = np.unique(scores)[::-1]

    n_positive = np.count_nonzero(is_positive)
    n_negative = is_positive.size - n_positive

    tpr = np.full(thresholds.shape, np.nan)
    fpr = np.full(thresholds.shape, np.nan)
    for k, threshold in enumerate(thresholds):
        flagged = scores >= threshold
        if n_positive:
            tpr[k] = np.count_nonzero(flagged & is_positive) / n_positive
        if n_negative:
            fpr[k] = np.count_nonzero(flagged & ~is_positive) / n_negative
    return fpr, tpr, thresholds
    
    

In [69]:
# Balanced Accuracy
def Balanced_accuracy(sensitivity=None, specificity=None):
    """
    Return the balanced accuracy: (sensitivity + specificity) / 2.

    Range: [0, 1]
    Best use: on imbalanced datasets, because both classes get equal weight.
    Not to use: as the only metric; although better balanced than plain
    accuracy, it can still mask poor performance on one class when the other
    is extremely high.

    Parameters
    ----------
    sensitivity : float, optional
        Recall of the positive class. When omitted it is computed from the
        module-level TP and FN counts at call time.
    specificity : float, optional
        Recall of the negative class. When omitted it is computed from the
        module-level TN and FP counts at call time.

    Returns
    -------
    float
        The mean of sensitivity and specificity.
    """
    # Resolve the fallbacks at call time. Computing them in the signature
    # freezes the values at definition time, so they go stale when the counts
    # change and a zero denominator would make the ``def`` itself fail.
    if sensitivity is None:
        sensitivity = Recall(TP, FN)
    if specificity is None:
        specificity = Specificity(TN, FP)
    return (sensitivity + specificity) / 2

In [100]:
# Matthews Correlation Coefficient (MCC)
def Matthews_Correlation_Coefficient(TP,TN,FP,FN):
    """
    Return the Matthews correlation coefficient.

    MCC = (TP*TN - FP*FN) / sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))
    Range: [-1, 1]
    Best use: considered one of the best measures for binary classification,
    especially on imbalanced datasets, because it takes all four cells of the
    confusion matrix into account.
    Not to use: when an intuitive reading is needed; it is harder to interpret
    than accuracy, precision, or recall.

    Returns
    -------
    float
        The coefficient, or 0.0 when the denominator is 0 (any row or column
        of the confusion matrix is empty), following the usual convention.
    """
    # Work in floats: the product of the four marginals easily exceeds the
    # 64-bit integer range (around 1e5 rows per cell), which np.sqrt cannot
    # handle for Python ints and which silently overflows for NumPy ints.
    tp, tn, fp, fn = float(TP), float(TN), float(FP), float(FN)

    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    if denominator == 0:
        # Avoid 0/0 -> nan (with a runtime warning).
        return 0.0

    return (tp * tn - fp * fn) / denominator