# Imports

In [14]:
import numpy as np
import pandas as pd
from scipy.io import arff

## Reading data

In [15]:
# Change filename here
filename = 'CM1.arff.txt'

data = arff.loadarff(filename)
loaddata = pd.DataFrame(data[0])
loaddata.head()

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,Defective
0,2.0,3.0,0.0,0.0,8.0,4.0,2.0,0.22,2.0,2.0,...,6.0,0.22,5.0,10.0,4.0,7.0,9.0,47.06,9.0,b'N'
1,3.0,3.0,0.0,2.0,2.0,4.0,2.0,0.15,2.0,2.0,...,5.0,0.11,10.0,22.0,5.0,12.0,19.0,26.67,13.0,b'N'
2,38.0,35.0,4.0,5.0,70.0,58.0,18.0,0.17,24.0,2.42,...,51.0,0.08,150.0,222.0,58.0,32.0,218.0,41.9,109.0,b'N'
3,1.0,7.0,5.0,0.0,12.0,12.0,4.0,0.1,6.0,2.0,...,18.0,0.06,50.0,79.0,36.0,19.0,68.0,22.64,41.0,b'Y'
4,9.0,15.0,4.0,14.0,22.0,28.0,8.0,0.2,14.0,2.0,...,24.0,0.11,29.0,64.0,19.0,18.0,73.0,57.14,41.0,b'N'


## Acquire X and Y values

In [16]:
software_metrics = np.array(loaddata[['LOC_BLANK','BRANCH_COUNT','CALL_PAIRS','LOC_CODE_AND_COMMENT']])
labels = np.array(loaddata['Defective'])

## Train test Split

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(software_metrics, labels, test_size = 0.1)
y_train = y_train.astype(str)
y_test = y_test.astype(str)

## Selecting a Model (Logistic Regression)

In [31]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
predictions = logreg.predict(X_test)
predictions

array(['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N'], dtype='<U1')

In [33]:
from sklearn.metrics import confusion_matrix

pd.DataFrame(confusion_matrix(y_test, predictions), columns=['Predicted Non-Defective', "Predicted Defective"], index=['Actual Non-Defective', 'Actual Defective'])

Unnamed: 0,Predicted Non-Defective,Predicted Defective
Actual Non-Defective,28,0
Actual Defective,4,1


In [34]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f'True Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')

True Positives: 1
False Positives: 0
True Negatives: 28
False Negatives: 4


## Calculating Precision (True Positive)/(True Positive + False Positive)

In [35]:
precision = (tp*100)/(tp+fp)
print(f'Precision: {precision}')

Precision: 100.0


## Calculating Recall (True Positve)/(True Positive + False Negative)

In [36]:
recall = (tp*100)/(tp+fn)
print(f'Recall: {recall}')

Recall: 20.0


## Calculating F1-score

In [37]:
f1_score = (2*precision*recall)/(precision+recall)
print(f'F1-score: {f1_score}')

F1-score: 33.333333333333336
