# Data Mining Assignment 1
## Part 2: Data Analysis
### Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
df_train = pd.read_csv("./data/eclipse-metrics-packages-2.0.csv", sep=";")

# Only select columns with avg, max or sum in the name, corresponding to Table 1 of the paper
X_train = df_train[df_train.columns[df_train.columns.str.contains("avg|max|sum")]]

# Add the column with the number of pre-release bugs
X_train["pre"] = df_train["pre"]

X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train["pre"] = df_train["pre"]


Unnamed: 0,ACD_avg,ACD_max,ACD_sum,FOUT_avg,FOUT_max,FOUT_sum,MLOC_avg,MLOC_max,MLOC_sum,NBD_avg,...,PAR_avg,PAR_max,PAR_sum,TLOC_avg,TLOC_max,TLOC_sum,VG_avg,VG_max,VG_sum,pre
0,0.571429,4.0,4.0,5.980769,29.0,311.0,9.230769,55.0,480.0,1.826923,...,1.173077,4.0,61.0,112.0,277.0,784.0,2.576923,11.0,134.0,5
1,0.0,0.0,0.0,4.0,22.0,168.0,6.666667,32.0,280.0,1.357143,...,1.095238,4.0,46.0,140.0,386.0,420.0,2.333333,15.0,98.0,2
2,1.238095,14.0,26.0,4.321267,33.0,955.0,7.027149,83.0,1553.0,1.452489,...,0.800905,4.0,177.0,116.0,679.0,2436.0,2.045249,19.0,452.0,9
3,0.555556,4.0,5.0,3.752941,80.0,319.0,6.517647,118.0,554.0,1.564706,...,1.517647,7.0,129.0,99.444444,219.0,895.0,2.411765,32.0,205.0,2
4,2.0,5.0,18.0,6.552632,63.0,996.0,10.736842,75.0,1632.0,2.052632,...,0.796053,4.0,121.0,253.444444,724.0,2281.0,2.809211,21.0,427.0,6


In [3]:
Y_train = df_train["post"]
# Transform the labels to binary, 0 if no bug, 1 if bug
Y_train = np.where(Y_train > 0, 1, 0)

print(f"Number of bugs in the training set: {Y_train.sum()} ({Y_train.sum()/len(Y_train)*100:.0f}%)")

Number of bugs in the training set: 190 (50%)


In [4]:
df_test = pd.read_csv("./data/eclipse-metrics-packages-3.0.csv", sep=";")

X_test = df_test[df_test.columns[df_test.columns.str.contains("avg|max|sum")]]
X_test["pre"] = df_test["pre"]

Y_test = df_test["post"]
Y_test = np.where(Y_test > 0, 1, 0)

X_test.shape, Y_test.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["pre"] = df_test["pre"]


((661, 40), (661,))

### Logistic Regression

The metrics as reported by Table 5 of the paper:

| Logistic Regression model | Training | Testing | Defects | Precision | Recall | Accuracy |
|---------------------------|----------|---------|---------|-----------|--------|----------|
| Original                  | 2.0      | 3.0     | 0.474   | 0.786     | 0.588  | 0.729    |


In [5]:
# Logistic Regression model, trained on 2.0, tested on 3.0
model = LogisticRegression()
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

# Calculate Precision, Recall and Accuracy
defects = Y_test.sum() / len(Y_test)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)

print(f"Defects: {defects:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, Accuracy: {accuracy:.3f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Defects: 0.474, Precision: 0.783, Recall: 0.588, Accuracy: 0.728


### Data analysis

In [6]:
print(f"X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}")
print(f"Number of bugs in the training set: {Y_train.sum()} ({Y_train.sum()/len(Y_train)*100:.0f}%)")
print(f"Number of bugs in the test set: {Y_test.sum()} ({Y_test.sum()/len(Y_test)*100:.0f}%)")

X_train.shape: (377, 40), X_test.shape: (661, 40)
Number of bugs in the training set: 190 (50%)
Number of bugs in the test set: 313 (47%)


In [7]:
# Concat min, avg, max of pre-release and post-release bugs
desc_train = df_train[["pre", "post"]].describe()[["pre", "post"]].T

# Only keep the columns avg, max or sum
desc_train = desc_train[desc_train.columns[desc_train.columns.str.contains("mean|std|min|max")]]

desc_train.round(2)

Unnamed: 0,mean,std,min,max
pre,11.4,21.79,0.0,179.0
post,2.43,6.24,0.0,88.0


In [8]:
# Concat min, avg, max of pre-release and post-release bugs
desc_test = df_test[["pre", "post"]].describe()[["pre", "post"]].T

# Only keep the columns avg, max or sum
desc_test = desc_test[desc_test.columns[desc_test.columns.str.contains("mean|std|min|max")]]

desc_test.round(2)

Unnamed: 0,mean,std,min,max
pre,7.03,15.75,0.0,220.0
post,2.32,5.54,0.0,65.0


In [8]:
from scipy import stats
data_tree = [0.682, 0.685, 0.685, 0.685, 0.685, 0.685, 0.685, 0.682, 0.685, 0.685]
data_bagging = [0.770, 0.772, 0.767, 0.790, 0.785, 0.787, 0.781, 0.772, 0.784, 0.770]
data_rf = [0.767, 0.759, 0.756, 0.747, 0.755, 0.753, 0.756, 0.759, 0.747, 0.756]


result = stats.kruskal(data_tree, data_bagging, data_rf)
print(f"p-value: {result.pvalue:.6f}; statistic: {result.statistic:.3f}")

# Print average accuracy of each model
print(f"Decision Tree average accuracy: {np.mean(data_tree):.3f}; std: {np.std(data_tree):.3f}")
print(f"Bagging average accuracy: {np.mean(data_bagging):.3f}; std: {np.std(data_bagging):.3f}")
print(f"Random Forest average accuracy: {np.mean(data_rf):.3f}; std: {np.std(data_rf):.3f}")

p-value: 0.000002; statistic: 26.227
Decision Tree average accuracy: 0.684; std: 0.001
Bagging average accuracy: 0.778; std: 0.008
Random Forest average accuracy: 0.755; std: 0.006
