# Software Defect Detection Notebook

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Use the same resolution for inline rendering and for figures written to disk.
plt.rcParams.update({
    'figure.dpi': 150,
    'savefig.dpi': 150,
})

# Locations of the two CSV files read by this notebook (relative to the
# notebook's working directory).
BASELINE_METRICS_DATASET = 'GHPR_dataset-master/baseline.csv'
REPO_INFO_DATASET = 'GHPR_dataset-master/ghprdata/ghprdata.csv'

# Column names applied, in order, to the repository-info CSV when it is loaded.
REPO_INFO_TITLES = [
    # PROJECT_* columns
    'PROJECT_NAME',
    'PROJECT_OWNER',
    'PROJECT_DESCRIPTION',
    'PROJECT_LABEL',
    'PROJECT_LANGUAGE',
    # commit / diff columns
    'SHA_FIXED',
    'SHA_BUG',
    'DIFF_CODE',
    'COMMIT_DESCRIPTION',
    'COMMIT_TIME',
    # file content and path columns
    'OLD_CONTENT',
    'NEW_CONTENT',
    'OLD_PATH',
    'NEW_PATH',
    # pull-request columns
    'PR_TITLE',
    'PR_DESCRIPTION',
]

In [17]:
# Column names are supplied explicitly via REPO_INFO_TITLES; header=None spells
# out that the first line of the file is treated as data, which is what pandas
# already does whenever `names` is passed.
repo_dataframe = pd.read_csv(
    REPO_INFO_DATASET,
    header=None,
    names=REPO_INFO_TITLES,
)

In [18]:
# Summarise the frame: row count plus per-column non-null counts and dtypes,
# which shows which columns contain missing values.
repo_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3026 entries, 0 to 3025
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   PROJECT_NAME         3026 non-null   object
 1   PROJECT_OWNER        3026 non-null   object
 2   PROJECT_DESCRIPTION  2947 non-null   object
 3   PROJECT_LABEL        2969 non-null   object
 4   PROJECT_LANGUAGE     3026 non-null   object
 5   SHA_FIXED            3026 non-null   object
 6   SHA_BUG              3026 non-null   object
 7   DIFF_CODE            3026 non-null   object
 8   COMMIT_DESCRIPTION   2298 non-null   object
 9   COMMIT_TIME          3026 non-null   int64 
 10  OLD_CONTENT          3026 non-null   object
 11  NEW_CONTENT          3026 non-null   object
 12  OLD_PATH             3026 non-null   object
 13  NEW_PATH             3026 non-null   object
 14  PR_TITLE             3022 non-null   object
 15  PR_DESCRIPTION       2298 non-null   object
dtypes: int

In [19]:
# Preview the first 10 rows to check that the supplied column names line up
# with the values underneath them.
repo_dataframe.head(10)

Unnamed: 0,PROJECT_NAME,PROJECT_OWNER,PROJECT_DESCRIPTION,PROJECT_LABEL,PROJECT_LANGUAGE,SHA_FIXED,SHA_BUG,DIFF_CODE,COMMIT_DESCRIPTION,COMMIT_TIME,OLD_CONTENT,NEW_CONTENT,OLD_PATH,NEW_PATH,PR_TITLE,PR_DESCRIPTION
0,Gadgetbridge,Freeyourgadget,Github mirror of Gadgetbridge - A free and clo...,"activity post processing,activity/health,andro...",Java,000f1ab4780fc9460975791c52597f7c04e15be7,ece0f094eb56219ae5188dcd37b90590a9f4029d,diff --git a/app/src/main/java/nodomain/freeyo...,Fix duplicated DeviceType key introduced by th...,20180902,"/* Copyright (C) 2015-2018 Andreas Shimokawa,...","/* Copyright (C) 2015-2018 Andreas Shimokawa,...",app/src/main/java/nodomain/freeyourgadget/gadg...,app/src/main/java/nodomain/freeyourgadget/gadg...,Roidmi: fix duplicated DeviceType key,Fix duplicated DeviceType key introduced by th...
1,MaterialDateTimePicker,wdullaer,Pick a date or time on Android in style,"bug,duplicate,enhancement,help wanted,invalid,...",Java,0024dbdd6ba3cc7797cc0b1ae537dcdc488c4c27,288f0a9c4b01ffcedf9fec41dd7c0373ee55f277,diff --git a/library/src/main/java/com/wdullae...,No repro steps known but observed in crash rep...,20190209,/*\n * Copyright (C) 2013 The Android Open Sou...,/*\n * Copyright (C) 2013 The Android Open Sou...,library/src/main/java/com/wdullaer/materialdat...,library/src/main/java/com/wdullaer/materialdat...,NPE fix: DayPickerView accessibilityAnnouncePa...,No repro steps known but observed in crash rep...
2,processing,processing,Source code for the Processing Core and Develo...,"android,arm,arm64,book,cantfix,core,critical,d...",Java,005681edd7b222a51d6bdf64cdcba489cd617d1d,1ebf79592c9bedfca838a6e58463470dc1445b84,diff --git a/core/src/processing/core/PApplet....,A couple of Tools in the Base menu are not ren...,20140403,/* -*- mode: java; c-basic-offset: 2; indent-t...,/* -*- mode: java; c-basic-offset: 2; indent-t...,core/src/processing/core/PApplet.java,core/src/processing/core/PApplet.java,"Fix NPE in PDE, affecting color picker and oth...",A couple of Tools in the Base menu are not ren...
3,pentaho-kettle,pentaho,Pentaho Data Integration ( ETL ) a.k.a Kettle,"bug,duplicate,enhancement,invalid,question,won...",Java,005f52621571d1d0d7140e28a1ad5f629bad3bb9,982fbd18142bf5345d50a58f928b8bc7243dc8be,diff --git a/engine/src/org/pentaho/di/trans/s...,Master PR: https://github.com/pentaho/pentaho-...,20170425,/*********************************************...,/*********************************************...,engine/src/org/pentaho/di/trans/steps/textfile...,engine/src/org/pentaho/di/trans/steps/textfile...,[BACKLOG-16118] Fixed issue with field lengths...,Master PR: https://github.com/pentaho/pentaho-...
4,frontend-maven-plugin,eirslett,"""Maven-node-grunt-gulp-npm-node-plugin to end ...","bug,duplicate,enhancement,invalid,question,won...",Java,0081f8f594e3b67879370d712d35a809dd0250b4,b45e73d6cca812f4874013c4eba00f873ca43d36,diff --git a/frontend-plugin-core/src/main/jav...,because File.rename does not support overwrit...,20190312,package com.github.eirslett.maven.plugins.fron...,package com.github.eirslett.maven.plugins.fron...,frontend-plugin-core/src/main/java/com/github/...,frontend-plugin-core/src/main/java/com/github/...,Fix #670: Use Files.move instead of File.renam...,because File.rename does not support overwrit...
5,cas,apereo,Apereo CAS - Enterprise Single Sign On for all...,"AWS Cloud Directory,AWS DynamoDb,Acceptable Us...",Java,00c16dd03ad44c07088def792e7dd4583187caa3,d0024e28889e597c03a059296f35a4360c5b11e6,diff --git a/support/cas-server-support-geoloc...,,20180727,package org.apereo.cas.support.geo.config;\n\n...,package org.apereo.cas.support.geo.config;\n\n...,support/cas-server-support-geolocation-maxmind...,support/cas-server-support-geolocation-maxmind...,Fix possible NPE in configuration when no db p...,
6,qksms,moezbhatti,The most beautiful SMS messenger for Android,"Bug,Discussion,Easy,Enhancement,Feature,Refact...",Java,00c1e07f0575b518af4ce8e383dff92c6cf04582,fb27d745e638fb2c9e5ec930d68bbffee73b27a8,diff --git a/QKSMS/src/main/java/com/mariussof...,The EndlessJabber wakeful service can in some ...,20151006,package com.mariussoft.endlessjabber.sdk;\n\ni...,package com.mariussoft.endlessjabber.sdk;\n\ni...,QKSMS/src/main/java/com/mariussoft/endlessjabb...,QKSMS/src/main/java/com/mariussoft/endlessjabb...,fix potential wakelock,The EndlessJabber wakeful service can in some ...
7,keycloak,keycloak,Open Source Identity and Access Management For...,"Discuss,Hold,Incomplete,Missing Docs,Missing J...",Java,00f6841dc7f8afa785b92bbca6444326d515357e,fbf62b2a6314e6df087d6dafd1446d411ee2ba1f,diff --git a/forms/common-themes/src/main/java...,,20150901,package org.keycloak.theme;\n\nimport org.keyc...,package org.keycloak.theme;\n\nimport org.keyc...,forms/common-themes/src/main/java/org/keycloak...,forms/common-themes/src/main/java/org/keycloak...,Fix loading resources from theme,
8,tutorials,eugenp,"The ""REST With Spring"" Course:","bug,duplicate,enhancement,in progress,invalid,...",Java,0166e8dbecc09b259dc939718bf6da36bd1085f8,63aa10d36f4d920eb6f00f744951674e14e84f69,diff --git a/spring-boot-security/src/main/jav...,,20190406,package com.baeldung.springbootsecurity.oauth2...,package com.baeldung.springbootsecurity.oauth2...,spring-boot-security/src/main/java/com/baeldun...,spring-boot-security/src/main/java/com/baeldun...,fix bean config,
9,android-beacon-library,AltBeacon,Allows Android apps to interact with BLE beacons,"3.0 candidate,bug,duplicate,enhancement,help w...",Java,016bb44b880f16e7a3f08e9d378bc47985850405,4ab3474741c4e51db36bc32639279a4c65ee7d2e,diff --git a/src/main/java/org/altbeacon/beaco...,Using the prior scan end time to calculate whe...,20170410,package org.altbeacon.beacon.service.scanner;\...,package org.altbeacon.beacon.service.scanner;\...,src/main/java/org/altbeacon/beacon/service/sca...,src/main/java/org/altbeacon/beacon/service/sca...,Fix intermittent scans,Using the prior scan end time to calculate whe...


## Column Exploration

### Repository Metadata

In [20]:
# Number of rows per project name, most frequent first.
repo_dataframe['PROJECT_NAME'].value_counts()


PROJECT_NAME
libgdx                 161
BroadleafCommerce      113
Terasology             108
Mycat-Server            76
MinecraftForge          72
                      ... 
SlidingMenu              1
PageIndicatorView        1
Android-PickerView       1
ffmpeg-android-java      1
jianshi                  1
Name: count, Length: 301, dtype: int64

In [21]:
# Number of rows per project owner, most frequent first.
repo_dataframe['PROJECT_OWNER'].value_counts()


PROJECT_OWNER
libgdx               161
apache               132
BroadleafCommerce    113
MovingBlocks         108
MyCATApache           76
                    ... 
grantland              1
qii                    1
hongyangAndroid        1
yangfuhai              1
wingjay                1
Name: count, Length: 259, dtype: int64

In [22]:
# Number of rows per project language, most frequent first.
repo_dataframe['PROJECT_LANGUAGE'].value_counts()

PROJECT_LANGUAGE
Java      3012
Kotlin      14
Name: count, dtype: int64

In [23]:
# Step 1: Lower-case each comma-separated label string, then split it into a list
lowercase_labels = repo_dataframe['PROJECT_LABEL'].str.lower()
split_labels = lowercase_labels.str.split(',')

# Step 2: Give every list element its own row
exploded_labels = split_labels.explode()

# Step 3: Tally how often each distinct label occurs
exploded_labels.value_counts()


PROJECT_LABEL
bug             2305
enhancement     2120
duplicate       1929
wontfix         1861
question        1737
                ... 
v/bug              1
v/cancelled        1
v/fixed            1
v/incomplete       1
abi change         1
Name: count, Length: 2753, dtype: int64

In [24]:
# Row-wise flag: True where the file path differs between the OLD_PATH and
# NEW_PATH columns.
old_paths = repo_dataframe['OLD_PATH']
new_paths = repo_dataframe['NEW_PATH']
has_file_moved = old_paths.ne(new_paths)

# How many rows fall into each case.
has_file_moved.value_counts()

False    3026
Name: count, dtype: int64

### Static Metric Data

In [25]:
# Load the baseline static-metrics CSV. No `names` are passed, so pandas takes
# the column names from the file's first line.
metrics_dataframe = pd.read_csv(BASELINE_METRICS_DATASET)

In [26]:
# Summarise the metrics frame: row count plus per-column non-null counts and dtypes.
metrics_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6052 entries, 0 to 6051
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   SHA                   6052 non-null   object
 1   cbo                   6052 non-null   int64 
 2   wmc                   6052 non-null   int64 
 3   dit                   6052 non-null   int64 
 4   rfc                   6052 non-null   int64 
 5   lcom                  6052 non-null   int64 
 6   totalMethods          6052 non-null   int64 
 7   totalFields           6052 non-null   int64 
 8   nosi                  6052 non-null   int64 
 9   loc                   6052 non-null   int64 
 10  returnQty             6052 non-null   int64 
 11  loopQty               6052 non-null   int64 
 12  comparisonsQty        6052 non-null   int64 
 13  tryCatchQty           6052 non-null   int64 
 14  parenthesizedExpsQty  6052 non-null   int64 
 15  stringLiteralsQty     6052 non-null   

In [27]:
# Preview the first 10 rows of the metrics frame under its original column names.
metrics_dataframe.head(10)

Unnamed: 0,SHA,cbo,wmc,dit,rfc,lcom,totalMethods,totalFields,nosi,loc,...,tryCatchQty,parenthesizedExpsQty,stringLiteralsQty,numbersQty,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocks,uniqueWordsQty,defect
0,7a955fd6c7de2bd912be544dcfe77f9173a7aa600,5,60,2,55,189,27,5,30,247,...,4,2,47,9,27,5,17,3,191,0
1,000f1ab4780fc9460975791c52597f7c04e15be70,3,10,1,1,9,7,4,1,38,...,0,0,0,22,4,0,4,2,69,0
2,000f1ab4780fc9460975791c52597f7c04e15be71,3,10,1,1,9,7,4,0,38,...,0,0,0,22,4,0,4,2,69,1
3,0024dbdd6ba3cc7797cc0b1ae537dcdc488c4c270,20,59,3,63,189,24,9,4,262,...,0,6,6,14,45,8,41,4,222,0
4,0024dbdd6ba3cc7797cc0b1ae537dcdc488c4c271,21,58,2,61,189,24,9,0,260,...,0,6,6,14,45,8,41,4,222,1
5,005681edd7b222a51d6bdf64cdcba489cd617d1d0,42,1656,28,796,199352,758,113,378,8719,...,76,218,279,726,943,390,603,22,2985,0
6,005681edd7b222a51d6bdf64cdcba489cd617d1d1,141,1655,16,682,199352,758,113,0,8717,...,76,218,279,726,943,390,603,22,2985,1
7,005f52621571d1d0d7140e28a1ad5f629bad3bb90,24,223,2,153,6,25,5,4,760,...,19,19,55,59,146,37,71,6,219,0
8,005f52621571d1d0d7140e28a1ad5f629bad3bb91,25,222,1,142,6,25,5,0,759,...,19,19,55,58,145,37,71,6,219,1
9,0081f8f594e3b67879370d712d35a809dd0250b40,10,54,1,49,0,18,8,5,251,...,6,0,64,2,48,16,47,5,120,0


In [29]:
# Maps the abbreviated metric column names in the baseline CSV to more
# descriptive ones. Columns not listed here keep their original names.
# NOTE(review): 'coupingBtObjects' looks like a typo for 'couplingBtObjects';
# it is left unchanged here because later cells may index by this exact name.
COLUMN_RENAME_MAP = {
    # presumably Coupling Between Objects - TODO confirm against the metric tool's docs
    'cbo': 'coupingBtObjects',
    # Weight Method Class
    'wmc': 'branchInstrPerClass',
    # Depth Inheritance Tree
    'dit': 'maxInheritanceDepth',
    # Response for a Class
    'rfc': 'uniqueMethodInvCount',
    # Lack of Cohesion of Methods
    'lcom': 'cohesionMetric',
    'loc': 'linesOfCode',
    'nosi': 'staticInvocationCount',
}

# rename() returns a new frame; rebinding the name keeps the rest of the
# notebook working with the descriptive column names.
metrics_dataframe = metrics_dataframe.rename(COLUMN_RENAME_MAP, axis='columns')

# Display the resulting column index to confirm the rename.
metrics_dataframe.columns

Index(['SHA', 'coupingBtObjects', 'branchInstrPerClass', 'maxInheritanceDepth',
       'uniqueMethodInvCount', 'cohesionMetric', 'totalMethods', 'totalFields',
       'staticInvocationCount', 'linesOfCode', 'returnQty', 'loopQty',
       'comparisonsQty', 'tryCatchQty', 'parenthesizedExpsQty',
       'stringLiteralsQty', 'numbersQty', 'assignmentsQty',
       'mathOperationsQty', 'variablesQty', 'maxNestedBlocks',
       'uniqueWordsQty', 'defect'],
      dtype='object')

In [30]:
# Preview the first 10 rows again, now with the renamed columns.
metrics_dataframe.head(10)

Unnamed: 0,SHA,coupingBtObjects,branchInstrPerClass,maxInheritanceDepth,uniqueMethodInvCount,cohesionMetric,totalMethods,totalFields,staticInvocationCount,linesOfCode,...,tryCatchQty,parenthesizedExpsQty,stringLiteralsQty,numbersQty,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocks,uniqueWordsQty,defect
0,7a955fd6c7de2bd912be544dcfe77f9173a7aa600,5,60,2,55,189,27,5,30,247,...,4,2,47,9,27,5,17,3,191,0
1,000f1ab4780fc9460975791c52597f7c04e15be70,3,10,1,1,9,7,4,1,38,...,0,0,0,22,4,0,4,2,69,0
2,000f1ab4780fc9460975791c52597f7c04e15be71,3,10,1,1,9,7,4,0,38,...,0,0,0,22,4,0,4,2,69,1
3,0024dbdd6ba3cc7797cc0b1ae537dcdc488c4c270,20,59,3,63,189,24,9,4,262,...,0,6,6,14,45,8,41,4,222,0
4,0024dbdd6ba3cc7797cc0b1ae537dcdc488c4c271,21,58,2,61,189,24,9,0,260,...,0,6,6,14,45,8,41,4,222,1
5,005681edd7b222a51d6bdf64cdcba489cd617d1d0,42,1656,28,796,199352,758,113,378,8719,...,76,218,279,726,943,390,603,22,2985,0
6,005681edd7b222a51d6bdf64cdcba489cd617d1d1,141,1655,16,682,199352,758,113,0,8717,...,76,218,279,726,943,390,603,22,2985,1
7,005f52621571d1d0d7140e28a1ad5f629bad3bb90,24,223,2,153,6,25,5,4,760,...,19,19,55,59,146,37,71,6,219,0
8,005f52621571d1d0d7140e28a1ad5f629bad3bb91,25,222,1,142,6,25,5,0,759,...,19,19,55,58,145,37,71,6,219,1
9,0081f8f594e3b67879370d712d35a809dd0250b40,10,54,1,49,0,18,8,5,251,...,6,0,64,2,48,16,47,5,120,0


In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
metrics_dataframe.describe()

Unnamed: 0,coupingBtObjects,branchInstrPerClass,maxInheritanceDepth,uniqueMethodInvCount,cohesionMetric,totalMethods,totalFields,staticInvocationCount,linesOfCode,returnQty,...,tryCatchQty,parenthesizedExpsQty,stringLiteralsQty,numbersQty,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocks,uniqueWordsQty,defect
count,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,...,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0,6052.0
mean,27.493721,86.177958,4.599141,68.954395,1242.030238,33.497852,14.049405,5.945142,442.07419,25.410112,...,3.218275,5.267845,36.727859,31.120621,65.899042,15.794118,49.70043,5.041143,219.283212,0.5
std,33.214827,136.077153,9.288447,85.771623,9449.009245,53.556999,26.051029,25.263058,681.566084,47.05837,...,8.569417,13.956639,118.995663,85.715467,101.501174,40.011677,72.531059,5.920991,267.134701,0.500041
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
25%,9.0,18.0,1.0,18.0,4.0,8.0,3.0,0.0,93.0,4.0,...,0.0,0.0,3.0,2.0,13.0,1.0,10.0,2.0,74.0,0.0
50%,18.0,45.0,2.0,44.0,43.0,17.0,7.0,0.0,230.0,12.0,...,1.0,1.0,12.0,9.0,34.0,4.0,27.0,4.0,139.0,0.5
75%,34.0,100.0,4.0,89.0,267.0,39.0,18.0,3.0,521.25,28.0,...,3.0,5.0,34.0,29.0,80.0,14.0,62.0,6.0,269.0,1.0
max,419.0,1714.0,285.0,1203.0,199855.0,758.0,903.0,475.0,8832.0,579.0,...,125.0,218.0,4566.0,2856.0,2016.0,501.0,1050.0,77.0,3345.0,1.0


In [33]:
# Count rows per value of `defect` to check how balanced the two classes are.
metrics_dataframe['defect'].value_counts()

defect
0    3026
1    3026
Name: count, dtype: int64

In [34]:
# Frequency of each value of maxInheritanceDepth (the renamed `dit` column),
# most frequent first.
metrics_dataframe['maxInheritanceDepth'].value_counts()

maxInheritanceDepth
1      2245
2      1386
3       557
4       371
5       240
       ... 
265       1
88        1
44        1
69        1
51        1
Name: count, Length: 68, dtype: int64