### Imports

In [36]:
import copy
import pymysql
import numpy as np
import pandas as pd

### MySQL Settings

In [2]:
conn = pymysql.connect(host='localhost', port=3306, user='root', password='root', db='homedb')

### Compute semantic bugs co-occurrence 

In [3]:
error_labels = 'E001 E002 E003 E004 E005 E006 E007 E008 E009 E010 E011 E012 E013 E014 E015 E016 E017 E018 E019 E020 E021 E022 E023 E024 E025'.split(' ')

In [14]:
get_bugs = 'SELECT * FROM bugs'
error_codes = np.empty((0, 25), int)

bugs = pd.read_sql(get_bugs, conn)

def get_error_code(error):
    if 'E00' in error:
        return int(error.split('E00')[1])
    return int(error.split('E0')[1])

def count_error_codes(row):
    global error_codes
    error_codes = np.append(error_codes, np.array([np.zeros(25, dtype=int)]), axis=0)
    row_index = len(error_codes) - 1
    errors = row.errors.replace('[', '').replace(']', '')
    errors = errors.split(',')
    for error in errors:
        error = error.replace(',', '').strip()
        col_index = get_error_code(error) - 1
        error_codes[row_index, col_index] = 1

bugs['errors'] = bugs.apply(count_error_codes, axis=1)

In [5]:
# compute co-occurrence matrix
cooccurrence_matrix = np.dot(error_codes.transpose(), error_codes)
# fill diagonal with zeros
np.fill_diagonal(cooccurrence_matrix, 0)
print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix))

# compute co-occurrence matrix in percentage
cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
with np.errstate(divide = 'ignore', invalid = 'ignore'):
    cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))


cooccurrence_matrix:
[[   48     4     0     0     0     0     0     0     0     1     0     2
      2     0     0     0     0     0     0     0     1     0     0     0
      0]
 [    4  6266    27     7     0     2     0    12     3    95     2   155
     68    38   135    52   257    24   632     2    20     3    22    57
      3]
 [    0    27   399     0     0     0     0     4     0     3     0    10
     11     1     0     0     3     0    40     0     2     0     2     0
      0]
 [    0     7     0   134     0     0     0     2     0     1     0     5
      9     2     0     1     2     0    14     0     1     0     0     1
      0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]
 [    0     2     0     0     0   111     0     0     0     0     0     8
      1     1     0     0     0     1    10     0     0     1     0     0
      0]
 [    0     0     0     0     0     

In [None]:
# compute co-occurrence matrix in percentage
cooccurrence_matrix_percentage = np.empty((0, 25), float)

row_index = 0
for row in cooccurrence_matrix:
    col_index = 0
    cooccurrence_matrix_percentage = np.append(cooccurrence_matrix_percentage, np.array([np.zeros(25, dtype=float)]), axis=0)
    for value in row:
        tmp_sum = cooccurrence_matrix[row_index].sum() + cooccurrence_matrix[col_index].sum()
        cooccurrence_matrix_percentage[row_index, col_index] = value / tmp_sum
        col_index += 1
    row_index += 1
print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))

In [44]:
tmp = np.empty((0, 3), int)
tmp = np.append(tmp, np.array([[1, 1, 0]]), axis=0)
tmp = np.append(tmp, np.array([[1, 1, 0]]), axis=0)
tmp = np.append(tmp, np.array([[1, 0, 1]]), axis=0)
tmp = np.append(tmp, np.array([[1, 0, 1]]), axis=0)
tmp = np.append(tmp, np.array([[1, 0, 1]]), axis=0)
tmp = np.append(tmp, np.array([[0, 1, 1]]), axis=0)
tmp = np.append(tmp, np.array([[0, 1, 1]]), axis=0)
tmp_co = np.dot(tmp.transpose(), tmp)
np.fill_diagonal(tmp_co, 0)
print('\ncooccurrence_matrix:\n{0}'.format(tmp_co))

_tmp = np.empty((0, 3), float)

row_index = 0
for row in tmp_co:
    col_index = 0
    _tmp = np.append(_tmp, np.array([np.zeros(3, dtype=float)]), axis=0)
    for value in row:
        tmp_sum = tmp_co[row_index].sum() + tmp_co[col_index].sum()
        print(f'value_before: {value}, tmp_sum: {tmp_sum}')
        _tmp[row_index, col_index] = value / tmp_sum
        print(f'value_after: {value / tmp_sum}, actual: {_tmp[row_index, col_index]}')
        col_index += 1
    row_index += 1
    
print('\ncooccurrence_matrix:\n{0}'.format(_tmp))


cooccurrence_matrix:
[[0 2 3]
 [2 0 2]
 [3 2 0]]
value_before: 0, tmp_sum: 10
value_after: 0.0, actual: 0.0
value_before: 2, tmp_sum: 9
value_after: 0.2222222222222222, actual: 0.2222222222222222
value_before: 3, tmp_sum: 10
value_after: 0.3, actual: 0.3
value_before: 2, tmp_sum: 9
value_after: 0.2222222222222222, actual: 0.2222222222222222
value_before: 0, tmp_sum: 8
value_after: 0.0, actual: 0.0
value_before: 2, tmp_sum: 9
value_after: 0.2222222222222222, actual: 0.2222222222222222
value_before: 3, tmp_sum: 10
value_after: 0.3, actual: 0.3
value_before: 2, tmp_sum: 9
value_after: 0.2222222222222222, actual: 0.2222222222222222
value_before: 0, tmp_sum: 10
value_after: 0.0, actual: 0.0

cooccurrence_matrix:
[[0.         0.22222222 0.3       ]
 [0.22222222 0.         0.22222222]
 [0.3        0.22222222 0.        ]]


### Code for generating heatmap for co-occurrence matrix (inspired from stackoverflow)

In [6]:
import numpy as np
import matplotlib.pyplot as plt


def show_values(pc, fmt="%.2f", **kw):
    '''
    Heatmap with text in each cell with matplotlib's pyplot
    Source: http://stackoverflow.com/a/25074150/395857 
    By HYRY
    '''
    pc.update_scalarmappable()
    ax = pc.axes
    for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
        x, y = p.vertices[:-2, :].mean(0)
        if np.all(color[:3] > 0.5):
            color = (0.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)

def cm2inch(*tupl):
    '''
    Specify figure size in centimeter in matplotlib
    Source: http://stackoverflow.com/a/22787457/395857
    By gns-ank
    '''
    inch = 2.54
    if type(tupl[0]) == tuple:
        return tuple(i/inch for i in tupl[0])
    else:
        return tuple(i/inch for i in tupl)

def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels):
    '''
    Inspired by:
    - http://stackoverflow.com/a/16124677/395857 
    - http://stackoverflow.com/a/25074150/395857
    '''

    # Plot it out
    fig, ax = plt.subplots()    
    c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    # set tick labels
    #ax.set_xticklabels(np.arange(1,AUC.shape[1]+1), minor=False)
    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    # set title and x/y labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)      

    # Remove last blank column
    plt.xlim( (0, AUC.shape[1]) )

    # Turn off all the ticks
    ax = plt.gca()    
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False

    # Add color bar
    plt.colorbar(c)

    # Add text in each cell 
    show_values(c)

    # Proper orientation (origin at the top left instead of bottom left)
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    # resize 
    fig = plt.gcf()
    fig.set_size_inches(cm2inch(40, 20))


# Using previously computed error_codes and error_labels data
label_headers = error_labels
label_data = error_codes
print('labels:\n{0}'.format(label_data))

# Compute cooccurrence matrix 
cooccurrence_matrix = np.dot(label_data.transpose(),label_data)
print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix)) 

# Compute cooccurrence matrix in percentage
# FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element
#      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804
cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
with np.errstate(divide='ignore', invalid='ignore'):
    cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))

# Add count in labels
label_header_with_count = [ '{0} ({1})'.format(label_header, cooccurrence_matrix_diagonal[label_number]) for label_number, label_header in enumerate(label_headers)]  
print('\nlabel_header_with_count: {0}'.format(label_header_with_count))

# Plotting
x_axis_size = cooccurrence_matrix_percentage.shape[0]
y_axis_size = cooccurrence_matrix_percentage.shape[1]
title = "Co-occurrence matrix\n"
xlabel= ''#"Labels"
ylabel= ''#"Labels"
xticklabels = label_headers
yticklabels = label_headers
heatmap(cooccurrence_matrix_percentage, title, xlabel, ylabel, xticklabels, yticklabels)
# plt.savefig('image_output.png', dpi=300, format='png', bbox_inches='tight') # use format='svg' or 'pdf' for vectorial pictures
plt.show()

labels:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

cooccurrence_matrix:
[[   48     4     0     0     0     0     0     0     0     1     0     2
      2     0     0     0     0     0     0     0     1     0     0     0
      0]
 [    4  6266    27     7     0     2     0    12     3    95     2   155
     68    38   135    52   257    24   632     2    20     3    22    57
      3]
 [    0    27   399     0     0     0     0     4     0     3     0    10
     11     1     0     0     3     0    40     0     2     0     2     0
      0]
 [    0     7     0   134     0     0     0     2     0     1     0     5
      9     2     0     1     2     0    14     0     1     0     0     1
      0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0]
 [    0     2     0     0     0   111     0     0     0     0    

  1.00000000e+00]]

label_header_with_count: ['E001 (48)', 'E002 (6266)', 'E003 (399)', 'E004 (134)', 'E005 (0)', 'E006 (111)', 'E007 (29)', 'E008 (316)', 'E009 (55)', 'E010 (2335)', 'E011 (34)', 'E012 (5772)', 'E013 (2979)', 'E014 (1623)', 'E015 (399)', 'E016 (102)', 'E017 (458)', 'E018 (206)', 'E019 (10367)', 'E020 (209)', 'E021 (619)', 'E022 (43)', 'E023 (331)', 'E024 (184)', 'E025 (145)']


The tick1On function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use Tick.tick1line.set_visible instead.
The tick2On function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use Tick.tick2line.set_visible instead.
The tick1On function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use Tick.tick1line.set_visible instead.
The tick2On function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use Tick.tick2line.set_visible instead.


<Figure size 1574.8x787.402 with 2 Axes>