In [7]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import json

# Histogram using different $\alpha$

In [8]:
win00DF = pd.read_csv('data/similarityWin00.txt', sep=';', header=None)
win00DF.columns = ["Ci", "Cj", "Sim"]
win025DF = pd.read_csv('data/similarityWin025.txt', sep=';', header=None)
win025DF.columns = ["Ci", "Cj", "Sim"]
win050DF = pd.read_csv('data/similarityWin050.txt', sep=';', header=None)
win050DF.columns = ["Ci", "Cj", "Sim"]
win075DF = pd.read_csv('data/similarityWin075.txt', sep=';', header=None)
win075DF.columns = ["Ci", "Cj", "Sim"]
win10DF = pd.read_csv('data/similarityWin10.txt', sep=';', header=None)
win10DF.columns = ["Ci", "Cj", "Sim"]

## Histogram

In [9]:
hist1, bins1 = np.histogram(win025DF.Sim,bins=50, density=True)
hist2, bins2 = np.histogram(win050DF.Sim,bins=50, density=True)
hist3, bins3 = np.histogram(win075DF.Sim,bins=50, density=True)

trace1 = go.Scatter(
    x = bins1,
    y = hist1,
    name = 'α=0.25'
)

trace2 = go.Scatter(
    x = bins2,
    y = hist2,
    name = 'α=0.50'
)

trace3 = go.Scatter(
    x = bins3,
    y = hist3,
    name = 'α=0.75'
)





data = [trace1, trace2, trace3]
layout = go.Layout(
    xaxis = dict(title='Similarity'),
    yaxis = dict(title='Number of couple of cases')
)

histSimValues = go.Figure(data = data, layout=layout)


# histSimValues = ff.create_distplot(hist_data, labels, show_rug=False, show_hist=False)
iplot(histSimValues, filename='Histogram-similarity')

## Case base coverage analysis

In this step, we will analyze the case base coverage. To do that, we take account that the recommender algorithm will have an adaptation step. We will use the cases with a positive Elo rating.

In [30]:
simDF = pd.read_csv("data/similarityMatrixWin.txt", sep=";", header=None)
simDF.columns = ["Ci", "Cj", "Sim"]
simDF.head()

Unnamed: 0,Ci,Cj,Sim
0,C0-1,C0-1,1.0
1,C0-1,C0-2,1.0
2,C0-1,C25-6,0.471978
3,C0-1,C1-2,0.440476
4,C0-1,C1-4,0.37381


To calculate the case base coverage, we can assume that a new problem would be solved if there are cases with a minimum similarity threshold. It can be formulated as follows:

\begin{eqnarray}
\label{eq:coverageplus}
coverage'(G) &=&  \frac{1}{|G|} \sum_{c \in G}resolvability(c,G) \\
&where& \notag \\
resolvability(c,G) &=& \left\{ 
   \begin{array}{lcc}
     1 & if & highestSim(c,G) >= \theta \notag \\
     0 & if & highestSim(c,G) < \theta  
   \end{array} \right.
\end{eqnarray}

In [31]:
filterSimDF = simDF[simDF.Ci != simDF.Cj]

sim = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

maxSimilarity = filterSimDF.groupby(by='Ci').max()
maxSimilarity.head()

Unnamed: 0_level_0,Cj,Sim
Ci,Unnamed: 1_level_1,Unnamed: 2_level_1
C0-1,C9-9,1.0
C0-2,C9-9,1.0
C1-0,C9-9,0.748571
C1-1,C9-9,0.685714
C1-2,C9-9,0.778571


In [32]:
numCoverage = []
for l in np.arange(0.5, 1.0, 0.05):
    numCoverage.append(len(maxSimilarity[maxSimilarity.Sim >= l]) / 116 * 100)
    
trace = go.Scatter(
    x = np.arange(0.5, 1.0, 0.05),
    y = numCoverage,
    mode='lines+markers'
)

data = [trace]

layout = go.Layout(xaxis=dict(title='Minimum Similarity'), yaxis=dict(title='Coverage (%)'))
figure = go.Figure(data=data, layout=layout)

iplot(figure, filename='new_coverage')

## Modifying the number of cases in the case base

We recalculated the coverage again but now we considered different sizes for the case base. We start to measure the coverage with the original $CB^0$ and repeated it but removing one-by-one the cases with worst Elo rating.

In [33]:
simDF = pd.read_csv("data/similarityMatrixAllCases.csv", sep=";", header=None)
simDF.columns = ["Ci", "Cj", "Sim"]

simDF = simDF[simDF.Ci != simDF.Cj]

cases = json.load(open('data/cases.json'))
casesEloDF = pd.DataFrame(cases)

idsByElo = casesEloDF.sort_values(by=['elo_rating']).id.values

data = []
for l in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    coverage = []
    numCases = []
    
    for i in range(0, len(idsByElo)):
        numCases.append(len(idsByElo[i:]))
        filterCBR = simDF[simDF.Ci.isin(idsByElo[i:]) & simDF.Cj.isin(idsByElo[i:])]

        maxSimilarity = filterCBR.groupby(by='Ci').max()
        coverage.append(len(maxSimilarity[maxSimilarity.Sim >= l]) / len(idsByElo[i:]) * 100)
        
    trace = go.Scatter(
        x = numCases,
        y = coverage,
        mode='spline',
        name='θ = ' + str(l)
    )
    
    data.append(trace)

In [34]:
layout = go.Layout(xaxis=dict(title='Number of Cases'), yaxis=dict(title='Coverage (%)'))
figure = go.Figure(data=data, layout=layout)

iplot(figure, filename='new_coverage_2')

## Compare groups coverage

In [35]:
simDF = pd.read_csv("data/similarityMatrixWin.txt", sep=";", header=None)
simDF.columns = ["Ci", "Cj", "Sim"]

cases = simDF.Ci.unique()
densityCases = []
for c in cases:
    group_name = c[:c.find('-') + 1]
    densityCases.append([c, c[:c.find('-')]])
    
coverageDF = pd.DataFrame(densityCases, columns=['Case', 'Group'])
coverageDF.head()

Unnamed: 0,Case,Group
0,C0-1,C0
1,C0-2,C0
2,C1-0,C1
3,C1-1,C1
4,C1-2,C1


In [36]:
maxSimilarity = filterSimDF.groupby(by='Ci').max()
maxSimilarity.values[0]

array(['C9-9', 1.0], dtype=object)

In [37]:
group_i_array = []
group_j_array = []

for index, row in simDF.iterrows():
    group_i_array.append(row.Ci[:row.Ci.find('-')])
    group_j_array.append(row.Cj[:row.Cj.find('-')])
    
    


In [38]:
simDF = simDF.assign(group_i = group_i_array)
simDF = simDF.assign(group_j = group_j_array)
simGroupsDF = simDF[simDF.group_i == simDF.group_j]
simGroupsDF = simGroupsDF[simGroupsDF.Ci != simGroupsDF.Cj]
maxSimByGroup = simGroupsDF.groupby(by='Ci').max()

In [39]:
sim05 = []

for index, row in maxSimByGroup.iterrows():
    if row.Sim > 0.5:
        sim05.append(1)
    else:
        sim05.append(0)
        
maxSimByGroup = maxSimByGroup.assign(sim05=sim05)
maxSimByGroup.head()

Unnamed: 0_level_0,Cj,Sim,group_i,group_j,sim05
Ci,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C0-1,C0-2,1.0,C0,C0,1
C0-2,C0-1,1.0,C0,C0,1
C1-0,C1-4,0.748571,C1,C1,1
C1-1,C1-4,0.685714,C1,C1,1
C1-2,C1-4,0.778571,C1,C1,1


In [40]:
sim075 = []

for index, row in maxSimByGroup.iterrows():
    if row.Sim > 0.6:
        sim075.append(1)
    else:
        sim075.append(0)
        
maxSimByGroup = maxSimByGroup.assign(sim075=sim075)
maxSimByGroup.head()

Unnamed: 0_level_0,Cj,Sim,group_i,group_j,sim05,sim075
Ci,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C0-1,C0-2,1.0,C0,C0,1,1
C0-2,C0-1,1.0,C0,C0,1,1
C1-0,C1-4,0.748571,C1,C1,1,1
C1-1,C1-4,0.685714,C1,C1,1,1
C1-2,C1-4,0.778571,C1,C1,1,1


In [41]:
finalDF = maxSimByGroup.groupby(by='group_i').mean()

In [42]:
originalCover = [0.5       , 0.13323216, 0.21816578, 0.11361767, 0.12402122,
       0.11511408, 0.14405691, 0.33928571, 0.13066958, 0.14939063,
       0.18260997, 0.13354584, 0.20132275, 0.08904064, 0.33928571,
       0.13649954, 0.14916274, 0.1221884 , 0.24761905, 0.21428571,
       0.12885448, 0.1483268 , 0.12970289, 0.36428571, 0.23359199,
       0.31569737, 0.10379894, 0.20003421, 0.14717262]

In [43]:
finalDF = finalDF.assign(cover=originalCover)
finalDF.head()

Unnamed: 0_level_0,Sim,sim05,sim075,cover
group_i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0,1.0,1.0,1.0,0.5
C1,0.748,1.0,1.0,0.133232
C10,0.680423,1.0,1.0,0.218166
C11,0.628147,1.0,1.0,0.113618
C12,0.946449,1.0,1.0,0.124021


In [44]:
trace1 = go.Scatter(
    x=finalDF.index.values,
    y=finalDF.cover.values,
    mode='lines+markers',
    name='Density'
)
trace2 = go.Scatter(
    x=finalDF.index.values,
    y=finalDF.sim05.values,
    mode='lines+markers',
    name='θ = 0.5'
)

trace3 = go.Scatter(
    x=finalDF.index.values,
    y=finalDF.sim075.values,
    mode='lines+markers',
    name='θ = 0.6'
)

data = [trace1, trace2, trace3]
layout = go.Layout(xaxis=dict(title="Group Id."), yaxis=dict(title='Coverage'))
figure = go.Figure(data=data, layout=layout)

iplot(figure, filename='cover-per-group')