In [20]:
import pandas as pd
import glob

In [21]:
# join multiple csv files into one 
df = pd.concat([pd.read_csv(f) for f in glob.glob('./predictions/*.csv')], ignore_index = True)
df

Unnamed: 0,ID,PREDICTED
0,0,neutral
1,1,neutral
2,2,neutral
3,3,neutral
4,4,neutral
...,...,...
1495,1195,neutral
1496,1196,positive
1497,1197,neutral
1498,1198,neutral


In [22]:
gt = pd.read_csv('./gt.csv')
gt

Unnamed: 0,id,text,oracle
0,6,But sadly this is not working.,-1
1,78,"So, everything builds fine, but when we try to...",-1
2,90,That is what is causing your null pointer exce...,-1
3,139,"All attempts I've made were, in a shortcut, un...",-1
4,162,Don't use.,-1
...,...,...,...
1495,6184,Very good example of steady pooling readHere.,1
1496,6238,Now we're getting to the good part.,1
1497,6308,So far i've done this for Twitter and it works...,1
1498,6328,I solved the earlier problem.,1


In [23]:
# convert df's column predicted values from (negative, neutral and positive) to (-1, 0, 1)
df['PREDICTED'] = df['PREDICTED'].map({'negative': -1, 'neutral': 0, 'positive': 1})
df

Unnamed: 0,ID,PREDICTED
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
1495,1195,0
1496,1196,1
1497,1197,0
1498,1198,0


In [24]:
# refer the predicted values with the ground truth values, estimate a confusion matrix
df['GT'] = gt['oracle']
df

Unnamed: 0,ID,PREDICTED,GT
0,0,0,-1
1,1,0,-1
2,2,0,-1
3,3,0,-1
4,4,0,-1
...,...,...,...
1495,1195,0,1
1496,1196,1,1
1497,1197,0,1
1498,1198,0,1


In [25]:
# calculate the accuracy
df['ACCURACY'] = df['PREDICTED'] == df['GT']

# calculate the percentage of accuracy
df['ACCURACY'].value_counts(normalize=True) * 100

True     73.866667
False    26.133333
Name: ACCURACY, dtype: float64

In [28]:
# calculate the true positive, true negative, false positive and false negative
df['TP'] = (df['PREDICTED'] == 1) & (df['GT'] == 1)
df['TN'] = (df['PREDICTED'] == -1) & (df['GT'] == -1)
df['FP'] = (df['PREDICTED'] == 1) & (df['GT'] == -1)
df['FN'] = (df['PREDICTED'] == -1) & (df['GT'] == 1)

# calculate tp rate, tn rate, fp rate and fn rate
tp = df['TP'].value_counts(normalize=True) * 100
tn= df['TN'].value_counts(normalize=True) * 100
fp = df['FP'].value_counts(normalize=True) * 100
fn = df['FN'].value_counts(normalize=True) * 100

# make a table of tp, tn, fp and fn
table = pd.DataFrame({'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn})

display(table)


Unnamed: 0,TP,TN,FP,FN
False,99.6,99.0,99.733333,100.0
True,0.4,1.0,0.266667,
