<a href="https://colab.research.google.com/github/SusannaValentina/Fairness/blob/master/Scraping/comfrontareClassifiche.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
def RBO(l1, l2, p):
    """
        Calculates the extrapolated Ranked Biased Overlap (RBO) score.
        l1 -- Ranked List 1 (None is treated as an empty list)
        l2 -- Ranked List 2 (None is treated as an empty list)
        p  -- weight base; depth d is weighted by pow(p, d). The final
              formula divides by p, so p == 0 raises ZeroDivisionError.

        Returns 0 if either list is empty, otherwise a float.
        Items must be hashable (they are stored in sets).
        NOTE(review): the overlap bookkeeping presumably expects each list
        to be free of duplicate items -- confirm against callers.
    """
    if l1 is None: l1 = []
    if l2 is None: l2 = []

    # Choose the shorter/longer list by length alone. Sorting (len, list)
    # tuples would fall back to comparing the lists themselves when the
    # lengths are equal, which raises TypeError for unorderable items.
    if len(l1) <= len(l2):
        S, L = l1, l2
    else:
        S, L = l2, l1
    s, l = len(S), len(L)  # s = length of smaller list, l = length of longer
    if s == 0: return 0

    # Calculate the overlaps at ranks 1 through l
    # (the longer of the two lists)
    seen_short = set()  # elements from the smaller list till depth i
    seen_long = set()   # elements from the longer list till depth i
    x_d = {0: 0}        # x_d[d] = number of common elements at depth d
    sum1 = 0.0
    for i in range(l):
        d = i + 1
        x = L[i]
        # An explicit flag (rather than a None placeholder) marks whether the
        # shorter list still has an element at this rank, so lists that
        # genuinely contain None are handled like any other item.
        has_y = i < s

        if has_y and x == S[i]:
            # same element at the same rank: one more common element and
            # nothing needs to be added to either set
            x_d[d] = x_d[d - 1] + 1.0
        else:
            seen_long.add(x)
            if has_y:
                seen_short.add(S[i])
            gained = 1.0 if x in seen_short else 0.0
            if has_y and S[i] in seen_long:
                gained += 1.0
            x_d[d] = x_d[d - 1] + gained
        # accumulate the weighted agreement at depth d
        sum1 += x_d[d] / d * pow(p, d)

    # Correction term for the ranks beyond the end of the shorter list.
    sum2 = 0.0
    for d in range(s + 1, l + 1):
        sum2 += x_d[d] * (d - s) / (d * s) * pow(p, d)

    sum3 = ((x_d[l] - x_d[s]) / l + x_d[s] / s) * pow(p, l)

    # Equation 32
    rbo_ext = (1 - p) / p * (sum1 + sum2) + sum3
    return rbo_ext

In [0]:
import pandas as pd
import numpy as np
import time
from scipy.stats._stats import _kendall_dis
from math import sqrt, fsum
from operator import itemgetter
from itertools import combinations, permutations
import numpy as np
import pandas as pd
import scipy.stats

def prova(xs, ys):
    """
    Compare two routes to a rank correlation of xs and ys.

    Prints the result of scipy.stats.pearsonr applied to the pandas ranks of
    both inputs, then returns the result of scipy.stats.spearmanr on the raw
    inputs.
    """
    # Rank both inputs with pandas' default ranking.
    rank_x, rank_y = (pd.Series(values).rank() for values in (xs, ys))

    # Pearson's correlation coefficient on the ranked versions of the data.
    print(scipy.stats.pearsonr(rank_x, rank_y))

    return scipy.stats.spearmanr(xs, ys)

def _mean(m):
    return fsum(m) / len(m)


def _fancy(m):
    a = sum(m)
    b = 0
    for i in m:
        b += i * i
    return len(m) * b - (a ** 2)


def _rank(m):
    (ivec, svec) = zip(*sorted(list(enumerate(m)), key=itemgetter(1)))
    sumranks = 0
    dupcount = 0
    newlist = [0] * len(m)
    for i in range(len(m)):
        sumranks += i
        dupcount += 1
        if i == len(m) - 1 or svec[i] != svec[i + 1]:
            averank = sumranks / float(dupcount) + 1
            for j in range(i - dupcount + 1, i + 1):
                newlist[ivec[j]] = averank
            sumranks = 0
            dupcount = 0
    return newlist


def _concordance(m, n):
    """ 
    returns count of concordant, discordant, and tied pairs
    """
    if len(m) != len(n):
        raise ValueError #, 'Iterables (m, n) must be the same length'
    c = 0
    d = 0
    iss = 0
    for (i, j) in combinations(range(len(m)), 2):
        m_dir = m[i] - m[j]
        n_dir = n[i] - n[j]
        sign = m_dir * n_dir
        if sign: # not a tie
            c += 1
            d += 1
            if sign > 0:
                iss += 1
            elif sign < 0:
                iss -= 1
        else:
            if m_dir:
                c += 1
            elif n_dir:
                d += 1
            # else is a tie in both ways and of no concern to us
    return (c, d, iss)


## USER FUNCTIONS

def pearson_rho(m, n):
    """ 
    return the Pearson rho coefficient; based off stats.py 

    Raises ValueError if m and n have different lengths.
    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> pearson_rho(x, y)
    0.9245404356092288
    """
    if len(m) != len(n):
        raise ValueError('Iterables (m, n) must be the same length')
    # numerator: N * sum(x*y) - sum(x) * sum(y)
    cross = sum(i * j for i, j in zip(m, n))
    num = len(m) * cross - sum(m) * sum(n)
    # denominator: sqrt of the product of the two _fancy() terms
    return num / sqrt(_fancy(m) * _fancy(n))


def spearman_rho(m, n):
    """ 
    return Spearman's rho; based off stats.py 

    Computed as 1 - 6 * sum(d^2) / (N * (N^2 - 1)), where d is the difference
    between the _rank() values of paired elements and N the common length.
    Raises ValueError if m and n have different lengths.
    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> spearman_rho(x, y)
    0.9363636363636364
    """
    if len(m) != len(n):
        raise ValueError('Iterables (m, n) must be the same length')
    # sum of squared rank differences
    dsq = sum((mi - ni) ** 2 for (mi, ni) in zip(_rank(m), _rank(n)))
    return 1. - 6. * dsq / float(len(m) * (len(n) ** 2 - 1.))


def spearman_rho_tr(m, n):
    """ 
    Spearman's rho computed as the correlation of the _rank() values of m
    and n; the original author describes it as "rho for tied ranks, checked
    by comparison with Pycluster"
    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> spearman_rho_tr(x, y)
    0.9348938334114621
    """
    ranks_m = _rank(m)
    ranks_n = _rank(n)
    mean_m = _mean(ranks_m)
    mean_n = _mean(ranks_n)

    covariance = 0.
    spread_m = 0.
    spread_n = 0.
    # zip() stops at the shorter of the two rank lists
    for (rank_m, rank_n) in zip(ranks_m, ranks_n):
        dev_m = rank_m - mean_m
        dev_n = rank_n - mean_n
        covariance += dev_m * dev_n
        spread_m += dev_m ** 2
        spread_n += dev_n ** 2
    return covariance / sqrt(spread_m * spread_n)


def goodman_kruskal_gamma(m, n):
    """ 
    compute the Goodman and Kruskal gamma rank correlation coefficient; 
    this statistic ignores ties and is unsuitable when the number of ties
    in the data is high. every pair of positions is compared, so the cost
    grows quadratically with len(m).

    Raises ZeroDivisionError when no pair is untied in both m and n.
    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> goodman_kruskal_gamma(x, y)
    0.9166666666666666
    """
    num = 0  # concordant minus discordant pairs
    den = 0  # pairs untied in both m and n
    # The sign of a pair does not depend on the order of (i, j), so each
    # unordered pair is visited once instead of twice via permutations();
    # the ratio num / den is unchanged.
    for (i, j) in combinations(range(len(m)), 2):
        sign = (m[i] - m[j]) * (n[i] - n[j])
        if sign > 0:
            num += 1
            den += 1
        elif sign < 0:
            num -= 1
            den += 1
    return num / float(den)


def kendall_tau_b(m, n):
    """ 
    compute Kendall's rank correlation coefficient tau_b as
    iss / sqrt(c * d), using the (c, d, iss) tuple from _concordance(m, n).
    the original author notes this is based on stats.py and that its results
    accord with STATA/SPSS/SAS/R (not re-verified here)
    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> kendall_tau_b(x, y)
    0.8629109946080097
    """
    untied_m, untied_n, net = _concordance(m, n)
    denominator = sqrt(untied_m * untied_n)
    return net / denominator


def kendall_tau_c(m, n):
    """ 
    compute Kendall's rank correlation coefficient tau_c as
    2 * k * iss / ((k - 1) * N^2), where iss comes from _concordance(m, n),
    k is the smaller number of distinct values in m or n, and N is len(m)
    >>> x = [2, 8, 5, 4, 2, 6, 1, 4, 5, 7, 4]
    >>> y = [3, 9, 4, 3, 1, 7, 2, 5, 6, 8, 3]
    >>> kendall_tau_c(x, y)
    0.8484848484848485
    """
    # only the concordant-minus-discordant count is needed here
    net = _concordance(m, n)[2]
    distinct_m = len(set(m))
    distinct_n = len(set(n))
    smaller = distinct_m if distinct_m <= distinct_n else distinct_n
    return (2. * smaller * net) / ((smaller - 1) * len(m) ** 2)


# Run the doctests embedded in the docstrings above when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()



# import data
#df_ = pd.read_csv('Data/CR_mockData_EAD.csv')
#df = df_[['realized_ead', 'pred_value']][0:100]

def SomersD(x, y):
    """
    Compute a Somers' D style statistic from pair counts of x and y.

    x, y -- array-likes; both are flattened and must have the same size,
            otherwise ValueError is raised.

    Returns a tuple (SD, dis): SD is the ratio computed on the last line of
    the body, dis is the discordant-pair count returned by scipy's private
    helper _kendall_dis.
    """

    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()

    if x.size != y.size:
        raise ValueError("All inputs must be of the same size, "
                         "found x-size %s and y-size %s" % (x.size, y.size))

    def count_rank_tie(ranks):
        # Occurrences of each rank value, keeping only ranks seen more than once.
        cnt = np.bincount(ranks).astype('int64', copy=False)
        cnt = cnt[cnt > 1]
        # First item: number of tied pairs. The other two sums are returned
        # but not used anywhere in SomersD.
        return ((cnt * (cnt - 1) // 2).sum(),
            (cnt * (cnt - 1.) * (cnt - 2)).sum(),
            (cnt * (cnt - 1.) * (2*cnt + 5)).sum())

    size = x.size
    perm = np.argsort(y)  # sort on y and convert y to dense ranks
    x, y = x[perm], y[perm]
    # A new rank starts wherever the sorted value changes; cumsum numbers them 1, 2, ...
    y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp)

    # stable sort on x and convert x to dense ranks
    perm = np.argsort(x, kind='mergesort')
    x, y = x[perm], y[perm]
    x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp)

    dis = _kendall_dis(x, y)  # discordant pairs

    # Boundaries of runs in which both x and y stay unchanged; the run
    # lengths give the groups tied in x and y simultaneously.
    obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
    cnt = np.diff(np.where(obs)[0]).astype('int64', copy=False)

    ntie = (cnt * (cnt - 1) // 2).sum()  # joint ties
    xtie, x0, x1 = count_rank_tie(x)     # ties in x, stats (x0, x1 unused)
    ytie, y0, y1 = count_rank_tie(y)     # ties in y, stats (y0, y1 unused)

    tot = (size * (size - 1)) // 2  # total number of pairs

    # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
    #               = con + dis + xtie + ytie - ntie
    # so con - dis = tot - xtie - ytie + ntie - 2 * dis, the numerator below.
    # NOTE(review): the denominator excludes only the joint ties (ntie);
    # confirm this is the intended Somers' D definition.
    SD = (tot - xtie - ytie + ntie - 2 * dis) / (tot - ntie)
    return (SD, dis)




def score(l1, l2, depth = 10):
    """
        Calculates Average Overlap score. 
        l1 -- Ranked List 1 (None is treated as an empty list)
        l2 -- Ranked List 2 (None is treated as an empty list)
        depth -- deepest rank to evaluate; capped at the length of the
                 longer list

        At every rank d from 1 to depth, the agreement is twice the number
        of items common to both prefixes divided by the combined prefix
        sizes; the result is the mean of those agreements.
        Returns 0 if either list is empty or depth is not positive.
        Items must be hashable (they are stored in sets).
    """
    if l1 is None: l1 = []
    if l2 is None: l2 = []

    # Choose the shorter/longer list by length alone. Sorting (len, list)
    # tuples would fall back to comparing the lists themselves when the
    # lengths are equal, which raises TypeError for unorderable items.
    if len(l1) <= len(l2):
        S, L = l1, l2
    else:
        S, L = l2, l1
    s, l = len(S), len(L)  # s = length of smaller list, l = length of longer
    # sanity check
    if s == 0: return 0

    max_depth = depth if depth < l else l
    # No ranks to evaluate; avoids dividing by zero at the end.
    if max_depth <= 0: return 0

    seen_short = set()  # items of the shorter list seen so far
    seen_long = set()   # items of the longer list seen so far
    overlap = 0         # twice the number of common items up to the current rank
    total = 0.0         # running sum of per-rank agreements

    for i in range(max_depth):
        d = i + 1
        x = L[i]
        # An explicit flag (rather than a None placeholder) marks whether the
        # shorter list still has an element at this rank, so lists that
        # genuinely contain None are handled like any other item.
        has_y = i < s

        if has_y and x == S[i]:
            # same item at the same rank: counted once for each list
            overlap += 2
        else:
            seen_long.add(x)
            if has_y:
                seen_short.add(S[i])
            if x in seen_short:
                overlap += 2
            if has_y and S[i] in seen_long:
                overlap += 2
        # The shorter list contributes at most s items to its prefix.
        total += float(overlap) / (min(s, d) + d)

    return total / max_depth

    
#start_time = time.time()
#SD, dis = SomersD(df.realized_ead, df.pred_value)
#print("--- %s seconds ---" % (time.time() - start_time))

In [0]:
import csv
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials
from google.colab import auth
import json


# Authenticate the Colab user and build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download the two ranking files from Google Drive by file id.
Restaurant_Info = '15LwpZqBUvtbuePS-7K0ezRtfUecInjx9'
download = drive.CreateFile({'id': Restaurant_Info})
download.GetContentFile('Tupelo Honey_date_desc.csv')

Restaurant_RBO = '1aOTHWSNdkjzW7EyyaomdXxgnzjXdqN7S'
download = drive.CreateFile({'id': Restaurant_RBO})
download.GetContentFile('Tupelo Honey_relevance_desc.csv')


def _read_first_column(path):
    """Return the first field of every data row of the CSV file at path.

    The first row is treated as a header and skipped; rows without any
    field (blank lines) are ignored.
    """
    # newline='' lets the csv module do its own newline handling.
    with open(path, newline='') as handle:
        reader = csv.reader(handle, delimiter=',')
        next(reader, None)  # skip the header row, if there is one
        return [row[0] for row in reader if row]


# a: first column of the date-ordered file
# b: first column of the relevance-ordered file
a = _read_first_column('Tupelo Honey_date_desc.csv')
b = _read_first_column('Tupelo Honey_relevance_desc.csv')


# Small numeric samples for trying the correlation functions by hand.
c = [2, 8, 2, 4, 1, 6, 1, 4, 5, 7, 4]
d = [1, 3, 5, 6, 1, 6, 1, 4, 5, 7, 4]

#print(pearson_rho(a,b))
#print(spearman_rho(a,b))
print(spearman_rho_tr(a,b))
print(score(a,b))
print(RBO(a,b,0.93))
#print(prova(a,b))
#print(goodman_kruskal_gamma(a,b))
#print(kendall_tau_b(a,b))
#print(kendall_tau_c(a,b))
#print(SomersD(a,b)[0])