# EUR-LEX Case Sampling Notebook

### Abstract: 
#### This notebook samples cases for our study from the three chosen topics: Public Health, Data Protection & Social Policy

### Step 1. Find all case identifiers for cases concerning the three topics

In [26]:
# library for reading and writing CSV files
import csv

# Initialise variables to store case identifiers (CELEX numbers) for the three topics
publichealth = []
socialpolicy = []
dataprotection = []

# Find and store all case identifiers for the three topics
with open('../inputdata/all_cases_subjects.csv', newline='') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in csvreader:
        if ("health" in row[1].lower()):
            publichealth.append(row[0])
        if ("social policy" in row[1].lower()):
            socialpolicy.append(row[0])
        if ("data" in row[1].lower()):
            dataprotection.append(row[0])
            
# Remove any duplicates
publichealth = list(set(publichealth))
socialpolicy = list(set(socialpolicy))
dataprotection = list(set(dataprotection))

# Print the number of cases found in each topic 
print("# of Public Health cases found:",len(publichealth))
print("# of Social Policy cases found:",len(socialpolicy))
print("# of Data Protection cases found:",len(dataprotection))

# of Public Health cases found: 181
# of Social Policy cases found: 707
# of Data Protection cases found: 42


### Step 2. Identify all case citations that are relevant for each topic

In [27]:
# Import pandas Python library for manipulation and processing of tabular data
import pandas as pd

# Import (read) the file containing the citations for all extracted cases from EUR-LEX
citationsdata = pd.read_csv("../inputdata/all_cases_citations.csv") 
# Retain only those citations that involve cases in our chosen topics:
# 1. Public Health
publichealthcitations = citationsdata[citationsdata['target'].isin(publichealth)]
#print(len(publichealthcitations['target'].unique()))
# 2. Social Policy
socialpolicycitations = citationsdata[citationsdata['target'].isin(socialpolicy)]
#print(len(socialpolicycitations['target'].unique()))
# 3. Data Protection
dataprotectioncitations = citationsdata[citationsdata['target'].isin(dataprotection)]
#print(len(dataprotectioncitations['target'].unique()))

#print(dataprotectioncitations.head(20))

### Step 3. Identify uncited cases

##### I.e. cases from our original set which don't appear in 'citations.csv' (they are never cited). These uncited cases will be taken into account later on for sampling.

In [28]:
# Function to identify uncited cases
def get_uncited_cases(allcases, citedcases):
    return allcases.difference(citedcases)

# 1. Public Health
uncited_publichealth_cases = get_uncited_cases(set(publichealth), set(publichealthcitations['target'].unique()))
#print(len(list(uncited_publichealth_cases)))
# 2. Social Policy
uncited_socialpolicy_cases = get_uncited_cases(set(socialpolicy), set(socialpolicycitations['target'].unique()))
#print(len(list(uncited_socialpolicy_cases)))
# 3 Data Protection
uncited_dataprotection_cases = get_uncited_cases(set(dataprotection), set(dataprotectioncitations['target'].unique()))
#print(len(list(uncited_dataprotection_cases)))

### Step 4. Add the uncited cases into the case citations dataframes

In [29]:
# Function to sort and return a frame of case identifiers sorted by their number of citations - descending order
def get_descending_sorted_frame_of_citations(cases_dataframe):
    return cases_dataframe.groupby('target')['source'].count().reset_index(name='citations').sort_values('citations',ascending=False)

# Apply the above function to the citation frames for each topic
publichealth_citations_sorted_df = get_descending_sorted_frame_of_citations(publichealthcitations)
socialpolicy_citations_sorted_df = get_descending_sorted_frame_of_citations(socialpolicycitations)
dataprotection_citations_sorted_df = get_descending_sorted_frame_of_citations(dataprotectioncitations)

# Add uncited cases
# 1. Public Health
publichealth_newrows = []
for item in list(uncited_publichealth_cases):
    publichealth_newrows.append({'target' : item, 'citations' : 0})
ph_toadd = pd.DataFrame(publichealth_newrows)
# 2. Social Policy
socialpolicy_newrows = []
for item in list(uncited_socialpolicy_cases):
    socialpolicy_newrows.append({'target' : item, 'citations' : 0})
sp_toadd = pd.DataFrame(socialpolicy_newrows)
# 3. Data Protection
dataprotection_newrows = []
for item in list(uncited_dataprotection_cases):
    dataprotection_newrows.append({'target' : item, 'citations' : 0})
dp_toadd = pd.DataFrame(dataprotection_newrows)
    
publichealth_citations_sorted_df = publichealth_citations_sorted_df.append(ph_toadd, ignore_index=True, sort=False)
socialpolicy_citations_sorted_df = socialpolicy_citations_sorted_df.append(sp_toadd, ignore_index=True, sort=False)
dataprotection_citations_sorted_df = dataprotection_citations_sorted_df.append(dp_toadd, ignore_index=True, sort=False)

#print(len(publichealth_citations_sorted_df['target']))
#print(len(socialpolicy_citations_sorted_df['target']))
#print(len(dataprotection_citations_sorted_df['target']))

#print(publichealth_citations_sorted_df.tail(10))
#print(socialpolicy_citations_sorted_df.tail(10))
#print(dataprotection_citations_sorted_df.tail(10))

### Step 5. Now find the top n cited cases in each topic

In [30]:
# Function to get the top n cited cases
def find_top_n_cited_cases(cases_dataframe, n):
    return cases_dataframe.head(n)

print("Public Health:")
print("-------------")
print(find_top_n_cited_cases(publichealth_citations_sorted_df,3))
print()
print("Social Policy:")
print("--------------")
print(find_top_n_cited_cases(socialpolicy_citations_sorted_df,3))
print()
print("Data Protection:")
print("---------------")
print(find_top_n_cited_cases(dataprotection_citations_sorted_df,3))
print()

Public Health:
-------------
        target  citations
0  62003CJ0453         32
1  62004CJ0372         24
2  61988CJ0070         12

Social Policy:
--------------
        target  citations
0  61990CJ0006         43
1  61984CJ0152         40
2  62006CJ0268         40

Data Protection:
---------------
        target  citations
0  62000CJ0465         17
1  62009CJ0092         16
2  62001CJ0101         15



### Step 6. Identify the bottom 3 cited cases in each topic

In [31]:
# Function to get the top n cited cases
def find_bottom_n_cited_cases(cases_dataframe, n):
    return cases_dataframe.tail(n)

print("Public Health:")
print("-------------")
print(find_bottom_n_cited_cases(publichealth_citations_sorted_df,3))
print()
print("Social Policy:")
print("--------------")
print(find_bottom_n_cited_cases(socialpolicy_citations_sorted_df,3))
print()
print("Data Protection:")
print("---------------")
print(find_bottom_n_cited_cases(dataprotection_citations_sorted_df,3))
print()

Public Health:
-------------
          target  citations
178  62016CO0663          0
179  62015CJ0138          0
180  62011CJ0520          0

Social Policy:
--------------
          target  citations
704  61999CJ0026          0
705  61989CJ0051          0
706  61996CJ0106          0

Data Protection:
---------------
         target  citations
39  62015CJ0536          0
40  62013CO0683          0
41  62015CJ0398          0



### Step 7. Try binning the cases into quantiles (according to their number of citations)

#### Step 7a. First compute the bins based on the citation count

In [32]:
# Function to find the (.1, .25, .5, .75) quantiles for the citation numbers
def find_citation_quantiles(cases_dataframe):
    return cases_dataframe.quantile([0, .1, .25, .5, .75, .9, 1], axis = 0) 

# Function to sort and return a frame of cases with their number of citations
def get_descending_sorted_frame_of_citations(cases_dataframe):
    return cases_dataframe.groupby('target')['source'].count().reset_index(name='citations').sort_values('citations',ascending=False)

publichealth_citation_quantiles = find_citation_quantiles(publichealth_citations_sorted_df)
socialpolicy_citation_quantiles = find_citation_quantiles(socialpolicy_citations_sorted_df)
dataprotection_citation_quantiles = find_citation_quantiles(dataprotection_citations_sorted_df)

print("Public Health:")
print("-------------")
print(publichealth_citation_quantiles)
print()
print("Social Policy:")
print("--------------")
print(socialpolicy_citation_quantiles)
print()
print("Data Protection:")
print("---------------")
print(dataprotection_citation_quantiles)
print()

Public Health:
-------------
      citations
0.00        0.0
0.10        0.0
0.25        0.0
0.50        1.0
0.75        2.0
0.90        5.0
1.00       32.0

Social Policy:
--------------
      citations
0.00        0.0
0.10        0.0
0.25        1.0
0.50        2.0
0.75        5.0
0.90        9.0
1.00       43.0

Data Protection:
---------------
      citations
0.00       0.00
0.10       0.00
0.25       1.00
0.50       2.50
0.75       5.75
0.90      10.90
1.00      17.00



#### Step 7b. populate the bins with cases (actually do the sample)

In [47]:
import random
import csv

def get_quantile_pairs(quantile_range):
    if len(quantile_range) < 2:
        if len(quantile_range) == 1:
            quantile_range.insert(0,quantile_range[0])
            return quantile_range
        else:
            return quantile_range
    else:
        result = []
        index = len(quantile_range)-1
        done = False
        while not done:
            current_quantile_pair = []
            current_quantile_pair.append(quantile_range[index-1])
            current_quantile_pair.append(quantile_range[index])
            result.append(current_quantile_pair)
            index -= 1
            if index == 0:
                done = True
        return result
            
def randomly_sample_case(sourcecases_df, lowerbound, upperbound):
    index_range = sourcecases_df.index[sourcecases_df['citations'].between(lowerbound, upperbound, inclusive=True)].tolist()
    done = False
    index = 0
    while not done:
        index = random.randint(index_range[0],index_range[len(index_range)-1]+1)   
        try:
            row = sourcecases_df.loc[ index , : ]
            done = True                
        except Exception as e:
            done = False
    return index
    
def pick_cases(sourcecases_df, citation_quantiles, sample_size, quantile_range, topic):
    quantile_range_pairs = get_quantile_pairs(quantile_range)
    sample = []
    for quantile_pair in quantile_range_pairs:
        current_bin_sample = []
        current_bin_rows = []
        lowerbound = citation_quantiles['citations'][quantile_pair[0]]
        upperbound = citation_quantiles['citations'][quantile_pair[1]]
        while len(current_bin_sample) < sample_size:
            random_index = randomly_sample_case(sourcecases_df, lowerbound, upperbound)
            random_row = sourcecases_df.loc[ random_index , : ]
            if (random_row['citations'] >= lowerbound and random_row['citations'] <= upperbound):
                current_bin_sample.append([random_row['target'],random_row['citations']])
                current_bin_sample = [list(i) for i in set(map(tuple, current_bin_sample))]
        for item in current_bin_sample:
            current_bin_row = []
            current_bin_row.append(quantile_pair[1])
            current_bin_row.append(item[0])
            current_bin_row.append(item[1])
            current_bin_row.append(topic)
            current_bin_rows.append(current_bin_row)
            sample.extend(current_bin_rows)
    return sample
            
sample_cases = []

sample_cases.extend(pick_cases(publichealth_citations_sorted_df, publichealth_citation_quantiles, 3, [0, .1, .25, .5, .75, .9, 1], 'public health'))
sample_cases.extend(pick_cases(socialpolicy_citations_sorted_df, socialpolicy_citation_quantiles, 3, [0, .1, .25, .5, .75, .9, 1], 'social policy'))
sample_cases.extend(pick_cases(dataprotection_citations_sorted_df, dataprotection_citation_quantiles, 3, [0, .1, .25, .5, .75, .9, 1], 'data protection'))
sample_cases = [list(i) for i in set(map(tuple, sample_cases))]
sample_cases.insert(0, ['quantile','source','citations','topic'])

with open('../inputdata/sampled_cases.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',')
    writer.writerows(sample_cases)
