In [None]:
import pandas as pd
import glob
import json
import numpy as np

In [None]:
# Load scorefile metadata (a JSON object keyed by scoring file; see the
# "Scoring Files" cell for the fields that are read from each entry).
with open('log_scorefiles.json', 'r') as jsonfile:
    json_scores = json.load(jsonfile)

# NOTE: glob.glob() returns paths in arbitrary (filesystem-dependent) order.
# Sorting the paths keeps the row order of the concatenated frames -- and
# therefore the rendered report -- identical from one run to the next.

# Load logs: one variant-matching summary CSV per input, stacked row-wise.
paths_log = sorted(glob.glob("*_summary.csv"))
log = pd.concat([pd.read_csv(path) for path in paths_log])

# Load PGS data: tab-separated, first two columns form the row index.
paths_pgs = sorted(glob.glob("*_pgs.txt.gz"))
pgs = pd.concat([pd.read_csv(path, index_col=[0, 1], sep='\t') for path in paths_pgs])

# Load PCA/pop sim: same layout as the PGS files (two index columns, TSV).
paths_popsim = sorted(glob.glob("*_popsimilarity.txt.gz"))
popsim = pd.concat([pd.read_csv(path, index_col=[0, 1], sep='\t') for path in paths_popsim])

# Pipeline Command

In [None]:
# Shell escape: print the contents of command.txt into the cell output.
! cat command.txt

# Scoring file metadata

> Additional [documentation](https://pgsc-calc.readthedocs.io/en/latest/output.html#report) is available that explains some of the terms used in this report in more detail.

## Scoring Files

In [None]:
# Build one display row per scoring file from the scorefile metadata JSON.
# Cells hold small HTML snippets (links, <small>, <br>, <u>) for the report.
metadata_scores = {}

for scorefile, metadata in json_scores.items():
    # Columns with nothing to show keep their NaN placeholder.
    result = {'Polygenic Score ID': np.nan, 'Publication': np.nan, 'Trait': np.nan, 'Number of variants': int(metadata['variants_number']), 'Genome build': np.nan}

    # Polygenic Score ID: catalog link when an ID exists, with the score name
    # appended in small print (or shown alone when there is no ID).
    pgs_id = ''
    if pd.notnull(metadata['pgs_id']):
        pgs_id += '<a href="https://www.pgscatalog.org/score/{}">{}</a>'.format(metadata['pgs_id'], metadata['pgs_id'])

    if pd.notnull(metadata['pgs_name']):
        if pgs_id == '':
            pgs_id = metadata['pgs_name']
        else:
            pgs_id += '<br><small>({})</small>'.format(metadata['pgs_name'])
    result['Polygenic Score ID'] = pgs_id

    # Publication: catalog link and/or citation, separated by a line break.
    pgp_id = ''
    if pd.notnull(metadata['pgp_id']):
        pgp_id += '<a href="https://www.pgscatalog.org/publication/{}">{}</a>'.format(metadata['pgp_id'], metadata['pgp_id'])

    if pd.notnull(metadata['citation']):
        if pgp_id != '':
            pgp_id += "<br>"

        pgp_id += '<small>{}</small>'.format(metadata['citation'])

    if pgp_id != '':
        result['Publication'] = pgp_id

    # Trait: reported trait and mapped (EFO) traits are independent, so each
    # is rendered whenever it is available. Previously the mapped traits were
    # silently dropped if the reported trait was missing.
    trait_parts = []
    if pd.notnull(metadata['trait_reported']):
        trait_parts.append('<u>Reported trait</u>: {}'.format(metadata['trait_reported']))

    # .get() + truthiness guards against the key being absent, null, or an
    # empty list (a null value used to crash in zip(); an empty list used to
    # leave a dangling "Mapped trait: " label).
    efo_ids = metadata.get('trait_efo')
    efo_names = metadata.get('trait_mapped')
    if efo_ids and efo_names:
        urls = [
            '<a href="http://www.ebi.ac.uk/efo/{}">{}</a>'.format(efo_id, trait_name)
            for efo_id, trait_name in zip(efo_ids, efo_names)
        ]
        trait_parts.append('<u>Mapped trait</u>: ' + ', '.join(urls))

    if trait_parts:
        result['Trait'] = '<br>'.join(trait_parts)

    # Genome build: always show the reported build; add the harmonized build
    # only when the harmonised scoring file was used.
    build_info = '<u>Reported</u>: {}'.format(metadata['genome_build'])
    if metadata['use_harmonised'] is True:
        build_info += '<br><u>Harmonized Build</u>: {}'.format(metadata['HmPOS_build'])
    result['Genome build'] = build_info

    metadata_scores[scorefile] = result


# Rows are scoring files, columns are the fields assembled above.
metadata_scores = pd.DataFrame.from_dict(metadata_scores, orient='index')
metadata_scores.index.name = 'Scoring File'
metadata_scores

## Variant matching

### Parameters

In [None]:
# Shell escape: print the contents of params.txt into the cell output.
! cat params.txt

### Summary

In [None]:
summary = {}

def SummarizeMatching(df):
    """Summarise variant-matching counts for one group of log rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with a ``match_status`` column and a numeric ``count`` column.

    Returns
    -------
    pandas.Series
        ``perc_matched`` (share of the total whose status is ``'matched'``,
        in percent), ``n_matched``, ``n_unmatched`` (every status other than
        ``'matched'``) and ``n`` (the sum of the two).
    """
    counts = df['count']
    status = df.match_status

    matched_total = counts[status == 'matched'].sum()
    unmatched_total = counts[status != 'matched'].sum()
    total = matched_total + unmatched_total

    stats = {
        'perc_matched': 100*matched_total/total,
        'n_matched': matched_total,
        'n_unmatched': unmatched_total,
        'n': total,
    }
    return pd.Series(stats)

# Total the variant counts per (sampleset, scoring file, pass flag, match
# status), then reduce each (sampleset, scoring file, pass flag) group to a
# single row of matching statistics via SummarizeMatching.
agg_log = log.groupby(['dataset', 'accession', 'score_pass', 'match_status'])[['count']].sum().reset_index()
summary = agg_log.groupby(['dataset', 'accession', 'score_pass']).apply(SummarizeMatching).reset_index()

# Reorder columns & rows (best-matched scoring files first)
summary = summary[['dataset', 'accession', 'n', 'score_pass', 'perc_matched', 'n_matched', 'n_unmatched']].sort_values('perc_matched', ascending=False)

# Rename columns to report-friendly labels.
# The key must be 'perc_matched' (the name SummarizeMatching emits); the
# previous key 'percent_matched' matched no column, so the percentage column
# was left with its raw name in the rendered table.
names_summary = {"dataset": "Sampleset", "accession": "Scoring file",
                 "n": "Number of variants",
                 "score_pass": "Passed matching", "perc_matched": "Match %",
                 "n_matched": "Total matched", "n_unmatched": "Total unmatched"
                }
summary = summary.convert_dtypes().rename(names_summary, axis='columns')
summary

### Detailed Results

In [None]:
# Report-friendly labels for the columns of the matching log.
names_log = {"dataset": "Sampleset", "accession": "Scoring file",
             'match_status': "Match type", "n": "Number of variants",
             "duplicate_best_match": "Multiple potential matches",
             "duplicate_ID": "Duplicated matched variants",
             "ambiguous": "Ambiguous",
             "match_flipped" : "Flipped Match",
             "match_IDs" : "Matches Reference IDs",
             "is_multiallelic": "Multiallelic" ,
             'count' : 'N',
             "percent": "%"
            }

# Sort rows so that, within each scoring file, statuses appear in the order
# matched -> excluded -> unmatched.
# reorder_categories() requires the new list to contain exactly the existing
# categories, so the order is built from the statuses actually present:
# a log without e.g. 'excluded' rows (or with an extra status) no longer
# raises ValueError. Unlisted statuses keep their relative order at the end.
status_order = ["matched", "excluded", "unmatched"]
log['match_status'] = log['match_status'].astype('category')
observed_statuses = list(log['match_status'].cat.categories)
log['match_status'] = log['match_status'].cat.reorder_categories(
    [status for status in status_order if status in observed_statuses]
    + [status for status in observed_statuses if status not in status_order]
)
log = log.sort_values(['accession', 'match_status'])
# NOTE: this renames the columns of `log` itself ('match_status' becomes
# "Match type", etc.), so this cell and the summary cell cannot be re-run
# without first re-running the cell that loads `log`.
log = log.rename(names_log, axis='columns')
log

# Scores