In [None]:
import requests
import urllib3
import ssl
import pandas as pd
from lxml import html
from lxml.etree import tostring


class CustomHttpAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter that passes a caller-supplied SSL context to urllib3.

    The context given at construction time is kept on the instance and
    handed to the pool manager created by ``init_poolmanager``.
    """

    def __init__(self, ssl_context=None, **kwargs):
        """Remember ``ssl_context`` and forward all other options to the base class."""
        # Stored before delegating so that ``init_poolmanager`` can read it
        # if the base initialiser builds the pool during construction.
        self.ssl_context = ssl_context
        super().__init__(**kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        """Create ``self.poolmanager`` using the stored SSL context."""
        pool_options = {
            'num_pools': connections,
            'maxsize': maxsize,
            'block': block,
            'ssl_context': self.ssl_context,
        }
        self.poolmanager = urllib3.poolmanager.PoolManager(**pool_options)


def get_legacy_session():
    """Return a ``requests`` session whose HTTPS traffic uses a relaxed SSL context.

    A default server-authentication context is created, option bit ``0x4``
    is switched on, and a ``CustomHttpAdapter`` carrying that context is
    mounted for every ``https://`` URL.
    """
    legacy_context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
    legacy_context.options = legacy_context.options | 0x4  # OP_LEGACY_SERVER_CONNECT
    adapter = CustomHttpAdapter(legacy_context)

    legacy_session = requests.session()
    legacy_session.mount('https://', adapter)
    return legacy_session

def get_alliance_disease(code:str='DOID:9970',entity:str='models'):
    """Fetch the records of one entity type linked to a disease from the Alliance API.

    Args:
        code: Disease identifier placed in the URL path (e.g. ``'DOID:9970'``).
        entity: Path segment following the disease code that selects what is
            listed (defaults to ``'models'``).

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None``.
    """
    # The URL used to be hard-coded to DOID:0060643/models, so both
    # arguments were silently ignored; they are now interpolated.
    url = f'https://www.alliancegenome.org/api/disease/{code}/{entity}?page=1&limit=100000&sortBy='

    with (
        get_legacy_session() as s,
        s.get(url) as response
    ):
        if response.status_code != 200:
            return None
        return response.json()


def filter_diseases(json_data):
    """
    Keep only the entries of ``json_data['results']`` whose ``'category'``
    equals ``'disease'``, preserving their original order.

    Returns a new list; the input mapping is not modified.
    """
    return [
        entry
        for entry in json_data['results']
        if entry['category'] == 'disease'
    ]


def get_disease_id(disease_name:str):
    """Look up the primary key of the first disease matching ``disease_name``.

    The Alliance ``search_autocomplete`` endpoint is queried and its results
    are narrowed to the ``'disease'`` category with ``filter_diseases``.

    Args:
        disease_name: Free-text name sent as the ``q`` query parameter.

    Returns:
        The ``'primaryKey'`` of the first disease hit, or ``None`` when the
        request does not return HTTP 200 or no disease entry is found.
    """
    from urllib.parse import quote

    # Percent-encode the name so characters such as '&', '#' or '+' cannot
    # truncate or alter the query string.
    encoded_name = quote(str(disease_name), safe='')
    url = f'https://www.alliancegenome.org/api/search_autocomplete?q={encoded_name}'
    with (
        get_legacy_session() as s,
        s.get(url) as response
    ):
        if response.status_code != 200:
            return None
        matches = filter_diseases(response.json())

    if not matches:
        return None
    return matches[0]['primaryKey']

def get_multiple_diseases(disease_list,entity:str='models'):
    """Fetch several diseases and stack their ``results`` into one DataFrame.

    NOTE: a function with the same name and signature is defined again
    further down in this file; at import time that later definition is the
    one bound to the name.

    Args:
        disease_list: Iterable of disease codes passed one by one to
            ``get_alliance_disease``.
        entity: Forwarded unchanged to ``get_alliance_disease``.

    Returns:
        A DataFrame holding the flattened ``results`` records of every
        successful request, re-indexed from 0. An empty DataFrame is
        returned when there is nothing to concatenate.
    """
    frames = []

    for disease in disease_list:
        payload = get_alliance_disease(code=disease, entity=entity)
        if payload is None:
            # get_alliance_disease signals a failed request with None, which
            # pd.json_normalize cannot process; skip that disease.
            continue
        frames.append(pd.json_normalize(payload, record_path=["results"]))

    # pd.concat raises ValueError on an empty list.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
      

def get_multiple_diseases_from_name(disease_list,entity:str='models'):
    """Resolve several disease names to their identifiers.

    Args:
        disease_list: Iterable of disease names, each passed to
            ``get_disease_id``.
        entity: Accepted for signature compatibility; not used here.

    Returns:
        A list positionally aligned with ``disease_list`` holding whatever
        ``get_disease_id`` returned for each name (``None`` for names that
        could not be resolved). An empty input yields an empty list.
    """
    # The loop used to collect every id but then return only the last one
    # (and failed on empty input because that variable was never bound).
    return [get_disease_id(disease) for disease in disease_list]

def get_multiple_diseases(disease_list,entity:str='models'):
    """Fetch several diseases and stack their ``results`` into one DataFrame.

    NOTE: this redefines a function of the same name that appears earlier
    in this file.

    Args:
        disease_list: Iterable of disease codes passed one by one to
            ``get_alliance_disease``.
        entity: Forwarded unchanged to ``get_alliance_disease``.

    Returns:
        A DataFrame holding the flattened ``results`` records of every
        successful request, re-indexed from 0. An empty DataFrame is
        returned when there is nothing to concatenate.
    """
    frames = []

    for disease in disease_list:
        payload = get_alliance_disease(code=disease, entity=entity)
        if payload is None:
            # get_alliance_disease signals a failed request with None, which
            # pd.json_normalize cannot process; skip that disease.
            continue
        frames.append(pd.json_normalize(payload, record_path=["results"]))

    # pd.concat raises ValueError on an empty list.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


## MGI Phenotypes extraction

from lxml import html
import requests
import pandas as pd
 
def _mouse_model_system_frames(tree, model_id):
    """Return one DataFrame per ``mpSystemRow`` section of a parsed genoview page."""
    system_ids = tree.xpath('//div[@class="mpSystemRow"]/@id')
    system_names = [
        label.strip()
        for label in tree.xpath('//div[@class="mpSystemRow"]/text()[normalize-space()]')
    ]

    frames = []
    for system_id, system_name in zip(system_ids, system_names):
        chunks = [
            chunk.strip()
            for chunk in tree.xpath(
                f'(//div[contains(@id, "{system_id}") and @class="termDiv"]//text()[normalize-space()])'
            )
        ]
        # Every text chunk containing 'J:' anchors one annotation record.
        anchors = [i for i, chunk in enumerate(chunks) if 'J:' in chunk]
        if not anchors:
            # Nothing to tabulate for this system; indexing anchors[-1]
            # below would raise IndexError.
            continue

        # A record's comment text runs from two chunks after its anchor up
        # to two chunks before the next anchor; the last record takes the
        # remainder of the section.
        comments = [
            chunks[start + 2:stop - 2]
            for start, stop in zip(anchors, anchors[1:])
        ]
        comments.append(chunks[anchors[-1] + 2:])

        frames.append(pd.DataFrame({
            'model': model_id,
            'system': [system_name] * len(anchors),
            'features_jterms': [chunks[i] for i in anchors],
            # NOTE(review): the feature label is taken two chunks before the
            # anchor; for an anchor at position 0 or 1 the negative index
            # wraps to the end of the list, as it always has.
            'features': [chunks[i - 2] for i in anchors],
            'comments': comments,
        }))
    return frames


def get_multiple_mouse_models(mouse_list):
    """Scrape MGI genoview pages and tabulate their phenotype annotations.

    For every identifier in ``mouse_list`` the page
    ``http://www.informatics.jax.org/allele/genoview/<identifier>`` is
    downloaded, and the text of each ``mpSystemRow`` section's ``termDiv``
    elements is split into records around the chunks containing ``'J:'``.

    Args:
        mouse_list: Iterable of identifiers interpolated into the genoview URL.

    Returns:
        A DataFrame with the columns ``model``, ``system``,
        ``features_jterms``, ``features`` and ``comments`` (a list of text
        chunks), one row per ``'J:'`` chunk, re-indexed from 0. When nothing
        could be parsed, an empty DataFrame with those columns is returned.
    """
    # NOTE: the per-term sex glyph detection that used to run here was never
    # written to the output, so it has been dropped.
    columns = ['model', 'system', 'features_jterms', 'features', 'comments']
    frames = []

    for model in mouse_list:
        page = requests.get(f'http://www.informatics.jax.org/allele/genoview/{model}')
        tree = html.fromstring(page.content)
        frames.extend(_mouse_model_system_frames(tree, model))

    # pd.concat raises ValueError on an empty list.
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

def parse_first_table(request_id='MGI:6719082'):
    """Download a genoview page and reshape its summary table into a DataFrame.

    The table located under ``templateBodyInsert`` is read with pandas, its
    first row is discarded, and the remainder is transposed so that the
    first transposed row supplies the column names. Rows containing missing
    values are dropped, and three columns are split into lists.

    Args:
        request_id: Identifier interpolated into the genoview URL.

    Returns:
        The reshaped DataFrame, with ``'Allelic Composition'``,
        ``'Genetic Background'`` and ``'Find Mice'`` holding lists of strings.
    """
    def _split_alleles(value):
        return value.split(' ')

    def _split_background(value):
        return value.replace('involves:', '').split('*')

    def _split_mouse_lines(value):
        return value.split('Mouse lines carrying:')[1].split(';')

    response = requests.get(f'http://www.informatics.jax.org/allele/genoview/{request_id}')
    document = html.fromstring(response.content)
    table_node = document.xpath('//*[@id="templateBodyInsert"]/div[2]/div/div[2]/table')[0]

    stacked = pd.concat(pd.read_html(html.etree.tostring(table_node)))
    stacked = stacked.reset_index(drop=True)

    # Drop the first row, then flip so the former first column becomes the header row.
    flipped = stacked[1:].T
    flipped.columns = flipped.iloc[0]
    summary = flipped[1:].dropna()

    summary['Allelic Composition'] = summary['Allelic Composition'].apply(_split_alleles)
    summary['Genetic Background'] = summary['Genetic Background'].apply(_split_background)
    summary['Find Mice'] = summary['Find Mice'].apply(_split_mouse_lines)
    return summary
