# Webscraper <br>
Retrieves all linked PXD-entries of a certain cell line in Cellosaurus

In [2]:
#python webscrawler for Cellosaurus
import os, re
import pandas as pd
from bs4 import BeautifulSoup
import requests

base_url = 'https://web.expasy.org/cgi-bin/cellosaurus/search?input='

In [3]:
os.chdir(r"C:/Users/samva/Master_thesis_folder/Misidentified cell lines")
df = pd.read_csv("annotation_excel_use2.csv", sep = ";")
df.dropna()
cell_lines = list(df.loc[df.valid == "yes"].cell_line.unique())
cell_lines = [x.split("/")[0].split()[0] for x in cell_lines]
cell_lines = list(set(cell_lines))
cell_lines[0:5]

['MDA-MB-231', 'cerebral', 'BeWo', 'N2a', 'iPSC']

In [4]:
#returns list of links of the query
def get_links(base, cell_line):
    url = base_url + f"%22{cell_line}%22"
    page = requests.get(url)
    bs = BeautifulSoup(page.text, "html.parser")
    links = bs.find_all("a", href = re.compile("^(/cellosaurus/)"))
    return [x["href"] for x in links]

In [19]:
#Extracts all info needed from CVCL-page and returns as [cell_name, PXD_list, parent_link]
def get_info(base, cvcl_link):
    PXD_list, parent = False, False
    
    url = "https://web.expasy.org" + cvcl_link
    page = requests.get(url)
    bs = BeautifulSoup(page.text, "html.parser")
    
    if bs.table == None:
        return False
        
    for i in bs.table.find_all("tr"):
        if i.th != None:
            if i.th.text == "Cell line name":
                cell_name = i.td.text
            
            if i.th.text == "Species of origin":
                if "Homo sapiens" not in i.td.text:
                    print("Not human")
                    return False
                    
            if i.th.text == "Hierarchy":
                if i.td.text.startswith("Parent:"):
                    parent = i.td

            if i.th.text == "Proteomic databases":
                PXD_list = [x.text for x in i.find_all("a")]

    print("Name:", cell_name)     
    if PXD_list:
        print(f"\t{len(PXD_list)} PXD entries for {cell_name}")
    if parent:
        p_text, p_link = parent.text, parent.a['href']
        print(f"\tWARNING: {p_text}\n\tlink: {p_link}")
        parent = p_link
        
    return [cell_name, PXD_list, parent]

In [6]:
def dataframe_converter(list_of_lists):
    cell_names, PXD_list, parent = [x[0] for x in list_of_lists], [x[1] for x in list_of_lists], [x[2] for x in list_of_lists]
    return pd.DataFrame({"Cell_name": cell_names, "PXD_list": PXD_list, "parent": parent})

In [7]:
list_of_links = []
for i in cell_lines:
    links_of_query = get_links(base_url, i)
    list_of_links.append(links_of_query)
    print(f"{i}: {len(links_of_query)}")
print("Done.")

unique_links = []
reduced = 0
for i in list_of_links:
    for x in i:
        if x not in unique_links:
            unique_links.append(x)
        else:
            reduced += 1
print(f"{reduced} omitted.\n{len(unique_links)} extracted.")

MDA-MB-231: 70
cerebral: 158
BeWo: 6
N2a: 9
iPSC: 3622
SKBR3: 15
HDF: 19
JURKAT: 118
KBM7: 2
A2780: 45
HT29: 43
THP1: 54
HTR8: 2
SH-SY5Y: 27
HL60: 26
CAF: 9
HepG2: 67
ZR751: 1
HEK293: 750
hNPC: 1
TCam-2: 4
HuH7: 18
HeLa: 1779
HEK293F: 6
SK-N-BE2: 1
MCF7: 86
neuron: 1171
MDA-MB-231-CSC: 1
LN308: 1
KO52: 1
Done.
363 omitted.
7749 extracted.


In [8]:
unique_links[0:10]

['/cellosaurus/CVCL_EQ20',
 '/cellosaurus/CVCL_5T77',
 '/cellosaurus/CVCL_A9BP',
 '/cellosaurus/CVCL_A9BQ',
 '/cellosaurus/CVCL_YJ86',
 '/cellosaurus/CVCL_0031',
 '/cellosaurus/CVCL_DP48',
 '/cellosaurus/CVCL_0062',
 '/cellosaurus/CVCL_4Y38',
 '/cellosaurus/CVCL_4Y39']

In [28]:
info_part1 = []
for count, cvcl in enumerate(unique_links[:5024]):
    print(f"link: {count}")
    info_element = get_info(base_url, cvcl)
    if info_element:
        info_part1.append(info_element)

info_part2 = []
for count, cvcl in enumerate(unique_links[5024:]):
    print(f"link: {count}")
    info_element = get_info(base_url, cvcl)
    if info_element:
        info_part2.append(info_element)

link: 0
link: 1
Name: JKT-1
link: 2
Name: JKT-HM
	link: /cellosaurus/CVCL_T011
link: 3
Name: SEM-1
link: 4
Name: TCam-2
link: 5
Name: B76.1/Huh7
	link: /cellosaurus/CVCL_U442
link: 6
Name: B76/Huh7
CVCL_U443 (B76.1/Huh7)

	link: /cellosaurus/CVCL_0336
link: 7
Name: HuH-7-END
	link: /cellosaurus/CVCL_0336
link: 8
Name: Huh-7-Lunet
CVCL_CW91 (Huh-7-Lunet/Con1)

	link: /cellosaurus/CVCL_0336
link: 9
Name: Huh-7-Lunet/Con1
	link: /cellosaurus/CVCL_U459
link: 10
Name: Huh-7.5
	1 PXD entries for Huh-7.5
CVCL_A0TI (Huh-7.5 Tet-On)CVCL_E049 (Huh-7.5.1)

	link: /cellosaurus/CVCL_0336
link: 11
Name: Huh-7.5.1
CVCL_YU21 (Huh7.5.1-5)CVCL_YU20 (Huh7.5.1-8)

	link: /cellosaurus/CVCL_7927
link: 12
Name: Huh7 IFITM2-/-
	link: /cellosaurus/CVCL_0336
link: 13
Name: HUH7-ins
CVCL_HA63 (Melligen)

	link: /cellosaurus/CVCL_0336
link: 14
Name: Huh7.5.1-5
	link: /cellosaurus/CVCL_E049
link: 15
Name: Huh7.5.1-8
	link: /cellosaurus/CVCL_E049
link: 16
Name: Melligen
	link: /cellosaurus/CVCL_HA62
link: 17
Name: 

In [41]:
info = info_part1 + info_part2
df = dataframe_converter(info)
df.to_csv("cellosaurus_webscraping.csv", sep = ";",index = False)
df.head()

Unnamed: 0,Cell_name,PXD_list,parent
0,2LMP,False,/cellosaurus/CVCL_0062
1,B02/GFP.2,False,/cellosaurus/CVCL_5T76
2,Ecad-231-7,False,/cellosaurus/CVCL_0062
3,Ecad-231-9,False,/cellosaurus/CVCL_0062
4,LINTERNA MDA-MB-231,False,/cellosaurus/CVCL_0062


In [40]:
projects = []
PXDs = list(df.loc[df.PXD_list != False].PXD_list)
for i in PXDs:
    for x in i:
        projects.append(x)
len(projects)

407

In [47]:
len(df.parent.unique())

541

The resulting PXD-projects were manually checked for eligibility for this project. <br>
Criteria include: (i) LFQ, (ii) no enrichment procedure, (iii) HCD-fragentation.

The resulting file is located in the *Metadata* folder and is titled *cellosaurus_webscraping_filtered_update.csv*