This notebook contains code that can be used to download the raw single-cell Hi-C data.

The GEO source
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE130711

An easier alternative to running this whole notebook is to download the prepared data.txt directly from this Google Drive folder:
    https://drive.google.com/drive/folders/1SuzqQ_9dliAmTb-fGprFnN3aZrfWS-Fg

In [None]:
# import packages
from urllib.request import urlretrieve
import gzip
import time

In [None]:
# read in the metadata for the experiment
# (file names and GEO accession numbers for all 4238 samples)

chunk_size = 4096
# decompress each gzipped series matrix into a plain-text file, chunk by chunk
for gz_name, txt_name in (('GSE130711-GPL20301_series_matrix.txt.gz', 'matrix1.txt'),
                          ('GSE130711-GPL24676_series_matrix.txt.gz', 'matrix2.txt')):
    with gzip.open(gz_name, 'rb') as f_in, open(txt_name, 'wb') as f_out:
        # read() returns b'' at end of file, which ends the iteration
        for chunk in iter(lambda: f_in.read(chunk_size), b''):
            f_out.write(chunk)

# load the decompressed text files as lists of lines
with open("matrix1.txt", mode="r") as handle:
    matrix1 = list(handle)
with open("matrix2.txt", mode="r") as handle:
    matrix2 = list(handle)

In [None]:
# extract the GEO accession numbers and FTP links from both series matrices
def _row_values(row):
    """Strip the double quotes from a row and drop its first whitespace-separated token (the row key)."""
    return row.replace('"', '').split()[1:]

for row in matrix1:
    if row.startswith("!Sample_geo_accession"):
        geo1 = _row_values(row)
    elif row.startswith("!Sample_supplementary_file_1"):
        ftp1 = _row_values(row)

for row in matrix2:
    if row.startswith("!Sample_geo_accession"):
        geo2 = _row_values(row)
    elif row.startswith("!Sample_supplementary_file_1"):
        ftp2 = _row_values(row)

# combine the two platforms, matrix1 entries first
geo = [*geo1, *geo2]
ftp = [*ftp1, *ftp2]
# the two counts should agree (one link per accession number)
print(len(geo))
print(len(ftp))

In [None]:
# download the files and then use 7zip to open after
zipfilenames = ['zipped/' + geos.strip() + '.gz' for geos in geo]

def download_zip(min, max):
    """Download the samples with list indices ``min`` (inclusive) to ``max`` (exclusive).

    Each link ``ftp[i]`` is saved to ``zipfilenames[i]``. A failed download is
    reported and skipped so that one bad link does not abort the whole batch.

    Parameters
    ----------
    min : int
        Index of the first sample to download.
    max : int
        Index one past the last sample to download.

    Note: the parameter names shadow the builtins ``min``/``max`` inside this
    function; they are kept so existing calls continue to work.
    """
    import os  # local import: `os` is only imported in a later cell of the notebook

    # urlretrieve does not create missing directories
    os.makedirs('zipped', exist_ok=True)

    print("Downloading zip file...")

    failed = []
    for i in range(min,max):
        try:
            print("  %s of %s" % (i+1, max))
            urlretrieve(ftp[i],zipfilenames[i])
        except OSError as err:
            # urllib's URLError is a subclass of OSError
            print("  download %s failed: %s" % (i+1, err))
            failed.append(i)

    if failed:
        print("Failed indices: %s" % failed)

In [None]:
# download samples 1-500 (list indices 0-499)
download_zip(0,500)

In [None]:
# download samples 501-1000 (list indices 500-999)
download_zip(500,1000)

In [None]:
# download samples 1001-1500 (list indices 1000-1499)
download_zip(1000,1500)

In [None]:
# download samples 1501-2000 (list indices 1500-1999)
download_zip(1500,2000)

In [None]:
# download samples 2001-2500 (list indices 2000-2499)
download_zip(2000,2500)

In [None]:
# download samples 2501-3000 (list indices 2500-2999)
download_zip(2500,3000)

In [None]:
# download samples 3001-3500 (list indices 3000-3499)
download_zip(3000,3500)

In [None]:
# download samples 3501-4000 (list indices 3500-3999)
download_zip(3500,4000)

In [None]:
# download samples 4001-4238 (list indices 4000-4237)
download_zip(4000,4238)

In [None]:
import shutil
import os

In [None]:
# map each GEO accession number to a file name derived from its FTP link
geodict = {}

for idx, accession in enumerate(geo):
    url = ftp[idx]
    prefix = accession + '_'
    # keep what follows "<accession>_" in the link, minus the last 3 characters
    # (presumably the ".gz" extension - confirm against the links)
    start = url.index(prefix) + len(prefix)
    geodict[accession] = url[start:-3]

In [None]:
# move all files into the same directory and rename
# Each non-empty line of directories2.txt is used as a folder name under samples/.
directories2 = []
with open("directories2.txt", mode="r") as file:
    for line in file:
        # Use the line yielded by the loop. Calling file.readline() here consumed
        # an extra line per iteration, so every other entry was silently dropped.
        directories2.append(line.strip())
print(len(directories2))

counter = 0
for g in directories2:
    if g == '':
        # blank line in the listing - nothing to move
        continue
    sample_dir = 'samples/' + g
    # an empty folder means the file was already moved (or never extracted)
    if len(os.listdir(sample_dir)) > 0:
        oldfile = sample_dir + '/' + geodict[g]
        newfile = 'samples/' + g + '.txt'
        shutil.move(oldfile, newfile)
        counter += 1
# number of files moved by this cell
print(counter)

In [None]:
# move all files into the same directory and rename
# Each non-empty line of directories3.txt is used as a folder name under samples/.
with open("directories3.txt", mode="r") as file:
    directories3 = [line.replace('\n', '').strip() for line in file]
print(len(directories3))

counter = 0
for g in directories3:
    if g == '':
        # blank entry - nothing to move
        continue
    sample_dir = 'samples/' + g
    if len(os.listdir(sample_dir)) > 0:
        # move samples/<g>/<file name from geodict> to samples/<g>.txt
        shutil.move(sample_dir + '/' + geodict[g], sample_dir + '.txt')
        counter += 1
# number of files moved by this cell
print(counter)

In [None]:
# read all the txt files and ensure there are no data issues
# should contain:
# col1: cell_id
# col2: chrom1
# col3: chrom1 position
# col4: chrom2
# col5: chrom2 position
chrom = ['chr' + str(n) for n in range(1, 23)]  # chr1 ... chr22

geo_file_missing = []
geo_file_present = []

for i in range(len(geo)):
    filename = "samples/" + geo[i].strip() + '.txt'
    print(str(i+1) + " of " + str(len(geo)))
    # last parsed line; reset per file so the error output below never shows
    # a line from a previous file (or fails because `c` is undefined)
    c = None
    try:
        with open(filename, mode="r") as file:
            ind = 0  # number of valid contact lines in this file
            for line in file:
                c = line.split()
                if c[1] in chrom and c[3] in chrom and c[2].isdigit() and c[4].isdigit():
                    ind = ind+1
        if ind > 0:
            geo_file_present.append(geo[i])
        else:
            geo_file_missing.append(geo[i])
    except OSError as err:
        # file could not be opened or read
        geo_file_missing.append(geo[i])
        print(err)
    except Exception as err:
        # malformed content (e.g. a line with fewer than 5 columns):
        # flag the whole file and show the offending line
        geo_file_missing.append(geo[i])
        print(err)
        print(c)

# `paste` is an R function and does not exist in Python; print takes several arguments directly
print("Missing:", len(geo_file_missing))
print("Not Missing:", len(geo_file_present))

In [None]:
# use the same code to concatenate all the files into one big "data.txt" file
# should contain:
# col1: cell_id
# col2: chrom1
# col3: chrom1 position
# col4: chrom2
# col5: chrom2 position
# col6: count
chrom = ['chr' + str(n) for n in range(1, 23)]  # chr1 ... chr22
chrom_set = set(chrom)  # constant-time membership test in the inner loop

geo_file_missing = []
geo_lines = []   # number of lines written per entry of `geo` (0 for missing files)
# NOTE: geo2 is re-bound here. From this cell on it holds the accession numbers of
# the samples written to data.txt, in cell_id order (geo2[cell_id] is the sample).
geo2 = []

with open('data.txt', 'w') as f_out:
    out = 'cell_id\tchrom1\tpos1\tchrom2\tpos2\tcount\n'
    f_out.write(out)
    cell_id = -1
    for i in range(len(geo)):
        print(str(i+1) + " of " + str(len(geo)))
        filename = "samples/" + geo[i].strip() + '.txt'
        # Collect the file's rows first and write them only after the whole file
        # was read. A file that fails part-way then contributes nothing, so the
        # cell ids in data.txt always line up with the entries of geo2.
        rows = []
        try:
            with open(filename, mode="r") as file:
                for line in file:
                    c = line.split()
                    if len(c) < 5:
                        # skip lines that are too short to be a contact record
                        continue
                    if c[1] in chrom_set and c[3] in chrom_set and c[2].isdigit() and c[4].isdigit():
                        # pos2 is column 5 (c[4]); the previous code wrote c[2] (pos1) twice
                        rows.append(c[1] + '\t' + c[2] + '\t' + c[3] + '\t' + c[4] + '\t1' + '\n')
        except Exception as err:
            # unreadable or missing file: record it and continue with the next sample
            geo_file_missing.append(geo[i])
            print(err)
            geo_lines.append(0)
            continue

        cell_id = len(geo2)
        f_out.writelines(str(cell_id) + '\t' + row for row in rows)
        geo2.append(geo[i])
        geo_lines.append(len(rows))

print("Missing:",str(len(geo_file_missing)))
print("Included:",str(len(geo2)))
print("id is n-1? ", str(cell_id))

In [None]:
import pandas as pd
import pickle
# Load the source label table from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a file from a trusted source.
label_info_src = pd.read_pickle("label_info_src.pickle")
# show which fields the loaded object provides
label_info_src.keys()

In [None]:
# pull the individual label fields out of the loaded label table
(batch,
 cluster_major,
 cluster_minor,
 cluster_major_lab,
 cluster_minor_lab,
 cell_name) = (label_info_src[key] for key in ('batch id',
                                               'major',
                                               'minor',
                                               'cluster label',
                                               'cluster label minor',
                                               "cell_name_higashi"))

In [None]:
# build a lookup from GEO accession number to (cleaned) sample title
# substrings that are removed from the "!Sample_title" row, in this order
rep = ['"',"_snm3Cseq_hs","_BA10_UMB5577_3_UMB5577",
       "_BA10_UMB5577_1_UMB5577","_BA10_UMB5580_1_UMB5580",
      "_BA10_UMB5577_5_UMB5577","_BA10_UMB5580_3_UMB5580",
      "_BA10_UMB5580_5_UMB5580"]

def _clean_titles(row):
    """Remove every substring in `rep` from the row, then drop its first token (the row key)."""
    for token in rep:
        row = row.replace(token, "")
    return row.split()[1:]

for row in matrix1:
    if row.startswith("!Sample_title"):
        title1 = _clean_titles(row)

for row in matrix2:
    if row.startswith("!Sample_title"):
        title2 = _clean_titles(row)

title = title1 + title2

# report every title that has no match among the known cell names
# NOTE(review): assumes cell_name is a list-like of names; for a pandas Series
# `in` would test the index instead of the values - confirm
for name in title:
    if name not in cell_name:
        print(name)

geo_to_title = dict(zip(geo,title))

In [None]:
# for every sample kept in geo2, find the position of its title in cell_name ...
positions = [cell_name.index(geo_to_title[accession]) for accession in geo2]

# ... and collect the label fields at those positions, in geo2 order
batch_final = [batch[pos] for pos in positions]
cluster_major_final = [cluster_major[pos] for pos in positions]
cluster_minor_final = [cluster_minor[pos] for pos in positions]
cluster_major_lab_final = [cluster_major_lab[pos] for pos in positions]
cluster_minor_lab_final = [cluster_minor_lab[pos] for pos in positions]
title_final = [cell_name[pos] for pos in positions]

# number of samples in the metadata vs. number of samples with labels collected
print(len(geo))
print(len(batch_final))

In [None]:
# create file with cluster label information
# every list has one entry per sample in geo2, in the same order
label_info = {'batch id': batch_final,
              'major': cluster_major_final,
              'minor': cluster_minor_final,
              'major label':cluster_major_lab_final,
              'minor label':cluster_minor_lab_final,
              'title':title_final,
              'geo':geo2
             }

# use a context manager so the file is flushed and closed once the dump is done
with open("label_info.pickle", "wb") as f_out:
    pickle.dump(label_info, f_out)