In [2]:
import pandas as ps
import glob


In [62]:
annos = ps.read_csv("HG-U133_Plus_2.na36.annot.tabs.csv", sep='\t', dtype='str', skiprows=25)

In [63]:
annos['Alignments']

0            chr6:30856165-30867931 (+) // 95.63 // p21.33
1            chr7:73646002-73668732 (-) // 70.86 // q11.23
2        chr1:161494448-161496380 (+) // 99.59 // q23.3...
3              chr2:113974939-114036488 (-) // 98.3 // q13
4             chr6:42140941-42147792 (+) // 90.58 // p21.1
                               ...                        
54670                                                  ---
54671                                                  ---
54672                                                  ---
54673                                                  ---
54674                                                  ---
Name: Alignments, Length: 54675, dtype: object

In [73]:
probByRegion = annos.loc[:,['Probe Set ID','Genome Version', 'Alignments']]
probByRegion['strand'] = probByRegion['Alignments'].str.extract('\\((.)\\)', expand=True)
probByRegion.loc[:, 'Alignments'].replace('^([^ ]+).+', '\\1', regex=True, inplace=True)
probByRegion['chromo'] = probByRegion['Alignments'].str.extract('^(.+):', expand=True)
probByRegion['start'] = probByRegion['Alignments'].str.extract('^.+:(.+)-', expand=True)
probByRegion['end'] = probByRegion['Alignments'].str.extract('^.+:.+-(.+)$', expand=True)
probByRegion['assembly'] = probByRegion['Genome Version'].str.extract('(GRCh..)', expand=True)


In [74]:
probByRegion['strand'].unique()


array(['+', '-', nan], dtype=object)

In [75]:
probByRegion.head

<bound method NDFrame.head of           Probe Set ID                                     Genome Version  \
0            1007_s_at  February 2009 (Genome Reference Consortium GRC...   
1              1053_at  February 2009 (Genome Reference Consortium GRC...   
2               117_at  February 2009 (Genome Reference Consortium GRC...   
3               121_at  February 2009 (Genome Reference Consortium GRC...   
4            1255_g_at  February 2009 (Genome Reference Consortium GRC...   
...                ...                                                ...   
54670   AFFX-ThrX-5_at  February 2009 (Genome Reference Consortium GRC...   
54671   AFFX-ThrX-M_at  February 2009 (Genome Reference Consortium GRC...   
54672  AFFX-TrpnX-3_at  February 2009 (Genome Reference Consortium GRC...   
54673  AFFX-TrpnX-5_at  February 2009 (Genome Reference Consortium GRC...   
54674  AFFX-TrpnX-M_at  February 2009 (Genome Reference Consortium GRC...   

                     Alignments strand chromo

In [76]:
def lookupGenomeCoordinate(row):
    matchingIndex = probByRegion['Probe Set ID'] == row['affy_hgea_probe_id']
    matchingRow = probByRegion[matchingIndex]
    if len(matchingRow) is not 1:
        print("row error! len is {}".format(len(matchingRow)))
        print(row)
    else:
        return matchingRow['chromo'].iloc[0], matchingRow['start'].iloc[0], matchingRow['end'].iloc[0], matchingRow['assembly'].iloc[0], matchingRow['strand'].iloc[0]

In [78]:
example = ps.DataFrame()
for filename in glob.glob("/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/*.tsv"):
    print(filename)
    expressDf = ps.read_csv(filename,sep='\t')
    probeLocations = expressDf.apply(lambda x: lookupGenomeCoordinate(x), result_type='expand', axis=1) 
    (expressDf
         .assign(chromosome=probeLocations[0])
         .assign(seq_start_position=probeLocations[1])
         .assign(seq_end_position=probeLocations[2])
         .assign(genome_assembly=probeLocations[3])
         .assign(strand=probeLocations[4])
         .to_csv("{}_updated.tsv".format(filename), sep='\t', index=False))
     
    


/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1783.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2498.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1183.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2234.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-3079.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1861.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expressio

/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2770.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-605.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-529.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-923.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2395.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1791.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_C

/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2273.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-276.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2436.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-274.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1788.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2938.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_

/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-586.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1848.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1982.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-698.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1118.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2121.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_

/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2775.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-174.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2264.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1034.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1998.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2499.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression

/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-117.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-592.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1066.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2046.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-3011.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1649.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_

/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2248.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1872.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2036.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-677.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2175.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1176.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression

/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1859.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-672.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2172.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-2053.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1114.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression_CRL-1829.tsv
/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/CRL_expression

In [32]:
for filename in glob.glob("/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data_quarantine/UPDOG/CRL/expression/microarray/*_updated.tsv"):
    expressDf = ps.DataFrame()
    rootName = filename.replace('_updated.tsv', "")
    expressDf = (ps.read_csv(filename,sep='\t')
                .iloc[: ,0:24]
                .rename(columns={'Platform' : 'platform'})
                 )
    expressDf.insert(22, 'z_score', '')
    expressDf.to_csv(rootName, sep='\t', index=False)
    
    
    

In [74]:
for filename in glob.glob("/home/afollette/Finder_Data_Repositories/pdxfinder-data-release-head/data/UPDOG/CRL/expression/microarray/*tsv"):
    expressDf = ps.DataFrame()
    expressDf = (ps.read_csv(filename,sep='\t')
                 .rename(columns={'hgnc_symbol' : 'symbol'})
                 )
    withoutHGNC = expressDf[expressDf['symbol'].isna()].index
    expressDf.drop(withoutHGNC,inplace=True)
    expressDf.to_csv(filename, sep='\t', index=False)
   
    

In [175]:
def aggregateAllHUGOsymbols(row):
    chromo = row['chr']
    lowerBound = row['start']
    upperBound = row['end']
    chromoMatch = symbolLookup['chr'] == chromo
    lowerLimit = symbolLookup['start'] >= lowerBound
    upperLimit = symbolLookup['end'] <= upperBound
    regionMatches = symbolLookup[chromoMatch & lowerLimit & upperLimit]
    if len(regionMatches['symbol'].values) > 0:
        row['symbol'] = list(set(regionMatches['symbol'].values))
        ps.DataFrame(row).transpose().to_csv('results.tsv' ,mode='a', header=False, sep='\t', index=False)

In [18]:
results = probByRegion.apply(lambda x: aggregateAllHUGOsymbols(x), axis=1)