In [None]:
import GEOparse
import pandas as pd
import numpy as np

In [None]:
import pylab as pl
import seaborn as sns
# Global plotting defaults: large figures with readable tick, axis-label and
# legend text.
pl.rcParams.update({
    'figure.figsize': (14, 10),
    'xtick.labelsize': 11,
    'ytick.labelsize': 12,
    'axes.labelsize': 23,
    'legend.fontsize': 20,
})
sns.set_style('ticks')
# Four colours taken from seaborn's "Set1" palette, kept as named plot colours.
c1, c2, c3, c4 = sns.color_palette("Set1", 4)

In [None]:
# GEO series accession analysed throughout this notebook.
GEO_ACCESSION = "GSE84422"
# Load the series object; every later cell reads from `gse`.
gse = GEOparse.get_GEO(geo=GEO_ACCESSION)

In [None]:
# Show the series title stored in the GEO metadata.
gse.metadata['title']

In [None]:
# Show the series summary stored in the GEO metadata.
gse.metadata['summary']

In [None]:
# Show the sample identifiers recorded in the series metadata.
gse.metadata['sample_id']

In [None]:
# Peek at a single sample (the first one in `gse.gsms`) to see which metadata
# fields and table columns are available. Nothing is printed when there are
# no samples.
first_sample = next(iter(gse.gsms.items()), None)
if first_sample is not None:
    gsm_name, gsm = first_sample
    print("Name: ", gsm_name)
    print("Metadata:")
    # Each metadata value is a list of strings; show it comma-separated.
    for field, entries in gsm.metadata.items():
        print(" - %s : %s" % (field, ", ".join(entries)))
    print("Table data:")
    print(gsm.table.head())

In [None]:
# Combine the 'VALUE' column of every sample into a single table.
# Later cells treat rows as probes (indexed by ID_REF) and columns as samples.
pivoted_samples = gse.pivot_samples('VALUE')

In [None]:
# Preview the first rows of the pivoted table.
pivoted_samples.head()

In [None]:
# Per-probe summary: the median of each row across all sample columns.
# NOTE: despite the "_average" suffix, this Series holds medians.
pivoted_samples_average = pivoted_samples.median(axis="columns")
print("Number of probes before filtering: ", pivoted_samples_average.size)

In [None]:
# Expression cut-off: the lower quartile (25th percentile) of the per-probe
# medians. Probes below this value are filtered out in the next cell.
lower_quartile = 0.25
expression_threshold = pivoted_samples_average.quantile(q=lower_quartile)

In [None]:
# Keep the probes whose median is at or above the threshold.
is_expressed = pivoted_samples_average >= expression_threshold
expressed_probes = pivoted_samples_average.loc[is_expressed].index.tolist()
print("Number of probes above threshold: ", len(expressed_probes))

In [None]:
# Restrict the pivoted table to the expressed probes.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0, and the
# already-computed `pivoted_samples` is reused instead of pivoting again.
samples = pivoted_samples.loc[expressed_probes]

In [None]:
# Preview the filtered table (expressed probes only).
samples.head()

In [None]:
# Show, for every sample, its title, source and neuropathological category.
phenotype_columns = [
    "title",
    "source_name_ch1",
    "characteristics_ch1.8.neuropathological category",
]
print(gse.phenotype_data[phenotype_columns])

In [None]:
# Phenotype column holding each sample's neuropathological category.
category_column = "characteristics_ch1.8.neuropathological category"

# One row per sample: "Experiment" is the sample's label in the phenotype
# table's index, "Type" is its neuropathological category. The frame is built
# column-wise in one step instead of row by row with iterrows() and a transpose.
experiments = pd.DataFrame({
    "Experiment": gse.phenotype_data.index.to_numpy(),
    "Type": gse.phenotype_data[category_column].to_numpy(),
})
print(experiments)

In [None]:
# List the categories present in the data, in sorted order.
for category, _ in experiments.groupby("Type"):
    print(category)

# Select the "definite AD" samples from the full table. Doing this outside
# the loop means the result does not depend on which category happens to sort
# last, and the variable is defined even when there are no groups.
# The name `transfection_name` is kept because the next cell reads it.
transfection_name = experiments[experiments["Type"] == "definite AD"]

In [None]:
# Keep only the columns belonging to the selected ("definite AD") samples.
ad_sample_ids = transfection_name["Experiment"].tolist()
AD_sample = samples.loc[:, ad_sample_ids]

In [None]:
# Probe -> Entrez gene mapping taken from the GPL570 platform table.
probe_to_gene = gse.gpls['GPL570'].table[["ID", "ENTREZ_GENE_ID"]]
# Attach the gene ID to each probe row; the inner join drops probes that have
# no matching row in the platform table.
AD_sample_annotated = pd.merge(
    AD_sample.reset_index(),
    probe_to_gene,
    how='inner',
    left_on='ID_REF',
    right_on="ID",
)


In [None]:
# Drop probes that have no gene annotation.
AD_sample_annotated = AD_sample_annotated.dropna(subset=["ENTREZ_GENE_ID"])
# Remove probes with more than one gene assigned ("///" separates multiple
# IDs). The separator is matched literally rather than as a regex, and values
# are cast to str so non-string entries cannot produce a NaN mask.
is_multi_gene = (
    AD_sample_annotated["ENTREZ_GENE_ID"].astype(str).str.contains("///", regex=False)
)
AD_sample_annotated = AD_sample_annotated[~is_multi_gene]
# For each gene, take the median over its probes for every sample column.
# numeric_only=True skips the string probe-ID columns (ID_REF, ID); without
# it, pandas >= 2.0 raises when asked for the median of strings.
AD_sample_annotated = AD_sample_annotated.groupby("ENTREZ_GENE_ID").median(numeric_only=True)

In [None]:
# Preview the first rows of the gene-level table.
AD_sample_annotated.head()

In [None]:
# Preview the last rows of the gene-level table.
AD_sample_annotated.tail()

In [None]:
# Number of rows and columns in the gene-level table.
AD_sample_annotated.shape

In [None]:
# Replace missing values with 1 and keep the result. fillna() returns a new
# frame, so without the assignment the table exported in the next cell would
# still contain NaN.
AD_sample_annotated = AD_sample_annotated.fillna(1)
# Show a preview rather than the whole table.
AD_sample_annotated.head()

In [None]:
# Write the gene-level table to a CSV file in the current working directory.
output_csv = 'AD_data_GEO.csv'
AD_sample_annotated.to_csv(output_csv)