# Title

Introduction...

## 1. Import libraries

More talk...

In [19]:
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
import os
import shutil

## 2. Image pre-processing

In this section, we ...

In [25]:
# create temp folder to store the images
os.mkdir("data/plots_pdf/temp")

In [54]:
# convert pdf files to images 

directory = "data/plots_pdf"
destination = "data/plots_pdf/temp"

for filename in os.listdir(directory):
    if filename.endswith(".pdf"): 
        
        # copy to temp folder
        shutil.copy(directory + '/' + filename, destination + '/' + filename)
        
        # convert pdf file to 23 images
        images = convert_from_path(directory + '/' + filename, 500)
        for i, image in enumerate(images):
            fname = destination + '/' + filename[:-4] + str(i) + '.png'
            image.save(fname, "PNG")            
            
        # create a new blank image
        new_image = Image.new('RGB',(4*1500, 6*500), (250,250,250))
        
        # create counters for the new image grid
        row = 0
        col = 0

        # read the 23 images, resize, and merge them
        image_arr = []*23
        for i in range(23):
            image_arr.append(Image.open(destination + '/' + filename[:-4] + str(i) + '.png'))
            image_arr[i] = image_arr[i].resize((1500, 500))
            new_image.paste(image_arr[i], (row*1500, col))
            row += 1
            if row == 4:
                row = 0
                col += 500
    
        new_image.save("images/" + filename[:-4] + "_merged_image.png","PNG")
        
        # remove from temp folder
        for i in range(23):
            os.remove(destination + '/' + filename[:-4] + str(i) + '.png')
        
        os.remove(destination + '/' + filename)

## 3. Exploratory Data Analysis

In this section, ...

In [83]:
# load chromothripsis dataset
df = pd.read_csv('data/chromothripsis.txt', sep='\t')
df.head()

Unnamed: 0,sample,pcawg,chrom_involved
1,DO221123,,
2,DO221124,,
3,DO221127,,
4,DO221129,,
5,DO221548,,


In [84]:
df.shape

(255, 3)

Compare between the number of rows in the dataset and the number of images.

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255 entries, 1 to 255
Data columns (total 3 columns):
sample            255 non-null object
pcawg             35 non-null object
chrom_involved    34 non-null object
dtypes: object(3)
memory usage: 8.0+ KB


In [86]:
df.describe()

Unnamed: 0,sample,pcawg,chrom_involved
count,255,35,34
unique,233,1,31
top,DO52677,chromothripsis,19_19
freq,4,35,2


We have 36 chromothripsis cases with one case missing the chrom_involved.
233 unique patient.

I didn't do missing value analysis, because I assume the NaN values represent not sick patients. I just replaced the NaN values with zeros.

In [87]:
df = df.fillna(0)

In [88]:
df['pcawg'] = df['pcawg'].replace('chromothripsis', 1)

In [89]:
df.head(100)

Unnamed: 0,sample,pcawg,chrom_involved
1,DO221123,0,0
2,DO221124,0,0
3,DO221127,0,0
4,DO221129,0,0
5,DO221548,0,0
6,DO222303,0,0
7,DO222304,0,0
8,DO222306,0,0
9,DO222307,0,0
10,DO222308,0,0


In [118]:
# load images dataset
images = pd.read_csv('data/images.csv')
images.head()

Unnamed: 0,sample,Primary_tumour
0,DO221123,lymph node
1,DO221124,lymph node
2,DO221127,lymph node
3,DO221129,lymph node
4,DO221548,blood derived (bone marrow)


In [119]:
images.shape

(244, 2)

In [120]:
images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 2 columns):
sample            244 non-null object
Primary_tumour    244 non-null object
dtypes: object(2)
memory usage: 3.9+ KB


In [121]:
images.describe()

Unnamed: 0,sample,Primary_tumour
count,244,244
unique,233,4
top,DO52752,blood derived (peripheral blood)
freq,2,122


In [122]:
pd.get_dummies(images['Primary_tumour'])

Unnamed: 0,blood derived (bone marrow),blood derived (peripheral blood),lymph node,solid tissue
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,1,0,0,0
5,0,0,0,1
6,0,0,0,1
7,0,0,0,1
8,0,0,0,1
9,0,0,0,1


Change the values to get 4 dummy variables instead of 3.

In [123]:
images = pd.concat([images.drop('Primary_tumour', axis=1), pd.get_dummies(images['Primary_tumour'], prefix='Primary_tumour', prefix_sep='_', drop_first=False)], axis=1)

In [124]:
images.head()

Unnamed: 0,sample,Primary_tumour_blood derived (bone marrow),Primary_tumour_blood derived (peripheral blood),Primary_tumour_lymph node,Primary_tumour_solid tissue
0,DO221123,0,0,1,0
1,DO221124,0,0,1,0
2,DO221127,0,0,1,0
3,DO221129,0,0,1,0
4,DO221548,1,0,0,0


In [125]:
# combine 2 datasets
df_merge = df.merge(images, how = 'outer', on=['sample'])

In [126]:
df_merge.head()

Unnamed: 0,sample,pcawg,chrom_involved,Primary_tumour_blood derived (bone marrow),Primary_tumour_blood derived (peripheral blood),Primary_tumour_lymph node,Primary_tumour_solid tissue
0,DO221123,0,0,0,0,1,0
1,DO221124,0,0,0,0,1,0
2,DO221127,0,0,0,0,1,0
3,DO221129,0,0,0,0,1,0
4,DO221548,0,0,1,0,0,0


In [127]:
df_merge.shape

(277, 7)

In [128]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 277 entries, 0 to 276
Data columns (total 7 columns):
sample                                             277 non-null object
pcawg                                              277 non-null int64
chrom_involved                                     277 non-null object
Primary_tumour_blood derived (bone marrow)         277 non-null uint8
Primary_tumour_blood derived (peripheral blood)    277 non-null uint8
Primary_tumour_lymph node                          277 non-null uint8
Primary_tumour_solid tissue                        277 non-null uint8
dtypes: int64(1), object(2), uint8(4)
memory usage: 9.7+ KB


In [129]:
df_merge.describe()

Unnamed: 0,pcawg,Primary_tumour_blood derived (bone marrow),Primary_tumour_blood derived (peripheral blood),Primary_tumour_lymph node,Primary_tumour_solid tissue
count,277.0,277.0,277.0,277.0,277.0
mean,0.133574,0.119134,0.480144,0.379061,0.021661
std,0.34081,0.324532,0.50051,0.486032,0.145836
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [131]:
df_merge['sample'].describe()

count         277
unique        233
top       DO52729
freq            4
Name: sample, dtype: object

In [132]:
image['sample'].describe()

TypeError: 'PpmImageFile' object is not subscriptable