In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nibabel as nib
import pydicom
from tqdm.notebook import tqdm_notebook
import datetime
from datetime import datetime


from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
def dicom2numpy(dicom_path):
    """Read a DICOM file and return its pixels together with the dataset.

    Parameters
    ----------
    dicom_path : str or path-like
        Location of the DICOM file to read.

    Returns
    -------
    tuple
        ``(pixels, photometric_interpretation, dataset)`` where ``pixels`` is
        a NumPy array built from ``dataset.pixel_array``,
        ``photometric_interpretation`` is the dataset's
        ``PhotometricInterpretation`` attribute and ``dataset`` is the object
        returned by pydicom.

    Raises
    ------
    AttributeError
        If the dataset has no ``PhotometricInterpretation`` attribute.
    """
    # `pydicom.read_file` is a deprecated alias that was removed in pydicom 3.0;
    # `dcmread` is the supported reader and takes the same path argument.
    dicom_file = pydicom.dcmread(dicom_path)
    return np.array(dicom_file.pixel_array), dicom_file.PhotometricInterpretation, dicom_file


# Statistical Analysis (Only DICOM Images)

## Negatives

In [None]:
import os


def find_dicom_files(root_dir, extension=".dcm"):
    """Return the sorted paths of all files under ``root_dir`` ending in ``extension``.

    Parameters
    ----------
    root_dir : str
        Folder that is walked recursively.
    extension : str, optional
        File suffix to keep; compared case-insensitively. Defaults to ".dcm".

    Returns
    -------
    list of str
        Full paths, sorted so the order is the same on every run
        (``os.walk`` itself yields entries in arbitrary order).
    """
    suffix = extension.lower()
    return sorted(
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        # Match on the suffix: a substring test would also accept names such
        # as "scan.dcm.bak".
        if name.lower().endswith(suffix)
    )


# Root folder of the COVID-negative studies
thisdir = '/media/ia/DATA/COVID19/Negativas XNAT/p0032021/'
Files = find_dicom_files(thisdir)
filenames = Files

In [None]:
# Number of folders directly under the negatives root
negatives_root = '/media/ia/DATA/COVID19/Negativas XNAT/p0032021/'
top_level_folders = next(os.walk(negatives_root))[1]
print(len(top_level_folders))

In [None]:
# Preview the first few paths only; displaying the whole list floods the output
filenames[:5]

In [None]:
len(filenames)  # Number of DICOM file paths collected by the directory walk

In [None]:

# Gather the per-image metadata behind the statistics of this section.
# Skipping rules: a file whose equipment tags cannot be read is left out of
# the age, date and sex lists; a file without a usable birth date is left out
# of the date and sex lists as well.
from datetime import datetime  # the class (not the module) provides strptime

# Errors expected from incomplete headers: a missing tag (AttributeError) or an
# empty / malformed value handed to strptime (TypeError, ValueError). Anything
# else, including KeyboardInterrupt, is allowed to propagate instead of being
# silently swallowed.
HEADER_ERRORS = (AttributeError, TypeError, ValueError)

dates=[]
Sex=[]
Ages=[]
shapes=[]
Manufacturers=[]
Models=[]
Man_1_filenames=[]
Man_2_filenames=[]
for f in tqdm_notebook(filenames):
    # The photometric interpretation is not needed here; a named placeholder
    # avoids overwriting IPython's `_` (last output) variable.
    img, _photometric, dicom = dicom2numpy(f)
    datetime_obj = datetime.strptime(dicom.StudyDate, "%Y%m%d")
    shapes.append(img.shape)
    try:
        Manufacturers.append(dicom.Manufacturer) # Brand of X-ray equipment
        Models.append(dicom.ManufacturerModelName)

        # Keep the file paths of the two manufacturers singled out here
        if dicom.Manufacturer=='GE Healthcare':
            Man_1_filenames.append(f)
        elif dicom.Manufacturer=='Agfa':
            Man_2_filenames.append(f)
    except HEADER_ERRORS:
        continue

    try:
        datetime_birth = datetime.strptime(dicom.PatientBirthDate, "%Y%m%d")
    except HEADER_ERRORS:
        continue

    # Age in whole 365-day years; non-positive ages are not recorded
    age_years = int((datetime_obj - datetime_birth).days / 365)
    if age_years > 0:
        Ages.append(age_years)

    dates.append(datetime_obj.date()) # Study dates
    Sex.append(dicom.PatientSex) # Patient sex
    

In [None]:
min(dates)  # Earliest study date among the entries collected in `dates`

In [None]:
max(dates)  # Latest study date among the entries collected in `dates`

### Manufacturer

In [None]:
# Manufacturer is a categorical label, so count each distinct value and draw
# one bar per manufacturer; a 10-bin histogram does not line up with the
# categories. The x-axis title previously read 'Years', copied from the
# study-date plot.
manufacturer_counts = pd.DataFrame(Manufacturers)[0].value_counts()
ax = manufacturer_counts.plot.bar(fontsize=12, rot=0)

ax.grid(axis='y')
ax.set_xlabel('Manufacturer')
ax.set_ylabel('Count')

plt.show()

In [None]:
# Number of images recorded for each distinct Manufacturer value
pd.DataFrame(Manufacturers)[0].value_counts()

### Age

In [None]:
# Number of images recorded for each age value (whole years)
pd.DataFrame(Ages)[0].value_counts()

In [None]:
# Histogram of patient ages for the negative set, saved as a PDF
fig, ax = plt.subplots()

ax.hist(pd.DataFrame(Ages)[0])

ax.grid(axis='y')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Age Distribution of collected Images')
fig.savefig("/media/ia/DATA/COVID19/images_paper/Age_COVID_Negative.pdf",bbox_inches='tight')
plt.show()


In [None]:
# Count ages per decade: edges 0, 10, ..., 100
decade_edges = list(range(0, 101, 10))
b1, e1 = np.histogram(Ages, bins=decade_edges)

In [None]:
# Per-decade counts returned by np.histogram in the previous cell
b1

In [None]:
# Bin edges returned by np.histogram in the previous cell
e1

### Study Date

In [None]:
# Number of images recorded for each study date
pd.DataFrame(dates)[0].value_counts()

In [None]:
# 10-bin histogram of the study dates, saved as a PDF
study_dates = pd.DataFrame(dates)[0]
ax = study_dates.hist(bins=10, xlabelsize=12)

ax.grid(axis='x')
ax.set_xlabel('Years')
ax.set_ylabel('Count')
plt.savefig("/media/ia/DATA/COVID19/images_paper/StudyDate_COVID_Negative.pdf",bbox_inches='tight')

plt.show()

In [None]:
# Nicer-looking plot

# Wrap the collected study dates in a single-column DataFrame
dates_df=pd.DataFrame(dates)

In [None]:
# Display the study-date frame built in the previous cell
dates_df

In [None]:
# Empty frame with a single 'Date' column.
# NOTE(review): no cell shown here fills or reads it — confirm it is still needed.
df=pd.DataFrame(columns=['Date'])

In [None]:
# Display the (empty) 'Date' frame created in the previous cell
df

In [None]:
# Element-wise conversion that calls each item's own .timestamp() method.
# NOTE(review): the next cell redefines `to_timestamp`, so this version looks unused.
to_timestamp = np.vectorize(lambda x: x.timestamp())

In [None]:
# Import the module under an alias: a plain `import datetime` here would rebind
# the name `datetime` from the class to the module and break every later cell
# that calls `datetime.strptime(...)`.
import datetime as dt
import numpy as np

EPOCH_DATE = dt.date(1970, 1, 1)
EPOCH_DATETIME = dt.datetime(1970, 1, 1)

# date -> seconds elapsed since 1970-01-01
to_timestamp = np.vectorize(lambda x: (x - EPOCH_DATE).total_seconds())
# seconds since 1970-01-01 -> naive datetime. Plain epoch arithmetic gives the
# same naive result as datetime.utcfromtimestamp, which is deprecated since
# Python 3.12.
from_timestamp = np.vectorize(lambda x: EPOCH_DATETIME + dt.timedelta(seconds=float(x)))

## Compute the histogram
hist, bin_edges = np.histogram(to_timestamp(dates))

In [None]:
# Number of study dates falling in each histogram bin
hist

In [None]:
# Histogram bin edges converted back from epoch seconds to datetimes
from_timestamp(bin_edges)

In [None]:
# Midpoint of every bin: mean of its left and right edge
left_edges, right_edges = bin_edges[:-1], bin_edges[1:]
bins = (left_edges + right_edges) * 0.5

In [None]:
# Bin midpoints, in seconds since 1970-01-01
bins

In [None]:
import plotly.express as px

# Bar chart of image counts per study-date bin, one bar at each bin midpoint
axis_titles = {'x':'Date', 'y':'Count'}
fig = px.bar(x=from_timestamp(bins), y=hist, labels=axis_titles, width=900)
fig.data[0].text = hist  # print the count on top of each bar
fig.update_traces(textposition='outside', textfont_size=10).update_layout(bargap=0)
fig.show()
fig.write_image("/media/ia/DATA/COVID19/images_paper/StudyDate_COVID_Negative_1.pdf",width=900)


### Sex

In [None]:
# `counts` used to be displayed here before any cell had defined it, which
# raises NameError on a fresh kernel. Compute it first, then show it.
labels, counts = np.unique(Sex, return_counts=True)
counts

In [None]:
# One bar per distinct sex code, saved as a PDF
labels, counts = np.unique(Sex, return_counts=True)
ax = plt.gca()
ax.bar(labels, counts, align='center')
ax.set_xticks(labels)
ax.set_xlabel('Sex')
ax.set_ylabel('Count')
ax.grid(axis='y')
plt.savefig("/media/ia/DATA/COVID19/images_paper/Sex_COVID_Negative.pdf",bbox_inches='tight')
plt.show()

## Plot combining Age and Sex


In [None]:

# Bucket every image's age and sex by decade.
# Re-import the class here: an earlier cell in this notebook runs
# `import datetime`, which would leave `datetime` bound to the module and make
# `datetime.strptime` fail.
from datetime import datetime

# Index 0 holds ages 1-9, index 1 holds 10-19, ..., index 9 holds 90-100
# (age 100 goes into the last bucket). Ages <= 0 or > 100 are not recorded.
N_DECADES = 10
ages_by_decade = [[] for _ in range(N_DECADES)]
sexes_by_decade = [[] for _ in range(N_DECADES)]

for f in tqdm_notebook(filenames):
    img, _photometric, dicom = dicom2numpy(f)
    datetime_obj = datetime.strptime(dicom.StudyDate, "%Y%m%d")
    try:
        datetime_birth = datetime.strptime(dicom.PatientBirthDate, "%Y%m%d")
        # Read the sex before touching either list, so a missing tag cannot
        # leave the age and sex buckets with different lengths.
        sex = dicom.PatientSex
    except (AttributeError, TypeError, ValueError):
        # Missing or malformed header value: skip the file. Other errors
        # (including KeyboardInterrupt) are no longer swallowed.
        continue

    # Age in whole 365-day years
    age = int((datetime_obj - datetime_birth).days / 365)
    if 0 < age <= 100:
        decade = min(age // 10, N_DECADES - 1)
        ages_by_decade[decade].append(age)
        sexes_by_decade[decade].append(sex)

# Expose the buckets under the names the following cells use
(Age_010, Age_1020, Age_2030, Age_3040, Age_4050,
 Age_5060, Age_6070, Age_7080, Age_8090, Age_90100) = ages_by_decade
(Sex_010, Sex_1020, Sex_2030, Sex_3040, Sex_4050,
 Sex_5060, Sex_6070, Sex_7080, Sex_8090, Sex_90100) = sexes_by_decade

In [None]:
# Empty Age/Sex frame; both columns are assigned in the next cell
df_SA=pd.DataFrame(columns=['Age','Sex'])

In [None]:
# Flatten the decade buckets, youngest first, into the two columns of df_SA
sex_buckets = (Sex_010, Sex_1020, Sex_2030, Sex_3040, Sex_4050,
               Sex_5060, Sex_6070, Sex_7080, Sex_8090, Sex_90100)
age_buckets = (Age_010, Age_1020, Age_2030, Age_3040, Age_4050,
               Age_5060, Age_6070, Age_7080, Age_8090, Age_90100)
df_SA['Sex'] = [value for bucket in sex_buckets for value in bucket]
df_SA['Age'] = [value for bucket in age_buckets for value in bucket]

In [None]:
# Append a percentage to every sex label so it shows up in the plot legend.
# NOTE(review): the percentages are hard-coded; presumably they are the male /
# non-male share computed earlier — confirm they still match the data.
#
# The column is replaced as a whole rather than via chained indexing
# (df_SA['Sex'][i] = ...), which pandas does not guarantee to write back into
# the frame. Any suffix left by a previous run is stripped first, so running
# the cell again does not stack a second label.
sex_code = df_SA['Sex'].astype(str).str.replace(r' \(\d+(?:\.\d+)?%\)$', '', regex=True)
share_label = sex_code.eq('M').map({True: ' (68.68%)', False: ' (31.32%)'})
df_SA['Sex'] = sex_code + share_label

In [None]:
# Age histogram (ages on the y axis), coloured by the labelled sex column
age_sex_pdf = "/media/ia/DATA/COVID19/images_paper/AgeSex_COVID_Negative_1.pdf"
fig = px.histogram(data_frame=df_SA, y="Age", color="Sex", nbins=10)
fig.show()
fig.write_image(age_sex_pdf, width=900)

In [None]:
# Ages in df_SA counted per decade (edges 0, 10, ..., 100).
# Note: this rebinds `bin_edges`, which the study-date histogram also uses.
age_decade_edges = list(range(0, 101, 10))
count_Age, bin_edges = np.histogram(df_SA['Age'], bins=age_decade_edges)

In [None]:
# Per-decade age counts and the bin edges they refer to
count_Age,bin_edges

In [None]:
# Distinct sex codes and their counts within the first age bucket (Sex_010).
# Note: this rebinds `labels` and `counts` from the sex bar-chart cell.
labels, counts = np.unique(Sex_010, return_counts=True)

In [None]:
# Empty frame with Age/Sex/Count columns.
# NOTE(review): it is not filled in any of the cells shown here — confirm it is still needed.
df_counts=pd.DataFrame(columns=['Age','Sex','Count'])

## Positives

In [None]:
import os

# Root folder of the COVID-positive studies
thisdir ="/media/ia/DATA/COVID19/Xnat_positivas/"
# Keep only names that END in .dcm (any letter case); a substring test would
# also pick up files such as "scan.dcm.bak". Sorting makes the order the same
# on every run, since os.walk yields entries in arbitrary order.
Files = sorted(
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(thisdir)
    for name in names
    if name.lower().endswith(".dcm")
)
filenames = Files

In [None]:
# Number of folders directly under the positives root
positives_root = "/media/ia/DATA/COVID19/Xnat_positivas/"
top_level_folders = next(os.walk(positives_root))[1]
print(len(top_level_folders))

In [None]:
# Preview the first few paths only; displaying the whole list floods the output
filenames[:5]

In [None]:
len(filenames)  # Number of DICOM file paths collected for the positives

In [None]:
# Gather the per-image metadata for the positive set.
# Skipping rules: a file whose equipment tags cannot be read is left out of
# the age, date and sex lists; a file without a usable birth date is left out
# of the date and sex lists as well.
from datetime import datetime  # the class (not the module) provides strptime

# Errors expected from incomplete headers: a missing tag (AttributeError) or an
# empty / malformed value handed to strptime (TypeError, ValueError). Anything
# else, including KeyboardInterrupt, is allowed to propagate instead of being
# silently swallowed.
HEADER_ERRORS = (AttributeError, TypeError, ValueError)

dates=[]
Sex=[]
Ages_1=[]
shapes=[]
Manufacturers=[]
Models=[]
for f in tqdm_notebook(filenames):
    # The photometric interpretation is not needed here; a named placeholder
    # avoids overwriting IPython's `_` (last output) variable.
    img, _photometric, dicom = dicom2numpy(f)
    datetime_obj = datetime.strptime(dicom.StudyDate, "%Y%m%d")
    shapes.append(img.shape)
    try:
        Manufacturers.append(dicom.Manufacturer)
        Models.append(dicom.ManufacturerModelName)
    except HEADER_ERRORS:
        continue

    try:
        datetime_birth = datetime.strptime(dicom.PatientBirthDate, "%Y%m%d")
    except HEADER_ERRORS:
        continue

    # Age in whole 365-day years.
    # NOTE(review): unlike the Negatives loop, non-positive ages are kept
    # here — confirm whether the two sections should apply the same filter.
    Ages_1.append(int((datetime_obj - datetime_birth).days / 365))
    dates.append(datetime_obj.date())
    Sex.append(dicom.PatientSex)
In [None]:
# Smallest shape tuple. Tuples compare element by element, so this is the
# shape with the smallest first dimension (ties broken by the next one),
# not necessarily the image with the fewest pixels.
min(shapes)

In [None]:
# Largest shape tuple under element-by-element tuple comparison (first
# dimension first), not necessarily the image with the most pixels.
max(shapes)

In [None]:
# Earliest study date among the entries collected in `dates`
min(dates)

In [None]:
# Latest study date among the entries collected in `dates`
max(dates)

### Manufacturer

In [None]:
# Manufacturer is a categorical label, so count each distinct value and draw
# one bar per manufacturer; a 10-bin histogram does not line up with the
# categories. The x-axis title previously read 'Years', copied from the
# study-date plot.
manufacturer_counts = pd.DataFrame(Manufacturers)[0].value_counts()
ax = manufacturer_counts.plot.bar(fontsize=12, rot=0)

ax.grid(axis='y')
ax.set_xlabel('Manufacturer')
ax.set_ylabel('Count')

plt.show()

In [None]:
# Number of images recorded for each distinct Manufacturer value
pd.DataFrame(Manufacturers)[0].value_counts()

### Age

In [None]:
# Number of images recorded for each age value (whole years)
pd.DataFrame(Ages_1)[0].value_counts()

In [None]:
# 10-bin histogram of patient ages for the positive set, saved as a PDF
positive_ages = pd.DataFrame(Ages_1)[0]
ax = positive_ages.hist(bins=10, xlabelsize=12)

ax.grid(axis='x')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.savefig("/media/ia/DATA/COVID19/images_paper/Age_COVID_Positive.pdf",bbox_inches='tight')
plt.show()

In [None]:
# Count ages per decade: edges 0, 10, ..., 100
decade_edges = list(range(0, 101, 10))
b1, e1 = np.histogram(Ages_1, bins=decade_edges)

In [None]:
# Per-decade counts returned by np.histogram in the previous cell
b1

In [None]:
# Bin edges returned by np.histogram two cells above
e1

### Study Date

In [None]:
# Number of images recorded for each study date
pd.DataFrame(dates)[0].value_counts()

In [None]:
# 10-bin histogram of the study dates for the positive set, saved as a PDF
study_dates = pd.DataFrame(dates)[0]
ax = study_dates.hist(bins=10, xlabelsize=10)

ax.grid(axis='x')
ax.set_xlabel('Years')
ax.set_ylabel('Count')
plt.savefig("/media/ia/DATA/COVID19/images_paper/StudyDate_COVID_Positive.pdf",bbox_inches='tight')
plt.show()

In [None]:
# Import the module under an alias: a plain `import datetime` here would rebind
# the name `datetime` from the class to the module and break every later cell
# that calls `datetime.strptime(...)`.
import datetime as dt
import numpy as np

EPOCH_DATE = dt.date(1970, 1, 1)
EPOCH_DATETIME = dt.datetime(1970, 1, 1)

# date -> seconds elapsed since 1970-01-01
to_timestamp = np.vectorize(lambda x: (x - EPOCH_DATE).total_seconds())
# seconds since 1970-01-01 -> naive datetime. Plain epoch arithmetic gives the
# same naive result as datetime.utcfromtimestamp, which is deprecated since
# Python 3.12.
from_timestamp = np.vectorize(lambda x: EPOCH_DATETIME + dt.timedelta(seconds=float(x)))

## Compute the histogram
hist, bin_edges = np.histogram(to_timestamp(dates))

In [None]:
# Number of study dates falling in each histogram bin
hist

In [None]:
# Histogram bin edges converted back from epoch seconds to datetimes
from_timestamp(bin_edges)

In [None]:
# Midpoint of every bin: mean of its left and right edge
left_edges, right_edges = bin_edges[:-1], bin_edges[1:]
bins = (left_edges + right_edges) * 0.5

In [None]:
import plotly.express as px

# Bar chart of image counts per study-date bin, one bar at each bin midpoint
axis_titles = {'x':'Date', 'y':'Count'}
fig = px.bar(x=from_timestamp(bins), y=hist, labels=axis_titles, width=900)
fig.data[0].text = hist  # print the count on top of each bar

fig.update_traces(textposition='outside', textfont_size=8).update_layout(bargap=0)
fig.show()
fig.write_image("/media/ia/DATA/COVID19/images_paper/StudyDate_COVID_Positive_1.pdf",width=900)

### Sex

In [None]:
# One bar per distinct sex code for the positive set, saved as a PDF
labels, counts = np.unique(Sex, return_counts=True)
plt.bar(labels, counts, align='center')
plt.gca().set_xticks(labels)
# Axis titles were missing here; they match the equivalent chart in the
# Negatives section so the two figures read the same way.
plt.xlabel('Sex')
plt.ylabel('Count')

plt.grid(axis='y')
plt.savefig("/media/ia/DATA/COVID19/images_paper/Sex_COVID_Positive.pdf",bbox_inches='tight')
plt.show()

In [None]:
# Counts per distinct sex code, from np.unique in the previous cell
counts

## Plot combining Age and Sex

In [None]:
# Bucket every image's age and sex by decade.
# Re-import the class here: an earlier cell in this notebook runs
# `import datetime`, which would leave `datetime` bound to the module and make
# `datetime.strptime` fail.
from datetime import datetime

# Index 0 holds ages 1-9, index 1 holds 10-19, ..., index 9 holds 90-100
# (age 100 goes into the last bucket). Ages <= 0 or > 100 are not recorded.
N_DECADES = 10
ages_by_decade = [[] for _ in range(N_DECADES)]
sexes_by_decade = [[] for _ in range(N_DECADES)]

for f in tqdm_notebook(filenames):
    img, _photometric, dicom = dicom2numpy(f)
    datetime_obj = datetime.strptime(dicom.StudyDate, "%Y%m%d")
    try:
        datetime_birth = datetime.strptime(dicom.PatientBirthDate, "%Y%m%d")
        # Read the sex before touching either list, so a missing tag cannot
        # leave the age and sex buckets with different lengths.
        sex = dicom.PatientSex
    except (AttributeError, TypeError, ValueError):
        # Missing or malformed header value: skip the file. Other errors
        # (including KeyboardInterrupt) are no longer swallowed.
        continue

    # Age in whole 365-day years
    age = int((datetime_obj - datetime_birth).days / 365)
    if 0 < age <= 100:
        decade = min(age // 10, N_DECADES - 1)
        ages_by_decade[decade].append(age)
        sexes_by_decade[decade].append(sex)

# Expose the buckets under the names the following cells use
(Age_010, Age_1020, Age_2030, Age_3040, Age_4050,
 Age_5060, Age_6070, Age_7080, Age_8090, Age_90100) = ages_by_decade
(Sex_010, Sex_1020, Sex_2030, Sex_3040, Sex_4050,
 Sex_5060, Sex_6070, Sex_7080, Sex_8090, Sex_90100) = sexes_by_decade

In [None]:
# Empty Age/Sex frame; both columns are assigned in the next cell
df_SA=pd.DataFrame(columns=['Age','Sex'])

In [None]:
# Flatten the decade buckets, youngest first, into the two columns of df_SA
sex_buckets = (Sex_010, Sex_1020, Sex_2030, Sex_3040, Sex_4050,
               Sex_5060, Sex_6070, Sex_7080, Sex_8090, Sex_90100)
age_buckets = (Age_010, Age_1020, Age_2030, Age_3040, Age_4050,
               Age_5060, Age_6070, Age_7080, Age_8090, Age_90100)
df_SA['Sex'] = [value for bucket in sex_buckets for value in bucket]
df_SA['Age'] = [value for bucket in age_buckets for value in bucket]

In [None]:
# Append a percentage to every sex label so it shows up in the plot legend.
# NOTE(review): the percentages are hard-coded; presumably they are the male /
# non-male share computed earlier — confirm they still match the data.
#
# The column is replaced as a whole rather than via chained indexing
# (df_SA['Sex'][i] = ...), which pandas does not guarantee to write back into
# the frame. Any suffix left by a previous run is stripped first, so running
# the cell again does not stack a second label.
sex_code = df_SA['Sex'].astype(str).str.replace(r' \(\d+(?:\.\d+)?%\)$', '', regex=True)
share_label = sex_code.eq('M').map({True: ' (52.12%)', False: ' (47.88%)'})
df_SA['Sex'] = sex_code + share_label

In [None]:
# Age histogram (ages on the y axis), coloured by the labelled sex column
age_sex_pdf = "/media/ia/DATA/COVID19/images_paper/AgeSex_COVID_Positive_1.pdf"
fig = px.histogram(data_frame=df_SA, y="Age", color="Sex", nbins=10)
fig.show()
fig.write_image(age_sex_pdf, width=900)