In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# load the metadata CSV into a dataframe and preview its first five rows
file_name = 'metadata.csv'
df = pd.read_csv(filepath_or_buffer=file_name)
df.head(n=5)

In [None]:
# count the missing values in every column
df.isna().sum()

In [None]:
# (number of rows, number of columns) of the loaded dataframe
df.shape

In [None]:
# per-column dtype and non-null count summary
df.info()

# Clean and Visualize data


In [None]:
# select the important columns; .copy() detaches the subset from the raw frame
# so the later edits to df do not raise SettingWithCopyWarning
cols = ['patientid', 'sex', 'age', 'finding', 'RT_PCR_positive', 'survival']
df = df[cols].copy()
df.head()

In [None]:
# dtype and non-null summary after the column selection
df.info()

In [None]:
# keep one row per patient (the first occurrence of each patientid).
# Reassign instead of mutating in place, and take an explicit copy so the
# column assignments in later cells act on an independent frame.
df = df.drop_duplicates(subset=['patientid']).copy()
# preview only the first rows rather than dumping the whole frame
df.head()

In [None]:
# list the distinct values that occur in the finding column
df['finding'].unique()

In [None]:
# derive a new column called finding_res from the finding column:
# finding_res contains 1 for covid and 0 for every other finding
is_covid = df['finding'] == 'Pneumonia/Viral/COVID-19'
df['finding_res'] = is_covid.astype(float)
df

In [None]:
# reorder the columns so finding_res sits next to finding; .copy() makes the
# result an independent frame so later column assignments do not warn
df = df[['patientid', 'sex', 'age', 'finding', 'finding_res', 'RT_PCR_positive', 'survival']].copy()
df.head()

In [None]:
# dtype and non-null summary after adding finding_res and reordering
df.info()

In [None]:
# fill the NaN values in survival column with Unknown.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df["survival"] is chained assignment and does not update df under pandas
# Copy-on-Write.
df["survival"] = df["survival"].fillna("Unknown")
df.head()

In [None]:
# fill the NaN values in RT_PCR_positive with Unknown.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df["RT_PCR_positive"] is chained assignment and does not update df under
# pandas Copy-on-Write.
df["RT_PCR_positive"] = df["RT_PCR_positive"].fillna("Unknown")
df.head()

In [None]:
# re-check the remaining missing values per column
df.isna().sum()

In [None]:
# build a separate frame holding only the rows whose age is known,
# then plot the distribution of the age
copy = df.dropna(subset=['age'])
sns.distplot(copy['age'], bins=15)

In [None]:
copy = df.copy()
# fill NaN values in age column with mean value (assigned back to the column
# instead of a chained inplace fillna, which is unreliable under Copy-on-Write)
copy['age'] = copy['age'].fillna(copy['age'].mean())
# create a subplot with 2 figures to see how the age distribution changed
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# left: observed ages only. NaNs are dropped from the age column alone, so rows
# that are merely missing another column (e.g. sex) are still counted.
sns.distplot(df['age'].dropna(), bins=15, ax=ax[0])
ax[0].set_title('age: observed values only')
# right: NaN ages replaced with the mean value
sns.distplot(copy['age'], bins=15, ax=ax[1])
ax[1].set_title('age: NaN replaced with mean')

In [19]:
# create a function and generate random numbers between (mean - std) and (mean + std) for the NaN values
def age_generator(data, random_state=None):
    """Fill missing ``age`` values with random integers close to the mean age.

    Every NaN in ``data['age']`` is replaced by an integer drawn uniformly from
    ``[int(mean - std), int(mean + std))``, where mean and std are computed
    from the known ages.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``age`` column. It is modified in place.
    random_state : int, optional
        Seed for a dedicated ``np.random.RandomState`` so the imputation is
        reproducible. When omitted, the global NumPy random state is used.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    ValueError
        If ages are missing but the column holds no known age to derive the
        sampling range from.
    """
    missing = data['age'].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # nothing to impute
        return data

    age_avg = data['age'].mean()
    age_std = data['age'].std()
    if pd.isna(age_avg):
        raise ValueError("cannot generate ages: the 'age' column has no known values")
    if pd.isna(age_std):
        # a single known age has no spread; fall back to that age
        age_std = 0.0

    # randint needs integer bounds with low < high (high is exclusive)
    low = int(age_avg - age_std)
    high = max(int(age_avg + age_std), low + 1)

    sampler = np.random if random_state is None else np.random.RandomState(random_state)
    # .loc writes into the frame itself; data['age'][mask] = ... is chained
    # assignment and may only modify a temporary
    data.loc[missing, 'age'] = sampler.randint(low, high, size=n_missing)
    # return the frame that was passed in, not a global
    return data

In [None]:
# again create a copy of our dataframe and apply the age_generator function to it
copy = df.copy()
# age_generator writes the generated ages into the frame it receives, so its
# return value is deliberately not rebound: `copy` must stay the imputed copy
age_generator(copy)
# plot again the distribution of the age
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# left: observed ages only. NaNs are dropped from the age column alone, so rows
# that are merely missing another column (e.g. sex) are still counted.
sns.distplot(df['age'].dropna(), bins=15, ax=ax[0])
ax[0].set_title('age: observed values only')
# right: NaN ages replaced with random values around the mean
sns.distplot(copy['age'], bins=15, ax=ax[1])
ax[1].set_title('age: NaN replaced with random values')

In [None]:
# impute the missing ages on the working dataframe itself
df = age_generator(df)

# the age column should now report zero missing values
df.isna().sum()

In [None]:
# box plot of age for each survival category
sns.boxplot(data=df, x='survival', y='age')

The age range of the non-survival group (N) appears to be high. However, this plot alone does not give much insight into the relationship between COVID-19 findings and survival.

In [None]:
# box plot of age, split by whether the finding is covid (True) or not (False)
finding_cov = df['finding'].eq('Pneumonia/Viral/COVID-19')
age = df['age']
sns.boxplot(x=finding_cov, y=age)


In [None]:
# overlay the age distributions: first covid-19 (red), then all other findings (blue)
for flag, colour, group_label in ((1, 'red', 'covid-19'), (0, 'blue', 'others')):
    sns.distplot(df.loc[df.finding_res == flag, 'age'], color=colour, label=group_label)
plt.legend()

Both distributions peak at similar ages. However, COVID-19 findings appear to be more frequent at older ages.

In [None]:
# bar plot of age per finding_res group, with the survival column as hue
sns.barplot(data=df, x='finding_res', y='age', hue='survival')
# place the legend just outside the right edge of the axes
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)

This shows that non-survivors (N), for both COVID-19 and other findings, tend to be older, while younger patients appear to survive more often.

In [None]:
# bar plot of finding_res per RT_PCR_positive category, with the survival column as hue
sns.barplot(data=df, x='RT_PCR_positive', y='finding_res', hue='survival')
# place the legend just outside the right edge of the axes
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)

This shows a similar pattern to the plot above. However, the Unclear RT_PCR_positive category appears to be significant, which could be the result of no finding, an unclear COVID-19 test, or other factors.

In [None]:
# drop the sex column. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError because the column is already gone.
df = df.drop(columns=['sex'], errors='ignore')
df.isnull().sum()

### REMARK: 
Analysis of the dataset does not show that the use of x-rays gives a higher chance of surviving COVID-19 (as expected, given that an x-ray is just a piece of equipment). However, given the repeated need for COVID-19 testing, its speed and level of detail make it a good option. In addition, its sustainability and the inconvenience caused by swab testing make it a valid choice.