# Question
## Using the dataset at https://www.kaggle.com/raddar/nodules-in-chest-xrays-jsrt (file name: jsrt_metadata.csv):
1. Find the total number of patients for each disease (diagnosis) and plot it as a histogram.
2. Find the total number of patients for each disease (diagnosis) with respect to the disease state (malignant or benign) and plot it as a pie chart.
3. Find the total number of patients for each disease (diagnosis) with respect to gender (Female, Male) and plot it as a pie chart.

### Step 1: Cleaning Data

In [None]:
import pandas as pd
import numpy as np
# Load the JSRT metadata table; the CSV is read from the current working directory.
df = pd.read_csv("jsrt_metadata.csv")

print("Number of Missing values before Data cleaning")
print("---------------------------------------------")
print(df.isnull().sum())
print("---------------------------------------------")

# [subtlety, size, x, y] are float columns, so missing values are replaced by
# the mean of the column.
# The filled column is assigned back to df instead of calling an in-place
# method on df[col]: the chained in-place form works on an intermediate Series
# and is not guaranteed to update df (pandas copy-on-write).
for column in ("subtlety", "size", "x", "y"):
    column_mean = df[column].astype(float).mean()
    df[column] = df[column].fillna(column_mean)

# [position, diagnosis] are categorical columns, so missing values are replaced
# by the most frequent value of that same column.
for column in ("position", "diagnosis"):
    most_frequent = df[column].value_counts().idxmax()
    df[column] = df[column].fillna(most_frequent)


print("Number of Missing values after Data cleaning")
print("---------------------------------------------")
print(df.isnull().sum())
print("---------------------------------------------")

### Step 2: Find the total number of patients of each disease (diagnosis) and plot in histogram.

In [None]:
import matplotlib as plt
from matplotlib import pyplot
# Number of patients per diagnosis. The counts are printed explicitly: the
# original bare expression was not the last statement of the cell, so its
# result was discarded.
diagnosis_counts = df['diagnosis'].value_counts()
print(diagnosis_counts)

# Draw one bar per diagnosis straight from the counts. pyplot.hist would spread
# the categories over its default 10 bins, so bars would not line up with the
# diagnosis labels and, with more than 10 diagnoses, several of them would be
# merged into a single bar.
pyplot.bar(diagnosis_counts.index, diagnosis_counts.values, edgecolor="black")
pyplot.xticks(rotation=90)  # diagnosis names are long; keep them readable
pyplot.xlabel("Diagnosis")
pyplot.ylabel("Number of Patients")

### Step 3: Find the total number of patients of each disease (diagnosis) with respect to the disease state(malignant or benign) and plot in a pie chart.


In [None]:
import matplotlib as plt
from matplotlib import pyplot

# For every diagnosis: count its patients per disease state (benign, malignant),
# draw those counts as a pie chart and print them.
for diagnosis in df['diagnosis'].unique():
    # Rows of the current diagnosis, tallied by state; reused for chart and printout.
    state_counts = df.loc[df['diagnosis'] == diagnosis, 'state'].value_counts()

    pyplot.figure()    # A fresh figure per diagnosis keeps the pie charts separate.
    pyplot.pie(state_counts, labels=list(state_counts.keys()), autopct="%1.0f%%")
    pyplot.legend(title=diagnosis)
    print(state_counts)


### Step 4: Find the total number of patients of each disease (diagnosis) with respect to the patient gender (Male, Female) and plot in a pie chart.


In [None]:
import matplotlib as plt
from matplotlib import pyplot

# For every diagnosis: count its patients per gender, draw those counts as a
# pie chart and print them.
for diagnosis in df['diagnosis'].unique():
    # Rows of the current diagnosis, tallied by gender; reused for chart and printout.
    gender_counts = df.loc[df['diagnosis'] == diagnosis, 'gender'].value_counts()

    pyplot.figure()    # A fresh figure per diagnosis keeps the pie charts separate.
    pyplot.pie(gender_counts, labels=list(gender_counts.keys()), autopct="%1.0f%%")
    pyplot.legend(title=diagnosis)
    # Print the gender counts that the chart shows (the 'state' counts were
    # printed here before, which did not match this step).
    print(gender_counts)