# Exploratory Data Analysis
## Haberman's Data set

Contains details of cancer survival rates

In [None]:
#Import required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels import robust

In [None]:
#Read the data set
#Passing `names` makes pandas treat the first line of the file as data, so the
#file is assumed to have no header row of its own
COLUMN_NAMES = ['Age', 'Op_Year', 'axil_nodes', 'Surv_status']
df = pd.read_csv('haberman.csv', names = COLUMN_NAMES)
#Preview only the first rows instead of rendering the whole data frame
df.head()

In [None]:
#Dimensions of the data set as a (rows, columns) tuple
df.shape

In [None]:
#Count how many rows fall into each survival status class
#(useful to see whether the two classes are balanced)
df['Surv_status'].value_counts()

In [None]:
#Column names available in the data set
print(df.columns)

#Earliest and latest operation year present in the data
op_years = df['Op_Year']
print('Operation years range: {0} - {1}'.format(op_years.min(), op_years.max()))

Data set contains information on patients who have undergone surgery for breast cancer
Understanding of above columns:
1. Age: Age of the patient as of surgery
2. Op_Year: Year of operation (Between 1958-1970 as per Kaggle description, but 1958-1969 as per data range)
3. axil_nodes: Number of positive axillary lymph nodes detected during surgery
4. Surv_status: Survival status of the patient (1 = Patient survived >= 5 years; 2 = Patient died < 5 years)

Sources:
1. https://www.kaggle.com/gilsousa/habermans-survival-data-set
2. https://www.medicalnewstoday.com/articles/319713.php

### Objective: Perform EDA to understand the survival status (Class label) of a patient given Age, Op_Year and axil_nodes

### Univariate analysis

In [None]:
#Distribution plot of age, one overlaid distribution per survival status
#`height` is the current name of the FacetGrid keyword formerly called `size`,
#and histplot(kde=True, stat='density') is the replacement for the deprecated distplot
age_grid = sns.FacetGrid(df, hue = 'Surv_status', height = 5)
#alpha keeps the overlapping histograms of both classes visible
age_grid.map(sns.histplot, 'Age', kde = True, stat = 'density', alpha = 0.4)
age_grid.set_axis_labels('Age', 'Density')
age_grid.add_legend()
plt.show()

Ages 30-33 have an assured survival rate of 5+ years while ages 78-83 do not survive beyond 5 years
Too much overlap to extract further information from Age on survival rate

In [None]:
#Distribution plot of operation year, one overlaid distribution per survival status
#`height` is the current name of the FacetGrid keyword formerly called `size`,
#and histplot(kde=True, stat='density') is the replacement for the deprecated distplot
op_year_grid = sns.FacetGrid(df, hue = 'Surv_status', height = 5)
#alpha keeps the overlapping histograms of both classes visible
op_year_grid.map(sns.histplot, 'Op_Year', kde = True, stat = 'density', alpha = 0.4)
op_year_grid.set_axis_labels('Op_Year', 'Density')
op_year_grid.add_legend()
plt.show()

Survival status seems to not have much relation with the operation year as overlap is throughout the range of years
Hence, no assumptions can be made on the impact of improving surgery technology towards the survival rate of a patient

In [None]:
#Distribution plot of the number of axil nodes, one overlaid distribution per survival status
#`height` is the current name of the FacetGrid keyword formerly called `size`,
#and histplot(kde=True, stat='density') is the replacement for the deprecated distplot
axil_nodes_grid = sns.FacetGrid(df, hue = 'Surv_status', height = 5)
#alpha keeps the overlapping histograms of both classes visible
axil_nodes_grid.map(sns.histplot, 'axil_nodes', kde = True, stat = 'density', alpha = 0.4)
axil_nodes_grid.set_axis_labels('axil_nodes', 'Density')
axil_nodes_grid.add_legend()
plt.show()

For a small number of nodes (0-4), there is a far higher (30% - 15%) chance of the patient surviving beyond 5 years. As the number of axil nodes increases beyond 5 and up to 25, the chance of the patient surviving less than 5 years ranges between 4% and 2%, while the chance of surviving beyond 5 years is less than 2%

In [None]:
#Split the data frame into one frame per survival status
#Boolean masks select the rows belonging to each class
is_status_1 = df['Surv_status'] == 1
is_status_2 = df['Surv_status'] == 2
df_1 = df[is_status_1]
df_2 = df[is_status_2]

In [None]:
#PDF and CDF for age of patients with survival status 1
counts, bin_edges = np.histogram(df_1['Age'], bins = 10, density = True)
#Normalise the bin values so they sum to 1, then accumulate them to obtain the CDF
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
#Both curves are plotted against the right edge of each bin
#plt.plot has no `x` keyword (extra keywords are treated as line properties),
#so the axis name is set through plt.xlabel instead
plt.plot(bin_edges[1:], pdf, label = 'PDF')
plt.plot(bin_edges[1:], cdf, label = 'CDF')
plt.xlabel('Age')
plt.ylabel('Proportion of patients')
plt.legend()
plt.show()

As per the cumulative distribution function (CDF), about 75% of the patients with survival status 1 (survived 5 years or longer) were aged 60 or below

In [None]:
#PDF and CDF for axil nodes of patients with survival status 1
counts, bin_edges = np.histogram(df_1['axil_nodes'], bins = 10, density = True)
#Normalise the bin values so they sum to 1, then accumulate them to obtain the CDF
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
#Both curves are plotted against the right edge of each bin
#plt.plot has no `x` keyword (extra keywords are treated as line properties),
#so the axis name is set through plt.xlabel instead
plt.plot(bin_edges[1:], pdf, label = 'PDF')
plt.plot(bin_edges[1:], cdf, label = 'CDF')
plt.xlabel('axil_nodes')
plt.ylabel('Proportion of patients')
plt.legend()
plt.show()