In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print one path per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# EDA : Haberman's Survival Data Set
# The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

# Data set information
- **Age** of patient at time of operation (numerical)
- **Year** of the patient's operation (year - 1900, numerical)
- Number of positive axillary **nodes** detected (numerical)
- **Survival status** (class attribute):
    - 1 = the patient survived 5 years or longer
    - 2 = the patient died within 5 years

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Haberman survival data with explicit column names.
# NOTE(review): the Kaggle haberman.csv appears to ship without a header row
# (confirm against the file). With the default header=0 the first patient
# record is consumed as column names and silently dropped from the data, so
# the names are supplied here and header=None keeps every row.
haberman = pd.read_csv(
    "/kaggle/input/habermans-survival-data-set/haberman.csv",
    header=None,
    names=["age", "year", "nodes", "status"],
)
haberman.head()

# High level description

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
haberman.describe()

In [None]:
# (number of rows, number of columns) of the loaded data set.
haberman.shape

# We have two categories for status
status = 1 and status = 2.
Let's count the number of patients in each status.

In [None]:
# Number of patients in each survival class
# (1 = survived 5 years or longer, 2 = died within 5 years).
haberman["status"].value_counts()

# Let's do some plotting to understand the relationships between the variables

In [None]:
# Scatter plot of patient age against survival status, via the plot accessor.
haberman.plot.scatter(x="age", y="status", legend=True)

In [None]:
sns.set_style("whitegrid")
# Age vs. status scatter, coloured by survival status.
# FacetGrid's `size` keyword was renamed to `height`; current seaborn releases
# reject `size`, so `height` is used here.
age_status_grid = sns.FacetGrid(haberman, hue="status", height=5)
age_status_grid.map(plt.scatter, "age", "status")
age_status_grid.add_legend()
plt.show()

In [None]:
## Pair Plot

In [None]:
# Close the current figure before drawing the pair plot.
plt.close()

# Pairwise scatter plots of the three numeric features, coloured by status.
# `height` replaces the `size` keyword, which current seaborn releases reject.
sns.pairplot(haberman, hue="status", vars=["age", "nodes", "year"], height=3)
plt.show()

In [None]:
# Split the patients by survival status; class1/class2 are reused by later cells.
class1 = haberman.loc[haberman["status"] == 1]
class2 = haberman.loc[haberman["status"] == 2]

# 1-D scatter of node counts: each class is drawn on its own horizontal line
# (y = 0 for status 1, y = 1 for status 2) so the two point clouds do not hide
# each other.
plt.plot(class1["nodes"], 0 + np.zeros_like(class1["nodes"]), "o", label="status1")
plt.plot(class2["nodes"], 1 + np.zeros_like(class2["nodes"]), "*", label="status2")
plt.xlabel("nodes")
# The y axis carries the class offset, not a node count, so label it as such.
plt.yticks([0, 1], ["status 1", "status 2"])
plt.ylabel("status")
plt.legend()
plt.title("1-D Scatter plot")
plt.show()



#### Observation

This plot is not very useful because the points of the two classes overlap heavily.
A better option is to study the PDF and CDF.

# Probability density function (PDF) and cumulative distribution function (CDF) plots

In [None]:
# Distribution of patient age, one curve per survival status.
# `height` replaces FacetGrid's `size` keyword, which current seaborn releases reject.
sns.FacetGrid(haberman, hue="status", height=5).map(sns.distplot, "age").add_legend()

**Between ages 30 and 75 the age distributions of survivors and non-survivors overlap, so age is not a useful feature for telling the two classes apart.**

In [None]:

# Distribution of positive lymph node counts, one curve per survival status,
# drawn on a wide (aspect=2) facet.
nodes_grid = sns.FacetGrid(haberman, hue="status", aspect=2)
nodes_grid.map(sns.distplot, "nodes")
nodes_grid.add_legend()

In [None]:
# Patients with zero positive lymph nodes who are in survival class 1.
zero_node_survivor_mask = haberman["nodes"].eq(0) & haberman["status"].eq(1)
haberman_data = haberman[zero_node_survivor_mask]

In [None]:
# Class sizes again, for reference next to the zero-node status 1 subset selected above.
haberman["status"].value_counts()

# Observation
About 52% of the patients who lived longer than five years (status 1) had zero positive lymph nodes: this is the number of rows with nodes == 0 and status == 1 divided by the number of status 1 rows.

Most of the data overlaps, but we can still say that when the number of positive lymph nodes is small, there is a high chance that the patient will survive more than 5 years.

In [None]:
# Distribution of operation year, one curve per survival status.
# `height` replaces FacetGrid's `size` keyword, which current seaborn releases reject.
year_grid = sns.FacetGrid(haberman, hue="status", height=5)
year_grid.map(sns.distplot, "year")
year_grid.add_legend()
plt.show()


In [None]:
# Distribution of patient age, one curve per survival status.
# `height` replaces FacetGrid's `size` keyword, which current seaborn releases reject.
age_grid = sns.FacetGrid(haberman, hue="status", height=5)
age_grid.map(sns.distplot, "age")
age_grid.add_legend()
plt.show()


# Observation
Age and year are not good candidates for identifying status because their distributions overlap heavily between the two classes.

Nodes is a better candidate for identifying status: we can clearly see that if the number of nodes is less than 10, there is a high chance of surviving more than 5 years.

In [None]:
# One sub-frame per survival status code, selected with a boolean mask on "status".
status1, status2 = (
    haberman[haberman["status"].eq(status_code)] for status_code in (1, 2)
)

In [None]:
def plot_pdf_cdf(values, status_label, bins=10):
    """Draw the binned PDF and CDF of ``values`` on the current matplotlib axes.

    Parameters
    ----------
    values : array-like
        Numeric observations to histogram (here, lymph node counts).
    status_label : str
        Text appended to the legend entries, e.g. ``"status 1"``.
    bins : int, optional
        Number of equal-width histogram bins (default 10).
    """
    counts, bin_edges = np.histogram(values, bins=bins, density=True)
    # Re-normalise so the per-bin values sum to 1 and the cumulative sum ends at 1.
    pdf = counts / counts.sum()
    cdf = np.cumsum(pdf)
    # Both curves are plotted against the right-hand edge of each bin.
    plt.plot(bin_edges[1:], pdf, label=f"pdf plot {status_label}")
    plt.plot(bin_edges[1:], cdf, label=f"CDF plot of {status_label}")


# Both statuses share one figure, so the title and x label describe both of
# them instead of being overwritten by whichever status is plotted last.
plot_pdf_cdf(status1["nodes"], "status 1")
plot_pdf_cdf(status2["nodes"], "status 2")
plt.title("PDF and CDF of patients' nodes for status 1 and status 2")
plt.xlabel("Patient's lymph node count")
plt.legend()
plt.show()

In [None]:

# Medians and deciles (0th-90th percentile) of nodes and age for status 1.
# The medians are printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print("\nQuantiles Status 1:")
print(np.median(status1["nodes"]))
print(np.median(status1["age"]))
print(np.percentile(status1["age"], np.arange(0, 100, 10)))
print(np.percentile(status1["nodes"], np.arange(0, 100, 10)))


In [None]:

# Medians and quartile cut points (0th, 25th, 50th, 75th percentile) of nodes
# and age for status 2.
# The medians are taken from status2 (not status1) and printed explicitly: a
# bare expression in the middle of a cell is evaluated but never displayed.
print("\nQuantiles Status 2:")
print(np.median(status2["nodes"]))
print(np.median(status2["age"]))
print(np.percentile(status2["age"], np.arange(0, 100, 25)))
print(np.percentile(status2["nodes"], np.arange(0, 100, 25)))

In [None]:
# 90th percentile of node count, then of age, for status 2 patients.
print("90th percentile status2 ")
for column in ("nodes", "age"):
    print(np.percentile(status2[column], 90))

In [None]:
# 90th percentile of node count, then of age, for status 1 patients.
print("90th percentile status1 ")
for column in ("nodes", "age"):
    print(np.percentile(status1[column], 90))

# BOX PLOT

In [None]:
# Box plot of patient age for each survival status.
sns.boxplot(data=haberman, x="status", y="age")

In [None]:
# Box plot of positive lymph node count for each survival status.
sns.boxplot(data=haberman, x="status", y="nodes")

In [None]:
# Print the 75th percentile of the node count, first for class 1 and then for class 2.
for survival_class in (class1, class2):
    print(np.percentile(survival_class["nodes"], 75))

# Observation
75% of the patients who survived five years or longer (class 1) had no more than 3 positive lymph nodes, while 75% of the patients who died within five years (class 2) had no more than 11.