### Haberman's Dataset

About dataset
Haberman's Survival Data
The dataset contains cases from a study that was conducted between 1958 and 1970 at the University of Chicago's Billings Hospital on the survival of patients who had undergone surgery for breast cancer.

Attribute Information:
1. Age of patient at time of operation (numerical)

2. Patient's year of operation (year - 1900, numerical)

3. Number of positive axillary nodes detected (numerical)

4. Survival status (1 = the patient survived 5 years or longer, 2 = the patient died within 5 years, class attribute)

#### OBJECTIVE:
We want to predict whether a patient will survive 5 years or longer after surgery.

This is a binary classification problem, since the outcome can take only two values: the patient either survived 5 years or longer (class 1) or died within 5 years (class 2).

In [None]:
#importing required libraries
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
import numpy as np # linear algebra
import os
# List every file available under the Kaggle input directory.
# os.walk() only descends into directories; the previous argument
# ('/kaggle/input/habermans.csv') names a file, for which os.walk() yields
# nothing, so walk the input root instead.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the CSV into a pandas DataFrame; the column labels are supplied
# explicitly through `names`.
df = pd.read_csv(
    "../input/habermans-survival-data-set/haberman.csv",
    names=["Age", "Year", "A-Nodes", "Survival"],
)


In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Print the dtype and non-null count of each column.
df.info()

In [None]:
# Boolean mask of the same shape as df: True wherever a value is missing.
df.isna()

In [None]:
# Number of missing values per column (summing down the rows is the
# default axis=0 behaviour of DataFrame.sum).
df.isna().sum()

In [None]:
# Count how many patients fall into each survival class.
# value_counts() orders its result by frequency, so sort by class label to
# guarantee the order is always (1, 2); the bar chart that consumes `a`
# labels its bars positionally as "Survived" then "Didn't Survive".
# The column is selected by name rather than by position 3.
a = df["Survival"].value_counts().sort_index()
print(a)

In [None]:
# Bar chart comparing the number of patients who survived with the number
# who died; `a` holds the per-class counts computed earlier.
graph = plt.figure(figsize=(5, 5))
plt.ylabel("Count")
# Title typo fixed ("Surived" -> "Survived").
plt.title("No. of Survived patients vs Died")
plt.bar(["Survived", "Didn't Survive"], a)
plt.show()

This bar graph shows that the dataset is imbalanced: far more patients survived than died, so the two classes of this binary classification problem are unevenly represented.

In [None]:
# Histogram of patient ages across the whole dataset.
plt.hist(df["Age"])
plt.title("Age Distribution in the Dataset")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

In [None]:
# Scatter plot of age against the number of positive axillary nodes, with
# each point coloured by its survival class.
fig, ax = plt.subplots()
scatter = ax.scatter(x=df["Age"], y=df["A-Nodes"], c=df["Survival"])
# One legend entry per distinct colour value (i.e. per survival class).
legend1 = ax.legend(
    *scatter.legend_elements(),
    loc="upper left",
    title="Survival",
)
ax.set_xlabel("Age")
ax.set_ylabel("No. of A-Nodes")
ax.set_title("Age vs No. of A-Nodes")

plt.show()

The scatter plot above suggests that the chances of survival increase with lower age and with fewer positive axillary nodes.

In [None]:
# Heatmap of the raw DataFrame values using seaborn's default settings.
p1 = sns.heatmap(data=df)


In [None]:
# Pairwise correlation between the attributes, drawn as a heatmap whose
# tick labels on both axes are the column names.
corr = df.corr()
sns.heatmap(
    corr,
    yticklabels=corr.columns,
    xticklabels=corr.columns,
)

In [None]:
# Pair plot: every attribute plotted against every other one, to look for
# relationships between them.
sns.set(
    style="ticks",
    color_codes=False,
)
g = sns.pairplot(data=df)
plt.show()

In [None]:
# Remove the "Year" column from the working DataFrame.
df = df.drop(columns="Year")

In [None]:
# Preview the first five rows after the column drop.
df.head(n=5)

## 1a. Decision Tree algorithm on Imbalanced Dataset

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle

In [None]:
# Features: patient age and positive axillary node count. They are selected
# by name so the choice does not depend on column order or on the "Year"
# column already having been dropped (a positional slice would silently
# pick up "Year" otherwise).
X = df[["Age", "A-Nodes"]]
# Target: the survival class label.
y = df["Survival"]

Now split the dataset into training and testing sets.

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

Entropy is a measure of the randomness in the information being processed. The higher the entropy, the harder it is to draw any conclusions from that information.

In [None]:
# Fit a depth-2 decision tree that splits on entropy, then score it on the
# held-out test split.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc_1a = metrics.accuracy_score(y_test, y_pred)
# precision_score is called with its defaults, so the positive label is 1.
pre_1a = metrics.precision_score(y_test, y_pred)
# Report each metric under its own label; previously both values were
# printed after a single "Accuracy:" tag.
print("Accuracy:", acc_1a)
print("Precision:", pre_1a)

## 1b. Decision Tree algorithm on Balanced Dataset

Prepare the balanced dataset by sampling the survived class down to the size of the died class, then shuffle the rows randomly.


In [None]:
# Build a class-balanced DataFrame by undersampling.
# df.loc[mask] keeps only the rows for which the boolean mask is True.
df_2 = df.loc[df["Survival"] == 2]
df_1 = df.loc[df["Survival"] == 1]
# Sample as many class-1 rows as there are class-2 rows (instead of the
# hard-coded 81), with a fixed seed so the balanced set is reproducible.
df_1 = df_1.sample(n=len(df_2), random_state=1)
frames = [df_1, df_2]
b_df = pd.concat(frames)
print(b_df.head())
# Shuffle so the two classes are interleaved; seeded for reproducibility.
b_df = shuffle(b_df, random_state=1)


In [None]:
# Preview the first five rows of the shuffled, balanced DataFrame.
b_df.head(n=5)

In [None]:
# Replace the shuffled row labels with a fresh 0..n-1 index. drop=True
# discards the old labels directly, instead of materialising them as an
# "index" column and dropping that column afterwards.
b_df = b_df.reset_index(drop=True)
b_df.head()

In [None]:
# Split the balanced data into X (independent) and y (dependent) variables.
# Features are selected by name so the choice does not depend on column
# order or on the "Year" column already having been dropped.
Xb = b_df[["Age", "A-Nodes"]]
yb = b_df["Survival"]
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_trainb, X_testb, y_trainb, y_testb = train_test_split(
    Xb, yb, test_size=0.3, random_state=1
)

In [None]:
# Fit a depth-2 decision tree that splits on entropy on the balanced
# training split, then score it on the balanced test split.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2)
clf = clf.fit(X_trainb, y_trainb)
y_pred = clf.predict(X_testb)
acc_1b = metrics.accuracy_score(y_testb, y_pred)
# precision_score is called with its defaults, so the positive label is 1.
pre_1b = metrics.precision_score(y_testb, y_pred)
# Report each metric under its own label; previously both values were
# printed after a single "Accuracy:" tag.
print("Accuracy:", acc_1b)
print("Precision:", pre_1b)

In [None]:
# Results table: one row per algorithm, comparing accuracy on the
# imbalanced and the balanced dataset.
myTable = PrettyTable()
myTable.field_names = [
    "Algorithm",
    "Accuracy (Imbalanced Dataset)",
    "Accuracy (Balanced Dataset)",
]
myTable.add_row(["Decision Tree", acc_1a, acc_1b])
print(myTable)

## 2a. Random Forest Algorithm on Imbalanced Dataset

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 16 trees (n_estimators is the number of trees to
# build), fitted on the training split.
clf = RandomForestClassifier(n_estimators=16)
clf.fit(X=X_train, y=y_train)
# Apply the trained classifier to the test split and measure accuracy.
y_pred = clf.predict(X_test)
acc_2a = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", acc_2a)

## 2b. Random Forest Algorithm on Balanced Dataset

In [None]:
# Random forest with 20 trees, fitted on the balanced training split.
clf = RandomForestClassifier(n_estimators=20)
clf.fit(X=X_trainb, y=y_trainb)
# Apply the trained classifier to the balanced test split and measure accuracy.
y_pred = clf.predict(X_testb)
acc_2b = metrics.accuracy_score(y_true=y_testb, y_pred=y_pred)
print("Accuracy:", acc_2b)

In [None]:
# Append the random forest accuracies to the results table and show it.
myTable.add_row(
    ["Random Forest", acc_2a, acc_2b]
)
print(myTable)

## 3a. KNN Algorithm on Imbalanced Dataset

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k=7, fitted on the training split.
model = KNeighborsClassifier(n_neighbors=7)
model.fit(X=X_train, y=y_train)
# Predict the test split and measure accuracy.
predicted = model.predict(X_test)
acc_3a = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
print("Accuracy:", acc_3a)

## 3b. KNN Algorithm on Balanced Dataset

In [None]:
# k-nearest-neighbours classifier with k=5, fitted on the balanced training split.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X=X_trainb, y=y_trainb)
# Predict the balanced test split and measure accuracy.
predicted = model.predict(X_testb)
acc_3b = metrics.accuracy_score(y_true=y_testb, y_pred=predicted)
print("Accuracy:", acc_3b)

In [None]:
# Append the KNN accuracies to the results table and show it.
myTable.add_row(
    ["KNN", acc_3a, acc_3b]
)
print(myTable)

## 4a. Naive Bayes Algorithm on Imbalanced Dataset

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier, fitted on the training split.
model = GaussianNB()
model.fit(X=X_train, y=y_train)
# Predict the test split and measure accuracy.
predicted = model.predict(X_test)
acc_4a = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
print("Accuracy:", acc_4a)

## 4b. Naive Bayes Algorithm on Balanced Dataset

In [None]:
# Gaussian Naive Bayes classifier, fitted on the balanced training split.
model = GaussianNB()
model.fit(X=X_trainb, y=y_trainb)
# Predict the balanced test split and measure accuracy.
predicted = model.predict(X_testb)
acc_4b = metrics.accuracy_score(y_true=y_testb, y_pred=predicted)
print("Accuracy:", acc_4b)

In [None]:
# Append the Naive Bayes accuracies to the results table and show it.
myTable.add_row(
    ["Naive Bayes", acc_4a, acc_4b]
)
print(myTable)

## 5a. Support Vector Machine Algorithm on Imbalanced Dataset

In [None]:
from sklearn import svm
# Support vector classifier with a polynomial kernel, fitted on the training split.
clf = svm.SVC(kernel="poly")
clf.fit(X=X_train, y=y_train)
# Predict the test split and measure accuracy.
y_pred = clf.predict(X_test)
acc_5a = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", acc_5a)

## 5b. Support Vector Machine Algorithm on Balanced Dataset

In [None]:
# Support vector classifier with a polynomial kernel, fitted on the
# balanced training split.
clf = svm.SVC(kernel="poly")
clf.fit(X=X_trainb, y=y_trainb)
# Predict the balanced test split and measure accuracy.
y_pred = clf.predict(X_testb)
acc_5b = metrics.accuracy_score(y_true=y_testb, y_pred=y_pred)
print("Accuracy:", acc_5b)

In [None]:
# Append the SVM accuracies to the results table and show it.
myTable.add_row(
    ["SVM", acc_5a, acc_5b]
)
print(myTable)