# Predict whether the cancer is benign or malignant
#sklearn.utils.shuffle use the double amount of memory in my test. I'd like the in-place implemenation without using 2x times memory of the input array
#sklearn reduce the memory consumption
#matplotlib.gridspec.gridspec contains classes that help to layout multiple Axes in grid-like pattern within a figure .The GridSpace specifies overall grid structure


In [1]:
%matplotlib inline 
import tensorflow as tf
import pandas as pd
from sklearn.utils import shuffle
import matplotlib.gridspec as gridspec
import seaborn as sns
import matplotlib.pyplot as plt

# import data

In [2]:
train_file="D:/dataset.csv"

# set column keys

In [3]:
idKey = "id"
diagnosisKey = "diagnosis"
radiusMeanKey = "radius_mean"
textureMeanKey = "texture_mean"
perimeterMeanKey = "perimeter_mean"
areaMeanKey = "area_mean"
smoothnessMeanKey = "smoothness_mean"
compactnessMeanKey = "compactness_mean"
concavityMeanKey = "concavity_mean"
concavePointsMeanKey = "concave points_mean"
symmetryMeanKey = "symmetry_mean"
fractalDimensionMean = "fractal_dimension_mean"
radiusSeKey = "radius_se"
textureSeKey = "texture_se"
perimeterSeKey = "perimeter_se"
areaSeKey = "area_se"
smoothnessSeKey = "smoothness_se"
compactnessSeKey = "compactness_se"
concavitySeKey = "concavity_se"
concavePointsSeKey = "concave points_se"
symmetrySeKey = "symmetry_se"
fractalDimensionSeKey = "fractal_dimension_se"
radiusWorstKey = "radius_worst"
textureWorstKey = "texture_worst"
perimeterWorstKey = "perimeter_worst"
areaWorstKey = "area_worst"
smoothnessWorstKey = "smoothness_worst"
compactnessWorstKey = "compactness_worst"
concavityWorstKey = "concavity_worst"
concavePointsWorstKey = "concave points_worst"
symmetryWorstKey = "symmetry_worst"
fractalDimensionWorstKey = "fractal_dimension_worst"

In [4]:
train_columns = [idKey, diagnosisKey,radiusMeanKey,textureMeanKey, perimeterMeanKey,areaMeanKey, smoothnessMeanKey,
                 compactnessMeanKey, concavityMeanKey, concavePointsMeanKey, symmetryMeanKey,fractalDimensionMean,
                 radiusSeKey, textureSeKey, perimeterSeKey, areaSeKey,smoothnessSeKey, compactnessSeKey, concavitySeKey,
                 concavePointsSeKey, symmetrySeKey, fractalDimensionSeKey,
                 radiusWorstKey, textureWorstKey, perimeterWorstKey, areaWorstKey, 
                 smoothnessWorstKey, compactnessWorstKey, concavityWorstKey, concavePointsWorstKey, 
                 symmetryWorstKey, fractalDimensionWorstKey]

In [5]:
def get_train_data():
    df = pd.read_csv(link+"dataset.csv", names= train_columns, delimiter=',', skiprows=1)
    return df

In [6]:
train_data = get_train_data()

NameError: name 'link' is not defined

# exploring data

In [None]:
train_data.head()

In [None]:
train_data.describe()

In [None]:
train_data.isnull().sum()

# how area_mean compare across malignant and benign diagnosis

In [None]:
print ("Malignant")
print (train_data.area_mean[train_data.diagnosis == "M"].describe())
print ()
print ("Benign")
print (train_data.area_mean[train_data.diagnosis == "B"].describe())

In [None]:
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12,4))

bins = 50

ax1.hist(train_data.area_mean[train_data.diagnosis == "M"], bins = bins)
ax1.set_title('Malignant')

ax2.hist(train_data.area_mean[train_data.diagnosis == "B"], bins = bins)
ax2.set_title('Benign')

plt.xlabel('Area Mean')
plt.ylabel('Number of Diagnosis')
plt.show()

The 'area_mean' feature looks different as it increases its value across
both types of diagnosis. You could argue that malignant diagnosis are more
are more uniformly distributed, while benign diagnosis have a normal distribution.
This could make it easier to detect a malignant diagnosis when the area_mean is above the 750 value. Now let's
see how the diagnosis area_worst differs between the two types.

In [None]:
#Select only the rest of the features.
r_data = train_data.drop([idKey, areaMeanKey, areaWorstKey, diagnosisKey], axis=1)
r_features = r_data.columns

In [None]:
plt.figure(figsize=(12,28*4))
gs = gridspec.GridSpec(28, 1)
for i, cn in enumerate(r_data[r_features]):
    ax = plt.subplot(gs[i])
    sns.distplot(train_data[cn][train_data.diagnosis == "M"], bins=50)
    sns.distplot(train_data[cn][train_data.diagnosis == "B"], bins=50)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()

# Update the value of diagnosis. 1 for Malignant and 0 for Benign

In [None]:
train_data.loc[train_data.diagnosis == "M", 'diagnosis'] = 1
train_data.loc[train_data.diagnosis == "B", 'diagnosis'] = 0

# Create a new feature for benign (non-malignant) diagnosis

In [None]:
train_data.loc[train_data.diagnosis == 0, 'benign'] = 1
train_data.loc[train_data.diagnosis == 1, 'benign'] = 0

# Convert benign column type to integer

In [None]:
train_data['benign'] = train_data.benign.astype(int)

# Rename 'Class' to 'Malignant'.

In [None]:
train_data = train_data.rename(columns={'diagnosis': 'malignant'})

In [None]:
print(train_data.benign.value_counts())
print()
print(train_data.malignant.value_counts())

In [None]:
pd.set_option("display.max_columns",101)
train_data.head()

# Create dataframes of only Malignant and Benign diagnosis

In [None]:
Malignant = train_data[train_data.malignant == 1]
Benign = train_data[train_data.benign == 1]

# Set train_X equal to 80% of the malignant diagnosis

In [None]:
train_X = Malignant.sample(frac=0.8)
count_Malignants = len(train_X)

# Add 80% of the benign diagnosis to train_X