In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Column names for the header-less CSV, in file order.
column_names = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']

# Missing entries appear as '?' in the file: parse them as NaN while reading,
# discard every incomplete row, then store all columns as integers.
data = (
    pd.read_csv(
        '../dataset/mammographic_masses.data',
        header=None,
        names=column_names,
        na_values='?',
    )
    .dropna()
    .astype('int')
)

print(data.shape)

In [None]:
# Preview the first five cleaned rows.
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Class balance of the target column: bar chart plus the share of each class.
sns.countplot(x='Severity', data=data)
plt.title('Severity Distribution \n (0: Benign | 1: Malignant)', fontsize=14)

# Percentage of rows per Severity code, computed once and looked up by label.
severity_pct = data['Severity'].value_counts(normalize=True) * 100
for severity_code, prefix in ((0, 'Benign: '), (1, 'Malignant: ')):
    print(prefix, round(severity_pct.loc[severity_code], 2), '% of the dataset')


In [None]:
# Age distribution of the benign cases (Severity == 0).
age_benign = data.loc[data['Severity'] == 0, 'Age'].values

fig, ax = plt.subplots(figsize=(25, 8))

# The previous `bins=1` collapsed every patient into a single bar, which hides
# the distribution entirely. Age is stored as an integer, so `discrete=True`
# draws one bar per year of age (bin width 1, centred on each value).
sns.histplot(age_benign, discrete=True, kde=True, color='green', label='Benign', ax=ax)
ax.set_title('Benign - Age', fontsize=14)
ax.set_xlabel('Age')
# `label=` has no visible effect until a legend is drawn.
ax.legend();

In [None]:
# Age distribution of the malignant cases (Severity == 1).
age_malign = data.loc[data['Severity'] == 1, 'Age'].values

fig, ax = plt.subplots(figsize=(25, 8))

# The previous `bins=1` collapsed every patient into a single bar, which hides
# the distribution entirely. Age is stored as an integer, so `discrete=True`
# draws one bar per year of age (bin width 1, centred on each value).
sns.histplot(age_malign, discrete=True, kde=True, color='red', label='Malignant', ax=ax)
ax.set_title('Malignant - Age', fontsize=14)
ax.set_xlabel('Age')
# `label=` has no visible effect until a legend is drawn.
ax.legend();

Correlation Matrices

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr = data.corr()

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, annot=True, cmap='Reds', ax=ax)
ax.set_title('Correlation Matrix', fontsize=14)

In [None]:
# Box plots of selected features split by Severity, one panel per feature.
# To add or remove a panel, edit this list only ('Density' is not plotted).
boxplot_features = ['Age', 'BI-RADS', 'Shape', 'Margin']

f, axes = plt.subplots(ncols=len(boxplot_features), figsize=(20, 8))

# One loop replaces four copy-pasted plot/title stanzas, so panels cannot
# drift out of sync with their titles.
for ax, feature in zip(axes, boxplot_features):
    sns.boxplot(x='Severity', y=feature, data=data, ax=ax)
    ax.set_title(f'Severity - {feature}', fontsize=14)


In [None]:
# Show every row whose BI-RADS value is 6.
# NOTE(review): presumably a check for unusual assessment codes — confirm.
data.loc[data['BI-RADS'].eq(6)]

In [None]:
# Show every row whose BI-RADS value is 0.
# NOTE(review): presumably a check for unusual assessment codes — confirm.
data.loc[data['BI-RADS'].eq(0)]

In [None]:
# Age plotted against the BI-RADS value, one point per row.
plt.scatter(x=data['BI-RADS'], y=data['Age'])