In [1]:
# imports
import numpy as np
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
from sklearn.feature_selection import SelectKBest, f_classif
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('mammographic_masses_data.csv')


In [2]:
# 1.1 Show the first 5 rows of the dataset, followed by summary statistics.
# Jupyter only renders a cell's last expression automatically, so the head
# has to be displayed explicitly or it is silently discarded.
display(df.head())
df.describe()

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.300313,55.487448,2.721505,2.796276,2.910734,0.463059
std,0.683469,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,6.0,96.0,4.0,5.0,4.0,1.0


In [3]:
# 2.2 Show the BA values of the rows where Severity is 1
# (swap 'BA' for another column label to inspect a different characteristic)
loc_df = df.loc[df['Severity'].eq(1), 'BA']
print(loc_df)

0      5.0
1      4.0
2      5.0
4      5.0
8      5.0
      ... 
951    5.0
952    4.0
955    4.0
957    4.0
959    5.0
Name: BA, Length: 445, dtype: float64


In [4]:
# Histogram of the Age column (wide-form input: one trace for 'Age').
ageHist = px.histogram(df[['Age']])

# BA against Age, coloured by BA on the reversed Bluered scale.
baScatter = px.scatter(df, x='BA', y='Age', color='BA', color_continuous_scale=px.colors.sequential.Bluered_r)

# Row count for every (Severity, Density) combination.
# groupby() skips rows whose key is NaN by default, so incomplete rows are not counted.
severity_density_counts = df.groupby(['Severity', 'Density']).size().reset_index(name='counts')

# Map the columns explicitly; without x/y plotly would treat every numeric
# column as a separate series plotted against the row index.
severity_density_counts_fig = px.bar(severity_density_counts, x='Density', y='counts', color='Severity')

severity_density_counts_fig.show()
baScatter.show()
ageHist.show()

In [5]:
#3.1


# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame, so the original `df` is left untouched.
df_cp = df.dropna()


# Same plots as for the raw data, rebuilt from the cleaned frame.
ageHist_cp = px.histogram(df_cp[['Age']])
baScatter_cp = px.scatter(df_cp, x='BA', y='Age', color='BA', color_continuous_scale=px.colors.sequential.Bluered_r)

# Two-panel figure for comparing the data before and after dropna.
sbs = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Before dropna", "After dropna"),
)
# Fix the overall figure size and title.
sbs.update_layout(height=600, width=1800, title_text="Side by Side Visualizations")

# Histogram trace of the original data (ageHist is built in an earlier cell).
hist = ageHist.data[0]
# Histogram trace after the dropna step.
hist_cp = ageHist_cp.data[0]

# Scatter trace after the dropna step.
scatter = baScatter_cp.data[0]

# Adding graphs to side side by side graph 
# sbs.add_trace(hist, row=1, col=1)
# sbs.add_trace(hist_cp, row=1, col=2)
# sbs.show()


# ageHist.show()
# baScatter.show()

# 3.2

# Single column normalization
def lin_norm(val, col):
    """Min-max scale one column of a DataFrame.

    Parameters
    ----------
    val : pandas.DataFrame
        Frame that contains the column to scale.
    col : column label
        Label of the column to scale.

    Returns
    -------
    pandas.DataFrame
        One-column frame holding ``(x - min) / (max - min)`` for each value.
        A constant column produces NaN because the range is zero.
    """
    column = val[[col]]  # double brackets keep a one-column DataFrame
    lowest, highest = column.min(), column.max()
    span = highest - lowest
    return (column - lowest) / span

# Square Root Normalization
def sqrt_norm(val, col):
    """Square-root normalization of one column of a DataFrame.

    The column is first min-max scaled and the square root of the scaled
    values is returned.

    Parameters
    ----------
    val : pandas.DataFrame
        Frame that contains the column to scale.
    col : column label
        Label of the column to scale.

    Returns
    -------
    pandas.DataFrame
        One-column frame holding ``sqrt((x - min) / (max - min))``.
    """
    column = val[[col]]  # double brackets keep a one-column DataFrame
    lowest, highest = column.min(), column.max()
    scaled = (column - lowest) / (highest - lowest)
    return np.sqrt(scaled)

# Logarithmic Normalization
def log_norm(val, col):
    """Logarithmic normalization of one column of a DataFrame.

    Computes ``(log(x) - log(min)) / (log(max) - log(min))``, which maps the
    column minimum to 0 and the column maximum to 1.

    Parameters
    ----------
    val : pandas.DataFrame
        Frame that contains the column to scale.
    col : column label
        Label of the column to scale. Its values must be strictly positive
        for the logarithm to be finite.

    Returns
    -------
    pandas.DataFrame
        One-column frame with the log-normalized values. A constant column
        produces NaN because the log range is zero.
    """
    val = val[[col]]  # double brackets keep a one-column DataFrame
    log_min = np.log(val.min())
    log_max = np.log(val.max())
    # The denominator is the difference of the two logs; taking the log of
    # (max - log(min)) would scale the result by the wrong range.
    return (np.log(val) - log_min) / (log_max - log_min)

# df_lin_norm = lin_norm(df_cp, 'Age')
# df_lin_norm[['Age']].head()

# Min-max scale the Age column of the cleaned frame.
df_lin_norm_test = lin_norm(df_cp, 'Age')

# Histogram of the scaled ages.
df_lin_norm_test_graph = px.histogram(df_lin_norm_test)
df_lin_norm_test_graph.show()

# Text dump of the scaled column (pandas truncates long frames when printing).
print(df_lin_norm_test)
# df_sqrt_norm = sqrt_norm(df_cp)
# df_sqrt_norm.head()

# df_log_norm = log_norm(df_cp)
# df_log_norm.head()

          Age
0    0.628205
2    0.512821
3    0.128205
8    0.500000
10   0.743590
..        ...
956  0.371795
957  0.487179
958  0.589744
959  0.615385
960  0.564103

[830 rows x 1 columns]


In [6]:
#4
