In [25]:
# imports
import numpy as np
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
# Load the data set; every token listed below is read as a missing value (NaN).
missing_value_tokens = ['NA', 'null', '', 'NULL']
df = pd.read_csv('mammographic_masses_data.csv', na_values=missing_value_tokens)


In [26]:
# 1.1 Showing the first 5 rows of the dataset
# Only the last expression of a cell is rendered automatically, so the preview
# is passed to display() explicitly (display is provided by the IPython kernel).
display(df.head())

# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,BA,Age,Shape,Margin,Density,Severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.300313,55.487448,2.721505,2.796276,2.910734,0.463059
std,0.683469,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,6.0,96.0,4.0,5.0,4.0,1.0


In [27]:

# Frequency table (number of rows per distinct value) for each discrete column.
freq_BA, freq_Shape, freq_Margin, freq_Density, freq_Severity = (
    df[column_name].value_counts()
    for column_name in ('BA', 'Shape', 'Margin', 'Density', 'Severity')
)

# Print the tables in the same order in which they were computed.
for frequency_table in (freq_BA, freq_Shape, freq_Margin, freq_Density, freq_Severity):
    print(frequency_table)


BA
4.0    547
5.0    346
3.0     36
2.0     14
6.0     11
1.0      4
0.0      1
Name: count, dtype: int64
Shape
4.0    400
1.0    224
2.0    211
3.0     95
Name: count, dtype: int64
Margin
1.0    357
4.0    280
5.0    136
3.0    116
2.0     24
Name: count, dtype: int64
Density
3.0    798
2.0     59
1.0     16
4.0     12
Name: count, dtype: int64
Severity
0    516
1    445
Name: count, dtype: int64


Which functions should everyone be aware of for producing summary statistics? Mean: the average of the given numbers (only meaningful for Age). Standard deviation: the average variability of the given numbers (for Age and for all of the other factor variables, which take discrete values). Min: the lowest value of a variable in our dataset (lower end of its range). Max: the highest value of a variable in our dataset (upper end of its range).

Percentiles? I don't find them very relevant here.

In the case of the ordinal variables (BA, Shape, Margin, Density and Severity), the frequency of each value is given by the value counts printed above.

In [28]:
# 2.2 Show the points in the dataset where the Severity is 1
# Build the boolean mask first, then select the matching rows by label.
severity_is_one = df['Severity'] == 1
loc_df = df.loc[severity_is_one]
print(loc_df)

      BA   Age  Shape  Margin  Density  Severity
0    5.0  67.0    3.0     5.0      3.0         1
1    4.0  43.0    1.0     1.0      NaN         1
2    5.0  58.0    4.0     5.0      3.0         1
4    5.0  74.0    1.0     5.0      NaN         1
8    5.0  57.0    1.0     5.0      3.0         1
..   ...   ...    ...     ...      ...       ...
951  5.0  67.0    4.0     5.0      3.0         1
952  4.0  68.0    4.0     4.0      3.0         1
955  4.0  52.0    4.0     4.0      3.0         1
957  4.0  56.0    4.0     5.0      3.0         1
959  5.0  66.0    4.0     5.0      3.0         1

[445 rows x 6 columns]


In [29]:
# Histogram of the Age column. The wide-form call (a one-column frame) is kept
# so the resulting trace is the same one later cells read via ageHist.data[0].
ageHist = px.histogram(df[['Age']], title="Age Distribution")

# Age against BA, with BA also mapped to the colour scale.
baScatter = px.scatter(
    df,
    x='BA',
    y='Age',
    color='BA',
    color_continuous_scale=px.colors.sequential.Bluered_r,
    title="Age by BA",
)

# Number of rows for every (Severity, Density) combination, drawn as bars.
severity_density_counts = df.groupby(['Severity', 'Density']).size().reset_index(name='Count')
fig = px.bar(severity_density_counts, x="Severity", y="Count", color="Density", title="Severity and Density Counts")

# Spread of Age within each Severity value.
boxplot = px.box(df, x='Severity', y='Age', title="Age by Severity")


# Render order is unchanged: box plot, bar chart, scatter, histogram.
boxplot.show()
fig.show()
baScatter.show()
ageHist.show()

In [30]:
#3.1

# Row-wise NaN removal on a new frame; `df` itself is left as it was.
df_cp = df.dropna()

# Same two figures as for the raw data, rebuilt from the cleaned frame.
ageHist_cp = px.histogram(df_cp[['Age']])
baScatter_cp = px.scatter(
    df_cp,
    x='BA',
    y='Age',
    color='BA',
    color_continuous_scale=px.colors.sequential.Bluered_r,
)

# First trace of each figure: histogram before dropna (built in the previous
# cell), histogram after dropna, and the cleaned scatter. The scatter trace is
# extracted here but is not added to the subplot grid below.
hist = ageHist.data[0]
hist_cp = ageHist_cp.data[0]
scatter = baScatter_cp.data[0]

# One row, two columns: raw histogram on the left, cleaned one on the right.
sbs = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Before dropna", "After dropna"),
)
sbs.add_trace(hist, row=1, col=1)
sbs.add_trace(hist_cp, row=1, col=2)
sbs.update_layout(height=600, width=1800, title_text="Side by Side Visualizations")
sbs.show()

# 3.2
# Single-column min-max normalization
def lin_norm(val, col):
    """Min-max normalise one column of a DataFrame to the range [0, 1].

    Parameters
    ----------
    val : pandas.DataFrame
        Frame that contains the column; it is not modified.
    col : str
        Label of the column to rescale.

    Returns
    -------
    pandas.DataFrame
        Single-column frame with the same index as ``val`` holding
        ``(x - min) / (max - min)``. Missing values stay missing. For a
        constant column (max == min) every present value maps to 0 instead
        of the NaN that 0/0 would produce.
    """
    column = val[[col]]
    min_val = column.min()
    span = column.max() - min_val
    # A zero span would turn the whole column into NaN; dividing by 1 instead
    # leaves the (all-zero) numerator untouched.
    safe_span = span.where(span != 0, 1)
    return (column - min_val) / safe_span

# Rescale Age on the NaN-free frame.
df_lin_norm = lin_norm(df_cp, 'Age')
# This preview is not the last expression of the cell, so it must be passed to
# display() (provided by the IPython kernel) to be rendered at all.
display(df_lin_norm[['Age']].head())

# Plotting the normalized data
normAgeHist = px.histogram(df_lin_norm[['Age']])
normAgeHist.update_layout(title_text="Normalized Age Histogram")

normAgeHist.show()

In [31]:
#4.1
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Predictors and target (Severity). Rows with a missing predictor are dropped
# for simplicity; you might also consider imputation or other strategies.
feature_columns = ['BA', 'Shape', 'Margin', 'Density', 'Age']
X = df[feature_columns].dropna()
# Keep only the target values whose row labels survived the drop above.
y = df.loc[X.index, 'Severity']

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Apply SelectKBest to select 2 best
selector = SelectKBest(f_classif, k=2)
X_new = selector.fit_transform(X_train, y_train)

print(X.shape)
# Check the shape of the transformed dataset
print(X_new.shape)

# Boolean mask over the training columns marking the features that were kept.
selected_mask = selector.get_support()
selected_features = X_train.columns[selected_mask]
print("Selected features:", selected_features)


(830, 5)
(581, 2)
Selected features: Index(['Shape', 'Margin'], dtype='object')


In [32]:
#4.1.plots
import plotly.graph_objects as go

# Fit the selector with k='all' so that a score is available for every feature.
feature_names = ['BA', 'Shape', 'Margin', 'Density', 'Age']
selector = SelectKBest(f_classif, k='all')
selector.fit(X_train, y_train)
scores = selector.scores_

# One bar per feature, positioned by its integer column index.
X_indices = np.arange(X.shape[1])
score_bars = go.Bar(
    x=X_indices,
    y=scores,
    marker=dict(color='royalblue'),
)
fig = go.Figure(data=[score_bars])

# Replace the integer tick positions with the feature names.
feature_axis = dict(
    tickangle=-45,
    tickmode='array',
    tickvals=list(range(len(feature_names))),
    ticktext=feature_names,
)
fig.update_layout(
    title="Feature Univariate Scores",
    xaxis_title="Feature",
    yaxis_title="Univariate Score (F-value)",
    xaxis=feature_axis,
)

fig.show()

In [33]:
# 4.2
import plotly.express as px
from sklearn.decomposition import PCA


# Work on a NaN-free frame without dropping rows from `df` in place, so cells
# that read the raw data give the same result when they are re-run.
df_pca = df.dropna()
X = df_pca[['BA', 'Shape', 'Margin', 'Density', 'Age']]

# NOTE(review): the features are not standardised before PCA, so the column
# with the largest variance dominates the first component; confirm intended.
pca = PCA(n_components=2)
components = pca.fit_transform(X)

print(components)

# Build a dedicated figure for the projection. Previously `fig` still referred
# to the bar chart of the preceding cell, which was relabelled and shown twice
# while the PCA result itself was never plotted.
fig = px.scatter(
    x=components[:, 0],
    y=components[:, 1],
    # Plain array: row order matches `components`, no index alignment involved.
    color=df_pca['Severity'].to_numpy(),
    labels={'x': 'PC1', 'y': 'PC2', 'color': 'Severity'},
    title="PCA Projection (2 components)",
)
fig.show()


[[ 11.31599482   1.32061038]
 [  2.36374711   2.42483299]
 [-27.88074989  -0.92984549]
 ...
 [  8.33834783   1.9225588 ]
 [ 10.35026533   1.96339801]
 [  6.21812803  -0.13395917]]


In [34]:
#4.3

from sklearn.decomposition import TruncatedSVD
# NaN-free frame owned by this cell. Both the decomposed matrix and the colour
# channel are taken from it, so the cell no longer depends on `df_cp` from an
# earlier cell (previously `df_svd` was built but never used).
df_svd = df.dropna()

columns_for_svd = ['BA', 'Margin', 'Density', 'Age', 'Shape', 'Severity']
# NOTE(review): 'Severity' is part of the decomposed matrix and is also used to
# colour the points below; confirm that including the target is intended.
X = df_svd[columns_for_svd].values


n_components = 2

# Apply Truncated SVD. The default solver is randomized, so the seed is fixed
# (same value as the train/test split) to make re-runs reproducible.
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_svd = svd.fit_transform(X)

fig_svd = px.scatter(x=X_svd[:, 0], y=X_svd[:, 1], color=df_svd['Severity'], color_continuous_scale=px.colors.sequential.Redor_r)
# TruncatedSVD does not centre the data, so these axes are SVD components
# rather than principal components; the labels say so.
fig_svd.update_layout(
    xaxis_title="SVD component 1",
    yaxis_title="SVD component 2"
)
fig_svd.show()



print(X_svd)

[[67.49079013  1.05008076]
 [58.59640663  2.49224057]
 [28.3598079   0.33537437]
 ...
 [64.47590323  1.48013312]
 [66.54560941  1.72133893]
 [62.34049594 -0.34660182]]
