# Please run all cells at once (Run All)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import numpy as np
import math
import seaborn as sns

# Dataset location and schema. The file is read with header=None, so its
# first line is treated as data and the column names are supplied here.
file_path = "data/data.csv"

# Quantitative columns that the later cells summarise and plot.
columns_to_count = [
    'length', 'diameter', 'height', 'whole_weight',
    'Shucked_weight', 'viscera_weight', 'shell_weight', 'rings',
]
# Full schema: the categorical 'sex' column followed by the quantitative ones.
column_names = ['sex'] + columns_to_count

data = pd.read_csv(file_path, names=column_names, header=None)

In [None]:
# First part - sex distribution: number of rows and percentage share per sex
sex_full_names = {'M': 'Male', 'F': 'Female', 'I': 'Infant'}

sex_counts = data['sex'].value_counts()
# Share of each sex in the whole dataset, in percent, rounded to 2 decimals.
sex_share = (sex_counts.values / len(data) * 100).round(2)

sex_distribution = pd.DataFrame({
    '': sex_counts.index,
    'count': sex_counts.values,
    '%': sex_share,
})
# Replace the one-letter codes with readable names in the unnamed first column.
sex_distribution[''] = sex_distribution[''].map(sex_full_names)

# Show distribution
display(sex_distribution)

In [None]:
# Second part - summary statistics for the quantitative variables in the dataset

# Display label for every quantitative column (shown in the unnamed first column).
feature_labels = {
    'length': 'Length',
    'diameter': 'Diameter',
    'height': 'Height',
    'whole_weight': 'Whole weight',
    'Shucked_weight': 'Shucked weight',
    'viscera_weight': 'Viscera weight',
    'shell_weight': 'Shell weight',
    'rings': 'Rings',
}
# Statistics kept from describe(); its 'count' row is not shown.
stat_columns = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

# describe() is evaluated once for all columns, rounded to 2 decimals and
# transposed so that every variable becomes one row of the table.
distributions = (
    data[columns_to_count]
    .describe()
    .round(2)
    .loc[stat_columns]
    .T
    .reset_index(drop=True)
)
# Unnamed leading column with the readable variable name; a column without a
# label entry keeps its raw name.
distributions.insert(0, '', [feature_labels.get(name, name) for name in columns_to_count])

display(distributions)

In [None]:
# Third part - Bar plot of the sex distribution
sex_names = {'M': 'Male', 'F': 'Female', 'I': 'Infant'}

counter = data['sex'].value_counts()
# Readable names in the same order as the counts; a code without an entry in
# sex_names is shown unchanged.
bar_labels = [sex_names.get(code, code) for code in counter.index]
bar_colors = ['tab:red', 'tab:blue', 'tab:orange']

fig, ax = plt.subplots()
# One bar per sex; passing the labels as well gives each bar its own legend entry.
ax.bar(bar_labels, counter.values, label=bar_labels, color=bar_colors)
ax.set_ylabel('Population')
ax.set_title('Sex Distribution')
ax.legend(title='Sex')

plt.show()

In [None]:
# Fourth part - Histograms of the quantitative variables
fig, axs = plt.subplots(4, 2, tight_layout=True, figsize=(14, 20))

# Panel titles and axis labels, in the same order as columns_to_count.
# Both names are kept because the boxplot cell further down reads them too.
titles = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
labels = list(titles)

# Bin width used for each variable.
bins_width = {
    'length': 0.1,
    'diameter': 0.1,
    'height': 0.1,
    'whole_weight': 0.25,
    'Shucked_weight': 0.1,
    'viscera_weight': 0.1,
    'shell_weight': 0.1,
    'rings': 2,
}

for i, (ax, column) in enumerate(zip(axs.flat, columns_to_count)):
    counter_quantity = data[column].values
    bin_width = bins_width[column]

    # Work in whole bin indices and scale afterwards: np.arange with a float
    # step can return one edge more or less than intended, an integer range
    # cannot. The edges are multiples of bin_width that enclose [min, max].
    first_index = math.floor(counter_quantity.min() / bin_width)
    last_index = math.ceil(counter_quantity.max() / bin_width)
    # A histogram needs at least two edges, even if all values are identical.
    last_index = max(last_index, first_index + 1)
    bins = np.arange(first_index, last_index + 1) * bin_width

    ax.hist(counter_quantity, bins=bins, color='blue', edgecolor='black', alpha=0.7)
    ax.set_title(f'{titles[i]}')
    ax.set_xlabel(f'{labels[i]}')
    ax.set_ylabel('Population')
    # Put a tick on every bin edge.
    ax.set_xticks(bins)

plt.show()


In [None]:
# Fifth part - Scatter plots for every pair of quantitative variables

# Every unordered pair exactly once, in column order; the variable that comes
# first in columns_to_count goes on the x axis.
column_pairs = [
    (x_name, y_name)
    for position, x_name in enumerate(columns_to_count)
    for y_name in columns_to_count[position + 1:]
]

# Grid size follows the number of pairs (14 x 2 for eight variables), with
# three inches of figure height per row.
n_cols = 2
n_rows = max(1, math.ceil(len(column_pairs) / n_cols))
fig, axs = plt.subplots(
    n_rows, n_cols, tight_layout=True, figsize=(14, 3 * n_rows), squeeze=False
)

panels = list(axs.flat)
for ax, (x_name, y_name) in zip(panels, column_pairs):
    ax.scatter(data[x_name].values, data[y_name].values)
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)

# Hide any panel left over when the number of pairs does not fill the grid.
for ax in panels[len(column_pairs):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Sixth part - Correlation matrix
# Reuse the frame that is already loaded instead of reading the CSV a second
# time: moving 'sex' into the index leaves only the quantitative columns.
data_correlation = data.set_index('sex')
correlation_matrix = data_correlation.corr()

# Readable labels, in the same order as the quantitative columns.
correlation_labels = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
correlation_matrix.columns = correlation_labels
correlation_matrix.index = correlation_labels

correlation_matrix

In [None]:
# Seventh part - Heatmap of linear correlation matrix
fig, ax = plt.subplots(figsize=(14, 10))

# Annotated heatmap of the full (symmetric) matrix, drawn on the axes above.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.8,
    cbar_kws={"shrink": 0.9},
    ax=ax,
)
ax.set_title('Heatmap of linear correlation matrix')

plt.show()

In [None]:
# Eighth part - linear regression plot for the most strongly correlated pair
correlation_matrix_names = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
# Map every display label of the correlation matrix back to its column in `data`.
label_to_column = dict(zip(correlation_matrix_names, columns_to_count))

# Flatten the matrix into a Series indexed by (label, label) pairs. The
# self-pairs on the diagonal are removed by comparing the two labels, not by
# testing the value against 1: a diagonal entry is not guaranteed to be exactly
# 1.0, and a genuine pair that correlates perfectly must not be dropped.
pairwise = correlation_matrix.unstack()
is_self_pair = pairwise.index.get_level_values(0) == pairwise.index.get_level_values(1)
# After an ascending sort the last non-missing entry is the largest correlation.
most_correlated = pairwise[~is_self_pair].sort_values().last_valid_index()

v1, v2 = most_correlated

v1_data = data[label_to_column[v1]]
v2_data = data[label_to_column[v2]]

plt.figure(figsize=(8, 6))
sns.regplot(x=v1_data, y=v2_data, data=data, scatter_kws={'s':5}, line_kws={'color':'red'})
plt.title(f'Linear Regression: {v1} & {v2}')
plt.xlabel(v1)
plt.ylabel(v2)
plt.show()


In [None]:
# First part for grade 5 - statistics of the quantitative variables, broken down by sex

# Readable name for each quantitative column.
feature_labels = {
    'length': 'Length',
    'diameter': 'Diameter',
    'height': 'Height',
    'whole_weight': 'Whole weight',
    'Shucked_weight': 'Shucked weight',
    'viscera_weight': 'Viscera weight',
    'shell_weight': 'Shell weight',
    'rings': 'Rings'
}
# Statistics taken from describe(), in table order.
stat_names = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

groups = data.groupby('sex')

# One record per (variable, sex) combination, variable by variable.
records = []
for column in columns_to_count:
    for sex, group_data in groups:
        summary = group_data[column].describe().round(2)
        row = {'Feature': column, 'Sex': sex}
        row.update({stat: summary[stat] for stat in stat_names})
        records.append(row)

distributions = pd.DataFrame(records)
distributions['Feature'] = distributions['Feature'].map(feature_labels)

# Show the variable name only on the first row of each group of rows.
repeated_feature = distributions['Feature'].duplicated()
distributions['Feature'] = distributions['Feature'].where(~repeated_feature, '')
display(distributions)


In [None]:
# Second part for grade 5 - boxplots of the quantitative variables, split by sex
# NOTE: `titles` and `labels` are the lists defined in the histograms cell, so
# that cell has to be run before this one.

fig, axs = plt.subplots(4, 2, tight_layout=True, figsize=(14, 20))
# Codes as stored in the 'sex' column, and the readable names in the same order.
sex_values = ['M', 'F', 'I']
sex_labels = ['Male', 'Female', 'Infant']

groups = data.groupby('sex')
for i, ax in enumerate(axs.flat):
    # One sample per sex for the i-th quantitative column.
    vals = [groups.get_group(sex)[columns_to_count[i]] for sex in sex_values]

    # Label the boxes with the readable names, as in the other tables and plots.
    ax.boxplot(vals, patch_artist=True, tick_labels=sex_labels)
    ax.set_title(f'{titles[i]}')
    ax.set_ylabel(f'{labels[i]}')

plt.show()