# Data Visualization 
# Week 6: Intermediate Data Visualization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview five randomly chosen rows
df.sample(n=5)

In [None]:
# Print a summary of the dataset (columns, non-null counts, dtypes).
# DataFrame.info() writes the summary itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

In [None]:
# Tally the missing (NaN) entries in every column of the DataFrame
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that contains at least one missing value
df_dropna = df.dropna(axis=0, how='any')

# Confirm that no missing values remain in any column
df_dropna.isna().sum()

In [None]:
# Summary statistics from describe(), rounded to two decimal places
df_dropna.describe().round(decimals=2)

In [None]:
# Show the column labels of the cleaned DataFrame
df_dropna.columns

## Box plot

In [None]:
# Columns to visualise in the plots that follow
col = [
    'Age', 'BMI', 'Glucose', 'Insulin',
    'HOMA', 'Leptin', 'Adiponectin', 'Resistin',
]
# Restrict the cleaned DataFrame to those columns
data = df_dropna.loc[:, col]

In [None]:
# Box plot of every selected column on one shared axis
plt.figure(figsize=(10, 6), dpi=300)

# Create the box plot: one box per column, at the default positions 1..N
plt.boxplot(data.values)

# Set the title and labels
plt.title("Figure Title", fontsize=16)
plt.xlabel("X-axis", fontsize=14)
plt.ylabel("Y-axis", fontsize=14)

# Name the boxes through the tick labels rather than boxplot's `labels`
# keyword, which newer Matplotlib releases deprecate in favour of
# `tick_labels`; setting the ticks directly works on old and new versions.
plt.xticks(ticks=range(1, len(data.columns) + 1), labels=data.columns, rotation=45)

plt.show()

In [None]:
# Horizontal variant of the box plot: one box per column on a shared axis
plt.figure(figsize=(10, 6), dpi=300)

# Create the horizontal box plot (vert=False lays the boxes on their side)
plt.boxplot(data.values, vert=False)

plt.title("Figure Title", fontsize=16)
plt.xlabel("X-axis", fontsize=14)
plt.ylabel("Y-axis", fontsize=14)

# The boxes sit at y = 1..N; name them through the tick labels rather than
# boxplot's `labels` keyword, which newer Matplotlib releases deprecate in
# favour of `tick_labels`.
plt.yticks(ticks=range(1, len(data.columns) + 1), labels=data.columns)

plt.show()

In [None]:
plt.figure(figsize=(10, 6), dpi=300)

# Lay the panels out four per row, with as many rows as the columns require
# (two rows for the eight columns selected above)
n_panel_cols = 4
n_panel_rows = -(-len(data.columns) // n_panel_cols)  # ceiling division

# Create subplots for each column, so every box gets its own y-axis
for i, column in enumerate(data.columns):
    plt.subplot(n_panel_rows, n_panel_cols, i + 1)
    plt.boxplot(data[column])
    plt.ylabel(column)
    plt.xticks([])  # Hide x-axis ticks; the y-label already names the column

# Adjust layout and spacing between subplots
plt.tight_layout()
plt.show()

## Histogram 

In [None]:
# Histogram of the Age column on a small 4x3-inch figure
plt.figure(figsize=(4, 3))

# bins='auto' delegates the choice of bin edges to NumPy
plt.hist(x=data['Age'], bins='auto', edgecolor='black')

# Axis labels first, then the title
plt.xlabel('Age', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.title("Histogram: Age", fontsize=16)

plt.show()

In [None]:
# Binning rules (as accepted by the `bins` argument of plt.hist) mapped to the
# title shown above the corresponding histogram.
# NOTE: `bin` shadows the Python builtin of the same name; the name is kept
# because the plotting cells below iterate over it.
_binning_rules = {
    'auto': 'Auto',
    'sturges': 'Sturges',
    'sqrt': 'Square Root',
    'fd': 'Freedman and Diaconis',
}
bin = list(_binning_rules)
bin_method = list(_binning_rules.values())

In [None]:
plt.figure(figsize=(5, 5))

# One panel per binning rule, arranged on a 2x2 grid; each rule is paired
# with its display name so the panel title needs no index lookup
for panel, (rule, label) in enumerate(zip(bin, bin_method), start=1):
    plt.subplot(2, 2, panel)
    plt.hist(data['Age'], bins=rule, edgecolor='black')
    plt.title(label, fontsize=14)
    plt.xlabel('Age', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Histogram with kernel density estimate (KDE)

plt.figure(figsize=(5, 5))

for i, method in enumerate(bin):
    plt.subplot(2, 2, i + 1)
    # density=True normalises the bars so the histogram integrates to 1,
    # which puts it on the same scale as the KDE curve drawn on top
    plt.hist(data['Age'], bins=method, density=True, edgecolor='black')
    # Add kernel density estimate (KDE) to the current panel
    sns.kdeplot(data['Age'], color='red')
    plt.title(f"{bin_method[i]}", fontsize=14)
    plt.xlabel('Age', fontsize=12)
    # The y-axis shows probability density here, not raw counts
    plt.ylabel('Density', fontsize=12)

plt.tight_layout()
plt.show()

## Violin Plot

In [None]:
plt.figure(figsize=(10, 6), dpi=300)

# Create a violin plot with a median marker in each violin.
# The underlying array is passed (as the box-plot cells do) so that the
# column-to-violin mapping does not depend on how Matplotlib unpacks a
# DataFrame.
plt.violinplot(data.values, showmedians=True)

plt.title("Figure Title", fontsize=16)
plt.xlabel("X-axis", fontsize=14)
plt.ylabel("Y-axis", fontsize=14)
# Violins are drawn at positions 1..N; label each with its column name
plt.xticks(ticks=range(1, len(data.columns) + 1), labels=data.columns, rotation=45)

plt.show()

## Correlation matrix

In [None]:
# Pairwise Pearson correlations between the selected columns,
# rounded to three decimal places
corr_matrix = data.corr(method='pearson').round(decimals=3)
corr_matrix

In [None]:
# Correlation matrix using seaborn

plt.figure(figsize=(6, 5), dpi=300)
# Pin the colour scale to the full correlation range [-1, 1] so that the
# neutral midpoint of the diverging 'coolwarm' map corresponds to zero
# instead of to the midpoint of whatever values happen to be in the data.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Correlation matrix with masking

# Applying mask: cells where the mask is True are not drawn. np.tril keeps
# the lower triangle including the diagonal, so only the cells above the
# diagonal remain visible. The mask is built as a boolean array, the form
# seaborn documents for `mask`.
mask = np.tril(np.ones_like(corr_matrix, dtype=bool))

# Plotting a triangle correlation heatmap
plt.figure(figsize=(6, 4), dpi=300)
# Fix the colour scale to [-1, 1] so the neutral colour means zero
# correlation even though the diagonal of ones is hidden by the mask.
sns.heatmap(corr_matrix, annot=True, mask=mask, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix', fontsize=16)

plt.show()