In [None]:
# importing the required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
from scipy import stats
from scipy.optimize import curve_fit
from matplotlib.colors import ListedColormap
from numpy.polynomial import Polynomial as Poly
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the sleep/lifestyle CSV, using the 'Person ID' column as the row index
df = pd.read_csv(
    'Sleep_health_and_lifestyle_dataset.csv',
    index_col='Person ID',
)
# Preview the first rows
df.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Summary statistics produced by pandas' describe()
df.describe()

In [None]:
# Count missing values in each column
df.isna().sum()

In [None]:
# Keep only the numeric columns for the statistical analysis
numeric_df = df.select_dtypes(include=['number'])
# Preview the first rows instead of rendering the whole frame
numeric_df.head()

In [None]:
# Summary statistics (centre, spread and shape) for each column
def calculate_statistics(df):
    """
    Build a per-column table of descriptive statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        One row per column of `df`, with the columns 'Mean', 'Median',
        'Standard Deviation', 'Skewness' and 'Kurtosis'.
    """
    # Skewness and kurtosis come from the `ss` alias of scipy.stats so that
    # this function does not depend on the generic global name `stats`,
    # which is easy to rebind to something else in a notebook session.
    major_moments = pd.DataFrame({
        'Mean': df.mean(),
        'Median': df.median(),
        'Standard Deviation': df.std(),
        'Skewness': df.apply(ss.skew),
        'Kurtosis': df.apply(ss.kurtosis),
    })
    return major_moments

In [None]:
# Stored under its own name: assigning the table to `stats` would shadow the
# scipy `stats` module imported at the top of the notebook.
summary_stats = calculate_statistics(numeric_df)
print(summary_stats)

In [None]:
# Descriptive statistics of the 'Age' column, stored in module-level variables
age = df['Age']
mean_age = age.mean()
median_age = age.median()
# mode() can return several values; keep the first one
mode_age = age.mode()[0]
# `ss` is the scipy.stats alias imported at the top of the notebook
skewness_age = ss.skew(age)
kurtosis_age = ss.kurtosis(age)

In [None]:
# Plotting Histogram
def plot_hist(age):
    """
    Plot a histogram of `age` annotated with its summary statistics.

    The mean, median and mode are drawn as dashed vertical lines and the
    skewness and kurtosis are written in the bottom-right corner. All five
    statistics are computed from the `age` argument itself, so the figure
    always describes the data that was passed in.

    Parameters
    ----------
    age : pandas.Series or array-like
        One-dimensional numeric data to plot.
    """
    age = pd.Series(age)
    mean_age = age.mean()
    median_age = age.median()
    # mode() can return several values; keep the first one
    mode_age = age.mode().iloc[0]
    skewness_age = ss.skew(age)
    kurtosis_age = ss.kurtosis(age)

    fig, ax = plt.subplots(dpi=144, figsize=(6, 4))
    ax.hist(age, bins=10, color='yellow', edgecolor='black', alpha=0.7)
    # Mean, median and mode as vertical reference lines
    ax.axvline(mean_age, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_age:.2f}')
    ax.axvline(median_age, color='green', linestyle='--', linewidth=2, label=f'Median: {median_age:.2f}')
    ax.axvline(mode_age, color='blue', linestyle='--', linewidth=2, label=f'Mode: {mode_age}')
    # Skewness and kurtosis text, positioned in axes coordinates (bottom right)
    ax.text(0.95, 0.05, f'Skewness: {skewness_age:.2f}',
            transform=ax.transAxes, ha='right', va='bottom', fontsize=10, color='black')
    ax.text(0.95, 0.10, f'Kurtosis: {kurtosis_age:.2f}',
            transform=ax.transAxes, ha='right', va='bottom', fontsize=10, color='black')
    # Labels and title
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    ax.set_title('Age Distribution', fontsize=10)
    ax.legend()
    plt.show()

In [None]:
plot_hist(age)

In [None]:
def plot_bar(df):
    """
    Plot a grouped bar chart of BMI category counts per occupation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'Occupation' and 'BMI Category'.
    """
    # Rows: occupations, columns: BMI categories, values: number of people
    count_data = df.groupby(['Occupation', 'BMI Category']).size().unstack(fill_value=0)
    # Create the figure once and hand its axes to pandas; calling
    # DataFrame.plot without `ax` would open a second figure and leave an
    # empty one behind.
    fig, ax = plt.subplots(figsize=(10, 6), dpi=144)
    count_data.plot(kind='bar', stacked=False, ax=ax)
    # Formatting the plot
    ax.set_title('Count of BMI Categories by Occupation', fontsize=15)
    ax.set_xlabel('Occupation', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    ax.tick_params(axis='x', labelrotation=45, labelsize=12)
    ax.legend(title='BMI Categories', fontsize=10)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    # Show the plot
    fig.tight_layout()
    plt.show()

In [None]:
plot_bar(df)

In [None]:
# Columns used by the scatter plot
sleep_duration, age = df['Sleep Duration'], df['Age']

In [None]:
# Scatter plot of sleep duration (x) against age (y)
def sleep_duration_vs_age_scatter(sleep_duration, age):
    """
    Draw a scatter plot with sleep duration on the x-axis and age on the y-axis.

    Parameters
    ----------
    sleep_duration : array-like
        Values plotted along the x-axis.
    age : array-like
        Values plotted along the y-axis.
    """
    fig, ax = plt.subplots(dpi=144, figsize=(10, 6))
    ax.scatter(sleep_duration, age, alpha=0.7, color='purple')
    # Title and axis labels
    ax.set_title('Sleep Duration vs Age', fontsize=12)
    ax.set_xlabel('Sleep Duration')
    ax.set_ylabel('Age')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
sleep_duration_vs_age_scatter(sleep_duration, age)

In [None]:
# Plotting heatmaps
def plot_heatmap(numeric_df, method):
    """
    Draw the lower triangle of a correlation matrix as an annotated heatmap.

    Parameters
    ----------
    numeric_df : pandas.DataFrame
        Frame whose pairwise column correlations are plotted.
    method : str
        Correlation method forwarded to ``DataFrame.corr`` (the notebook
        calls this with 'pearson'); it is also shown, capitalised, in the
        title.
    """
    # Compute the matrix once, with the requested method, and reuse it for
    # both the mask shape and the plotted values.
    corr = numeric_df.corr(method=method)
    # True on and above the diagonal -> those cells are hidden
    mask = np.triu(np.ones(corr.shape, dtype=bool))
    fig, ax = plt.subplots(dpi=144, figsize=(8, 4))
    sns.heatmap(corr, ax=ax, vmin=-1, vmax=1,
                cmap='RdBu', annot=True, mask=mask)
    # Rotate the x tick labels of the heatmap axes for readability
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    # Title of the heatmap
    ax.set_title(f" Heatmap Using {method.capitalize()} Correlation Matrix", fontsize=10)
    plt.show()
    return

In [None]:
plot_heatmap(numeric_df, 'pearson')

In [None]:
# Plotting pairplot
def plot_pairplot(numeric_df):
    """
    Plot pairwise relationships between the columns of `numeric_df`.

    Only the lower-triangle panels are drawn (``corner=True``).

    Parameters
    ----------
    numeric_df : pandas.DataFrame
        Frame of numeric columns to plot against each other.
    """
    grid = sns.pairplot(numeric_df, corner=True)
    # A pairplot is a grid of axes; plt.title would label only the current
    # subplot, so the title is attached to the figure instead.
    grid.figure.suptitle("Pairplot of numeric features", fontsize=10)
    plt.show()

In [None]:
plot_pairplot(numeric_df)

In [None]:
# Straight-line model used for the fits
def linfunc(x, a, b):
    """Evaluate the line y = a*x + b (slope `a`, intercept `b`) at `x`."""
    scaled = a * x
    return scaled + b

In [None]:
# Data for the linear fit: x is 'Quality of Sleep', y is 'Stress Level'
x = numeric_df.loc[:, 'Quality of Sleep']
y = numeric_df.loc[:, 'Stress Level']

In [None]:
# Least-squares fit of the straight line with scipy's curve_fit
p, cov = curve_fit(f=linfunc, xdata=x, ydata=y)
# One-sigma parameter uncertainties: square roots of the covariance diagonal
sigma = np.sqrt(cov.diagonal())
print(f"Scipy fit: a = {p[0]:.2f} +/- {sigma[0]:.2f}")
print(f"b = {p[1]:.2f} +/- {sigma[1]:.2f}")

In [None]:
# Same straight-line fit using NumPy
# Degree-1 polynomial fitted with the Polynomial class
p_np = Poly.fit(x, y, deg=1)
# np.polyfit is called only for its covariance matrix (second return value)
cov_np = np.polyfit(x, y, deg=1, cov=True)[1]
sigma_np = np.sqrt(np.diag(cov_np))
# convert() returns coefficients in increasing order: intercept, then slope
b_np, a_np = p_np.convert().coef
print(f"NumPy fit: a = {a_np:.2f} ± {sigma_np[0]:.2f}, b = {b_np:.2f} ± {sigma_np[1]:.2f}")

In [None]:
# Performs a linear fit and plots it with its uncertainty
def plot_random_scatter_with_fitted_line(x, y, x_pred=None):
    """
    Scatter `y` against `x`, fit a straight line and plot it with uncertainties.

    The line is fitted inside the function with ``curve_fit(linfunc, x, y)``,
    so the figure always reflects the data passed in. The shaded band and
    the error bars on the predictions use the full parameter covariance:

        var[f(x)] = x^2 * var(a) + var(b) + 2 * x * cov(a, b)

    Parameters
    ----------
    x, y : array-like
        Data to plot and fit.
    x_pred : array-like, optional
        x positions at which predictions with error bars are drawn.
        Defaults to ``np.linspace(4, 9, 7)``.
    """
    if x_pred is None:
        x_pred = np.linspace(4, 9, 7)
    x_pred = np.asarray(x_pred, dtype=float)

    # Fit parameters (slope, intercept) and their covariance matrix
    popt, pcov = curve_fit(linfunc, x, y)

    def _fit_sigma(x_vals):
        """One-sigma uncertainty of the fitted line at `x_vals`."""
        var = x_vals**2 * pcov[0, 0] + pcov[1, 1] + 2.0 * x_vals * pcov[0, 1]
        # Guard against tiny negative values caused by rounding
        return np.sqrt(np.clip(var, 0.0, None))

    fig, ax = plt.subplots(figsize=(10, 5), dpi=144)
    # scatter plot
    ax.scatter(x, y, color='blue', label='Data')
    # fitted line over the observed x range
    xfit = np.linspace(np.min(x), np.max(x), 100)
    yfit = linfunc(xfit, *popt)
    ax.plot(xfit, yfit, 'r-', label=f'Scipy Fit: y = {popt[0]:.2f}x + {popt[1]:.2f}')

    # one-sigma band around the fitted line
    band = _fit_sigma(xfit)
    ax.fill_between(xfit, yfit - band, yfit + band, color='k', alpha=0.2, label='Scipy Error Margin')

    # predicted points with propagated error bars
    y_pred = linfunc(x_pred, *popt)
    y_err = _fit_sigma(x_pred)
    ax.errorbar(x_pred, y_pred, yerr=y_err, fmt='go', label='Predictions (with errors)', capsize=5)
    # formatting
    ax.set_xlabel('Quality of Sleep')
    ax.set_ylabel('Stress Level')
    ax.legend(fontsize=9)
    ax.set_title('Linear Fit with predictions', fontsize=10)
    ax.grid()
    plt.show()
    return

In [None]:
plot_random_scatter_with_fitted_line(x, y)

In [None]:
# Rescale Age and Sleep Duration with a MinMaxScaler before clustering
df_cut = numeric_df.loc[:, ['Age', 'Sleep Duration']].copy()
scaler = MinMaxScaler()
norm = scaler.fit(df_cut).transform(df_cut)
# The same points mapped back to their original units
inv_norm = scaler.inverse_transform(norm)

In [None]:
def one_silhoutte_inertia(n, xy, random_state=42):
    """
    Fit KMeans with `n` clusters and return its silhouette score and WCSS.

    Parameters
    ----------
    n : int
        Number of clusters.
    xy : array-like
        Points to cluster, one row per sample.
    random_state : int or None, optional
        Seed passed to KMeans. The fixed default makes the scores, and
        therefore the chosen number of clusters, reproducible between runs;
        pass None for unseeded initialisation.

    Returns
    -------
    score : float
        Silhouette score of the fitted labels.
    inertia : float
        The fitted model's ``inertia_`` (within-cluster sum of squares).
    """
    # set up the clusterer with the number of expected clusters
    kmeans = KMeans(n_clusters=n, n_init=20, random_state=random_state)
    # Fit the data
    kmeans.fit(xy)
    labels = kmeans.labels_
    # calculate the silhouette score
    score = silhouette_score(xy, labels)
    inertia = kmeans.inertia_
    return score, inertia

In [None]:
def plot_elbow_method(min_k, max_k, wcss, best_n):
    """
    Plot WCSS against the number of clusters and circle the chosen k.

    Parameters
    ----------
    min_k, max_k : int
        Inclusive range of cluster counts shown on the x-axis.
    wcss : sequence of float
        WCSS values, one per k from `min_k` to `max_k`.
    best_n : int
        Cluster count to highlight with a red circle.
    """
    k_values = range(min_k, max_k + 1)
    best_wcss = wcss[best_n - min_k]
    fig, ax = plt.subplots(figsize=(4, 4), dpi=144)
    ax.plot(k_values, wcss, 'kx-')
    # Hollow red circle around the selected k
    ax.scatter(best_n, best_wcss, marker='o', color='red', facecolors='none', s=50)
    ax.set_xlim(min_k, max_k)
    ax.set_xlabel('k')
    ax.set_ylabel('WCSS')
    ax.set_title("Elbow Method (best value for k)", fontsize=10)
    plt.show()
    return

In [None]:
# Inclusive range of cluster counts to evaluate; shared by the search loop
# and the elbow plot so the two cannot drift apart.
MIN_K, MAX_K = 2, 10

wcss = []
best_n, best_score = None, -np.inf
for n in range(MIN_K, MAX_K + 1):
    score, inertia = one_silhoutte_inertia(n, norm)
    wcss.append(inertia)
    # Keep the cluster count with the highest silhouette score
    if score > best_score:
        best_n, best_score = n, score
    print(f"{n:2g} clusters silhoutte score = {score:0.2f}")

print(f"Best number of clusters = {best_n:2g}")
plot_elbow_method(MIN_K, MAX_K, wcss, best_n)

In [None]:
def plot_fitted_population_area(labels, xy, xkmeans, ykmeans, centre_labels):
    """
    Scatter the clustered points, coloured by label, with the cluster centres.

    Parameters
    ----------
    labels : array-like
        Cluster label of each row of `xy`.
    xy : numpy.ndarray
        Two-column array; column 0 is plotted on x, column 1 on y.
    xkmeans, ykmeans : array-like
        x and y coordinates of the cluster centres.
    centre_labels : array-like
        Label of each centre, used to colour its marker.
    """
    unique_labels = np.unique(labels)
    # One Set1 colour per cluster
    palette = ListedColormap(plt.cm.Set1(np.linspace(0, 1, len(unique_labels))))
    fig, ax = plt.subplots(figsize=(6, 4), dpi=144)
    points = ax.scatter(xy[:, 0], xy[:, 1], c=labels, cmap=palette, marker='o', label='Data')
    ax.scatter(xkmeans, ykmeans, c=centre_labels, cmap=palette, marker='x', s=100, label='Estimated Centres')
    colourbar = fig.colorbar(points, ax=ax)
    colourbar.set_ticks(unique_labels)
    ax.legend()
    ax.set_xlabel('Age')
    ax.set_ylabel('Sleep Duration')
    ax.set_title("K-Means Clusters", fontsize=10)
    plt.show()
    return

In [None]:
# Map the scaled data back to original units so the plot axes show real values
inv_norm = scaler.inverse_transform(norm)
# Fixed random_state keeps the cluster assignment reproducible across runs
kmeans = KMeans(n_clusters=best_n, n_init=20, random_state=42)
# fit done on the scaled x,y pairs
kmeans.fit(norm)
labels = kmeans.labels_
# the estimated cluster centres, converted back to original units
cen = scaler.inverse_transform(kmeans.cluster_centers_)
xkmeans = cen[:, 0]
ykmeans = cen[:, 1]
# label of each centre, used to colour the centre markers
cenlabels = kmeans.predict(kmeans.cluster_centers_)
plot_fitted_population_area(labels, inv_norm, xkmeans, ykmeans, cenlabels)

In [None]:
def predict_and_plot_cluster(values, kmeans, scaler, labels, xy, xkmeans, ykmeans, cenlabels):
    """
    Predict the cluster of each row of `values` and show it on the cluster plot.

    Parameters
    ----------
    values : array-like
        Two-column points in original units (the x-axis is labelled 'Age',
        the y-axis 'Sleep Duration').
    kmeans : fitted clusterer
        Object whose ``predict`` assigns a cluster to scaled points.
    scaler : fitted scaler
        Object whose ``transform`` maps `values` to the scaled space.
    labels : array-like
        Cluster label of each row of `xy`.
    xy : numpy.ndarray
        Two-column array of the clustered data in original units.
    xkmeans, ykmeans : array-like
        x and y coordinates of the cluster centres.
    cenlabels : array-like
        Label of each centre, used to colour its marker.

    Returns
    -------
    numpy.ndarray
        Predicted cluster index for each row of `values`.
    """
    # Normalize the input values and predict their clusters
    values_norm = scaler.transform(values)
    predicted_clusters = kmeans.predict(values_norm)
    # The inputs are already in original units, so they are plotted directly
    points = np.asarray(values, dtype=float)

    # Plot original data and centres
    colours = plt.cm.Set1(np.linspace(0, 1, len(np.unique(labels))))
    cmap = ListedColormap(colours)
    fig, ax = plt.subplots(figsize=(8, 4), dpi=144)
    s = ax.scatter(xy[:, 0], xy[:, 1], c=labels, cmap=cmap, marker='o', label='Data')
    ax.scatter(xkmeans, ykmeans, c=cenlabels, cmap=cmap, marker='x', s=100, label='Estimated Centres')

    # Draw all predictions in one call so the legend gets a single
    # 'Predictions' entry; each point takes the colour of its cluster.
    ax.scatter(points[:, 0], points[:, 1], color=colours[predicted_clusters],
               edgecolor='black', s=200, label='Predictions')
    for age, sleep, cluster in zip(points[:, 0], points[:, 1], predicted_clusters):
        ax.annotate(f'Cluster {cluster}', (age, sleep), textcoords="offset points", xytext=(0, 10), ha='center')

    # Add colour bar and labels
    cbar = fig.colorbar(s, ax=ax)
    cbar.set_ticks(np.unique(labels))
    ax.legend(loc='lower right')
    ax.set_xlabel('Age')
    ax.set_ylabel('Sleep Duration')
    ax.set_title('K-Means Cluster Prediction', fontsize=10)
    ax.set_xlim([20, 80])
    plt.show()
    return predicted_clusters

In [None]:
# Example (age, sleep duration) pairs to assign to clusters
input_values = np.array([[25, 8], [43, 6.2], [60, 8], [43, 7.7], [25, 6]])
# Predict the cluster of each pair and draw it on the cluster plot
predicted_clusters = predict_and_plot_cluster(
    input_values,
    kmeans=kmeans,
    scaler=scaler,
    labels=labels,
    xy=inv_norm,
    xkmeans=xkmeans,
    ykmeans=ykmeans,
    cenlabels=cenlabels,
)
print("Predicted clusters for the input values:")
for value, cluster in zip(input_values, predicted_clusters):
    print(f"Values: {value}, Cluster: {cluster}")
