In [2]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime so files under it can be read by path.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
# Read the CSV stored in the mounted Drive folder and preview its first five rows.
csv_path = '/content/drive/MyDrive/Lung Function Test/new3.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Dataset dimensions: df.shape is (rows, columns).
num_rows = df.shape[0]
num_cols = df.shape[1]

print(f"Number of rows: {num_rows}", f"Number of columns: {num_cols}", sep="\n")

In [None]:
# Summary of the dataset as printed by DataFrame.info()
# (column names, non-null counts and dtypes).
df.info()

In [None]:
# Number of missing values in each column.
print(df.isna().sum())

In [None]:
# Descriptive statistics for the quantitative variables (features):
# count, mean, std, min, quartiles and max of each numerical column.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histograms of the demographic features, used to check how symmetric each
# distribution is.
# Age distribution
plt.figure(figsize=(8, 6))
ax = sns.histplot(data=df, x='Age', bins=20, kde=True, color='skyblue')
ax.set(
    title='Distribution of Age in Obstructive Patients',
    xlabel='Age',
    ylabel='Frequency',
)
plt.show()

# Skewness of Age, stored under a name that matches the column it describes
# so it cannot be confused with (or overwrite) the BMI skewness.
age_skew = df['Age'].skew()
print(f"Age Skewness: {age_skew:.3f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Height distribution
# NOTE(review): the column is selected by position (index 3); this assumes the
# CSV column order is fixed - confirm, and prefer the column name if known.
height = df.iloc[:, 3]

plt.figure(figsize=(8, 6))
# The values are passed directly as a vector, so no `data=` frame is needed.
ax = sns.histplot(x=height, bins=20, kde=True, color='skyblue')
ax.set(
    title='Distribution of Height in Obstructive Patients',
    xlabel='Height',
    ylabel='Frequency',
)
plt.show()

# Skewness of the same column that was plotted, under a matching name.
height_skew = height.skew()
print(f"Height Skewness: {height_skew:.3f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Weight distribution
# NOTE(review): the column is selected by position (index 4); this assumes the
# CSV column order is fixed - confirm, and prefer the column name if known.
weight = df.iloc[:, 4]

plt.figure(figsize=(8, 6))
# The values are passed directly as a vector, so no `data=` frame is needed.
ax = sns.histplot(x=weight, bins=20, kde=True, color='skyblue')
ax.set(
    title='Distribution of Weight in Obstructive Patients',
    xlabel='Weight',
    ylabel='Frequency',
)
plt.show()

# Skewness of the same column that was plotted, under a matching name.
weight_skew = weight.skew()
print(f"Weight Skewness: {weight_skew:.3f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# BMI distribution
plt.figure(figsize=(8, 6))
ax = sns.histplot(data=df, x='BMI', bins=20, kde=True, color='skyblue')
ax.set(
    title='Distribution of BMI in Obstructive Patients',
    xlabel='BMI',
    ylabel='Frequency',
)
plt.show()

# Skewness is computed from the same named column that is plotted above
# (not from a positional index), and the printed label names BMI.
bmi_skew = df['BMI'].skew()
print(f"BMI Skewness: {bmi_skew:.3f}")

In [None]:
# counts and percentages of qualitative variables
# Frequency table for the Gender column: absolute counts first, then percentages.
gender_counts = df['Gender'].value_counts()
print(gender_counts)

gender_percent = df['Gender'].value_counts(normalize=True).mul(100)
print(gender_percent)

In [None]:
# Frequency table for the BMI_Cat column: absolute counts first, then percentages.
# Stored under BMI-specific names so the Gender tables computed earlier
# (gender_counts / gender_percent) are not silently overwritten.
bmi_cat_counts = df['BMI_Cat'].value_counts()
print(bmi_cat_counts)

bmi_cat_percent = df['BMI_Cat'].value_counts(normalize=True) * 100
print(bmi_cat_percent)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pointbiserialr, chi2_contingency, spearmanr

# --------------------------
# Mixed correlation coefficients and p-values (shown as heatmaps below)
# --------------------------
def _phi_coefficient(x, y):
    """Return the signed phi coefficient and chi-square p-value of two binary series.

    The 2x2 contingency table is tested WITHOUT Yates' continuity correction:
    the identity phi = sqrt(chi2 / n) only holds for the uncorrected statistic
    (with the correction, a column compared with itself gives phi < 1).
    `n` is the number of cross-tabulated observations, and the sign is taken
    from the table determinant, with categories in the sorted order used by
    `pd.crosstab`. Returns (nan, nan) if the table is not 2x2.
    """
    # Distinct names keep crosstab well-defined when a column meets itself.
    table = pd.crosstab(x.rename("row"), y.rename("col"))
    if table.shape != (2, 2):
        return np.nan, np.nan

    chi2, p, _, _ = chi2_contingency(table, correction=False)
    counts = table.to_numpy()
    n_obs = counts.sum()
    determinant = counts[0, 0] * counts[1, 1] - counts[0, 1] * counts[1, 0]
    return np.sign(determinant) * np.sqrt(chi2 / n_obs), p


def mixed_corr_matrix_with_pvalues(df):
    """Compute pairwise association coefficients and p-values for mixed columns.

    A column is treated as binary when it has exactly two distinct non-null
    values. For each pair of columns the measure is chosen as follows:

    * binary / binary      -> phi coefficient (signed, from the uncorrected
                              chi-square statistic of the 2x2 table)
    * non-binary / non-binary -> Spearman rank correlation
    * binary / non-binary  -> point-biserial correlation

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to compare. Binary columns used in the point-biserial case
        must be numeric (e.g. already encoded as 0/1).

    Returns
    -------
    (corr_matrix, pval_matrix) : tuple of pandas.DataFrame
        Square frames indexed and labelled by ``df.columns`` holding the
        coefficients and their p-values.
    """
    columns = list(df.columns)
    n = len(columns)
    corr_values = np.zeros((n, n))
    pval_values = np.ones((n, n))

    # The binary/non-binary classification is loop-invariant: compute it once.
    is_binary = [df[col].nunique() == 2 for col in columns]

    for i, col1 in enumerate(columns):
        # All three measures are symmetric, so only the upper triangle
        # (including the diagonal) is computed and then mirrored.
        for j in range(i, n):
            col2 = columns[j]
            if is_binary[i] and is_binary[j]:
                corr, p = _phi_coefficient(df[col1], df[col2])
            elif not is_binary[i] and not is_binary[j]:
                corr, p = spearmanr(df[col1], df[col2])
            elif is_binary[i]:
                corr, p = pointbiserialr(df[col1], df[col2])
            else:
                # pointbiserialr expects the binary variable first.
                corr, p = pointbiserialr(df[col2], df[col1])

            corr_values[i, j] = corr_values[j, i] = corr
            pval_values[i, j] = pval_values[j, i] = p

    corr_matrix = pd.DataFrame(corr_values, columns=df.columns, index=df.columns)
    pval_matrix = pd.DataFrame(pval_values, columns=df.columns, index=df.columns)
    return corr_matrix, pval_matrix

# --------------------------
# Load and prepare data
# --------------------------

# Replace with your actual DataFrame loading
# data1 = pd.read_csv('your_file.csv')

# Assuming `df` is your original DataFrame:
# Take an independent copy of the selected columns: assigning new columns to a
# bare iloc slice raises SettingWithCopyWarning and depends on pandas'
# view-versus-copy behaviour.
df_New = df.iloc[:, 1:-11].copy()  # Adjust this slice as needed

# Custom label encoding for the Gender and Type columns.
# Values that are not keys of the mapping become NaN after .map().
custom_mapping = {'Male': 0, 'Female': 1}
cm = {'T': 1, 'F': 0}
df_New['Gender'] = df_New['Gender'].map(custom_mapping)
df_New['Type'] = df_New['Type'].map(cm)
# --------------------------
# Compute the coefficient and p-value matrices
# --------------------------
corr_matrix, pval_matrix = mixed_corr_matrix_with_pvalues(df_New)

# --------------------------
# Heatmap of the correlation coefficients
# --------------------------
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title("Correlation Coefficient Matrix Heatmap - Obstructive and Restrictive Patients")
plt.show()

# --------------------------
# Heatmap of the p-values
# --------------------------
pval_fig, pval_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(pval_matrix, annot=True, cmap='viridis', fmt='.3f', ax=pval_ax)
pval_ax.set_title("P-Value Matrix Heatmap - Obstructive Patient and Restrictive Patients")
plt.show()
