Link to Datafile

In [2]:
# Raw GitHub URL of the CSV dataset; adjacent string literals are joined into a single URL.
datalink = (
    "https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/"
    "Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Legacy/"
    "50-50%20Balanced%20Dataset%20(RAW).csv"
)

Data Load & Null Checks

In [29]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats.mstats import winsorize

# Load the dataset from the location stored in `datalink`
df = pd.read_csv(datalink)

# Missing values per column: absolute count and share of all rows (in percent)
null_counts = df.isnull().sum()
null_percentages = 100 * null_counts / len(df)

# Single table holding both measures, columns with the most nulls first
null_table = pd.DataFrame(
    {'Null Count': null_counts, 'Null Percentage': null_percentages}
).sort_values('Null Count', ascending=False)

# Show only the columns that actually contain nulls
has_nulls = null_table['Null Count'] > 0
print(null_table[has_nulls])

# Say so explicitly when no column has a null
if not has_nulls.any():
    print("There are no null values in the dataset.")

# Quick summary of data
def summarize_dataframe(df):
    """Build a one-row-per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with the columns 'Data Type',
        'Non-Null Count', 'Null Count', 'Unique Values', 'First Value',
        'Second Value', 'Third Value' and 'Null Percentage' (rounded to one
        decimal). A sample-value column is all-missing when ``df`` has too
        few rows to supply it.
    """
    def _row_or_missing(position):
        # df.iloc[position] raises IndexError past the last row, so short
        # frames get an all-missing placeholder row instead.
        if position < len(df):
            return df.iloc[position]
        return pd.Series(np.nan, index=df.columns, dtype=object)

    summary = pd.DataFrame({
        'Data Type': df.dtypes,
        'Non-Null Count': df.notnull().sum(),
        'Null Count': df.isnull().sum(),
        'Unique Values': df.nunique(),
        'First Value': _row_or_missing(0),
        'Second Value': _row_or_missing(1),
        'Third Value': _row_or_missing(2)
    })

    # Share of rows that are null in each column, in percent
    summary['Null Percentage'] = (100 * summary['Null Count'] / len(df)).round(1)

    return summary

# Descriptive statistics for every column, transposed to one row per column
summary_df = df.describe(include='all').T

# Attach missing-value and dtype information alongside the statistics
column_null_counts = df.isnull().sum()
summary_df['Null Count'] = column_null_counts
summary_df['Null Percentage'] = (100 * column_null_counts / len(df)).round(1)
summary_df['Data Type'] = df.dtypes

# Reorder for readability: dtype first, statistics next, null metrics last
ordered_columns = [
    'Data Type',
    'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
    'Null Count', 'Null Percentage',
]
summary_df = summary_df[ordered_columns]

print(summary_df)

                   Null Count  Null Percentage
HvyAlcoholConsump           2         0.002829
Sex                         2         0.002829
NoDocbcCost                 2         0.002829
                     Data Type    count          mean           std   min  \
ID                       int64  70692.0  35346.500000  20407.166952   1.0   
Diabetes_binary          int64  70692.0      0.500000      0.500004   0.0   
HighBP                   int64  70692.0      0.563458      0.495960   0.0   
HighChol                 int64  70692.0      0.525703      0.499342   0.0   
CholCheck                int64  70692.0      0.975259      0.155336   0.0   
BMI                      int64  70692.0     29.856985      7.113954  12.0   
Smoker                   int64  70692.0      0.475273      0.499392   0.0   
Stroke                   int64  70692.0      0.062171      0.241468   0.0   
HeartDiseaseorAttack     int64  70692.0      0.147810      0.354914   0.0   
PhysActivity             int64  70692.0   

Renaming Columns for Ease of Use

In [None]:
# Mapping of existing column name -> new column name
# NOTE(review): 'old_name1'/'old_name2' look like template placeholders - confirm the real names
column_renames = {'old_name1': 'new_name1', 'old_name2': 'new_name2'}
df = df.rename(columns=column_renames)

Replacing Nulls with Mean of Column

In [None]:
# Define the columns you want to clean
columns_to_clean = ['COLUMN_NAME1', 'COLUMN_NAME2', 'COLUMN_NAME3']  # Replace with the names of the columns you want to clean

# Iterate through each column and replace null values with the column mean.
# The filled column is assigned back to df: calling fillna(inplace=True) on
# df[column] is chained assignment, which emits a warning in pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.
for column in columns_to_clean:
    mean_value = df[column].mean()
    df[column] = df[column].fillna(mean_value)

Winsorize Columns with Bad Outliers

In [None]:
# Columns whose extreme values should be winsorized
columns_to_winsorize = ['COLUMN_NAME1', 'COLUMN_NAME2', 'COLUMN_NAME3']  # Replace with the names of the columns you want to winsorize

# Value handed to winsorize() as its `limits` argument
winsorize_limits = 0.05  # Replace with your desired limit (e.g., 0.05 for 5%)

# Winsorize each listed column and write the result back into df
for col_name in columns_to_winsorize:
    winsorized_values = winsorize(df[col_name], limits=winsorize_limits)
    df[col_name] = winsorized_values

Removing Nulls

In [None]:
# Columns that must be populated for a row to be kept
columns_to_clean = ['COLUMN_NAME1', 'COLUMN_NAME2', 'COLUMN_NAME3']  # Replace 'COLUMN_NAME' with the name of the column you want to clean

# Keep only the rows that have a value in every listed column, overwriting df
rows_without_nulls = df[columns_to_clean].notna().all(axis=1)
df = df[rows_without_nulls]

Binning & Distribution

In [None]:
# Set plot style: apply seaborn's "whitegrid" theme before any figure in this notebook is drawn
sns.set_theme(style="whitegrid")

# Function to format y-axis labels with "K" for thousands
def format_yaxis(ax):
    """Relabel the y-axis ticks of ``ax`` in thousands with a "K" suffix.

    The current tick positions are first pinned with ``set_yticks`` and then
    relabelled, so each label corresponds to the tick it was computed from.
    Fractional thousands are preserved (500 -> "0.5K", 2500 -> "2.5K");
    truncating to a whole number would label 500 as "0K" and 2500 as "2K".

    Parameters
    ----------
    ax : object
        Anything exposing ``get_yticks``, ``set_yticks`` and
        ``set_yticklabels`` (the notebook passes matplotlib axes).
    """
    ticks = ax.get_yticks()
    ax.set_yticks(ticks)
    # ':g' drops trailing zeros, so whole thousands still render as e.g. "10K"
    ax.set_yticklabels([f'{tick / 1000:g}K' for tick in ticks])

# Function to bin data
def bin_data(data, bins, labels):
    """Cut ``data`` into labelled intervals.

    Thin wrapper around ``pandas.cut`` that always passes
    ``include_lowest=True``.

    Parameters
    ----------
    data : values accepted by ``pandas.cut``
    bins : bin edges forwarded to ``pandas.cut``
    labels : one label per resulting interval

    Returns
    -------
    The result of ``pandas.cut`` for these arguments.
    """
    binned = pd.cut(data, bins, labels=labels, include_lowest=True)
    return binned

# Bin GenHlth into labelled ranges.
# The leading -1 edge yields the intervals [-1, 0], (0, 5], (5, 10], (10, 15], (15, 30],
# so whole-number values land under the label that names them (0 -> '0', 1 -> '1-5').
# With a first edge of 0 the value 1 would be filed under '0'.
# NOTE(review): the new column is spelled 'GenHelth_binned' (source column is 'GenHlth');
# left unchanged in case other cells read it.
df['GenHelth_binned'] = bin_data(df['GenHlth'],
                                 bins=[-1, 0, 5, 10, 15, 30],
                                 labels=['0', '1-5', '6-10', '11-15', '16+'])


# 1. Distribution of the target variable
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='Diabetes_binary', data=df)  # Replace 'TargetVariable' with your target column name
format_yaxis(ax)
plt.title('Distribution of Target Variable', fontweight='bold')
plt.xlabel('Target Variable (Description)', fontweight='bold')  # Replace description with relevant information
plt.ylabel('Count', fontweight='bold')
plt.show()

# 2. Distribution of numerical features
numerical_features = ['BMI','PhysHlth','MentHlth']  # Replace with your numerical column names
df[numerical_features].hist(bins=15, figsize=(15, 10), layout=(3, 3))
plt.suptitle('Distribution of Numerical Features', fontweight='bold')
for ax in plt.gcf().axes:
    format_yaxis(ax)
plt.show()

# 3. Distribution of binary features
binary_features = ['Smoker', 'Fruits', 'Stroke']  # Replace with your binary column names

# Size the grid from the number of features instead of a fixed 4x3, which
# left nine blank panels for three features. Height stays at 4 per row.
n_binary_cols = 3
n_binary_rows = max(1, -(-len(binary_features) // n_binary_cols))  # Ceiling division
# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(nrows=n_binary_rows, ncols=n_binary_cols,
                         figsize=(15, 4 * n_binary_rows), squeeze=False)
for i, feature in enumerate(binary_features):
    row, col = divmod(i, n_binary_cols)
    ax = sns.countplot(x=feature, data=df, ax=axes[row, col])
    format_yaxis(ax)
    axes[row, col].set_title(f'Distribution of {feature}', fontweight='bold')

# Remove grid cells that received no plot
for i in range(len(binary_features), n_binary_rows * n_binary_cols):
    row, col = divmod(i, n_binary_cols)
    fig.delaxes(axes[row, col])

plt.suptitle('Distribution of Binary Features', fontweight='bold')
plt.tight_layout()
plt.show()

EDA 2

In [None]:
# 4. Correlation matrix
# Restrict the frame to its numeric columns, then correlate them pairwise
corr_df = df.select_dtypes(include=[np.number])
numeric_columns = corr_df.columns
corr = corr_df.corr()

# Annotated heatmap of the correlation coefficients
plt.figure(figsize=(15, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix', fontweight='bold')
plt.show()

# 5. Relationships between features and the target variable
# Update numerical_features list with binned versions
numerical_features = ['Column1_binned', 'Column2', 'Column2_binned', 'Column3_binned', 'Column4', 'Column5', 'Column6']  # Replace with your binned and other numerical columns

# Grid geometry: numerical features fill the first rows and the binary
# features start on a fresh row below them.
n_numerical = len(numerical_features)
n_binary = len(binary_features)
n_cols = 4
n_rows = -(-n_numerical // n_cols) + -(-n_binary // n_cols)  # Ceiling division

# squeeze=False keeps `axes` two-dimensional even when the grid has one row,
# so axes[row, col] indexing below always works.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
fig.suptitle('Relationships between Features and Target Variable', fontsize=16, fontweight='bold')

# Plot relationships for numerical features
for i, feature in enumerate(numerical_features):
    row, col = divmod(i, n_cols)
    sns.countplot(x=feature, hue='TargetVariable', data=df, ax=axes[row, col])  # Replace 'TargetVariable' with your target column name
    axes[row, col].set_title(f'{feature}', fontweight='bold')
    axes[row, col].set_xlabel(feature)
    axes[row, col].set_ylabel('Number of Individuals')
    axes[row, col].legend(title='Target Variable', labels=['No', 'Yes'])  # Replace labels if needed
    axes[row, col].tick_params(axis='x', rotation=45)
    format_yaxis(axes[row, col])

# Plot relationships for binary features
start_row = -(-n_numerical // n_cols)  # Ceiling division
for i, feature in enumerate(binary_features):
    row, col = divmod(i, n_cols)
    row += start_row
    sns.countplot(x=feature, hue='TargetVariable', data=df, ax=axes[row, col])  # Replace 'TargetVariable' with your target column name
    axes[row, col].set_title(f'{feature}', fontweight='bold')
    axes[row, col].set_xlabel(feature)
    axes[row, col].set_ylabel('Number of Individuals')
    axes[row, col].legend(title='Target Variable', labels=['No', 'Yes'])  # Replace labels if needed
    format_yaxis(axes[row, col])

# Remove any unused subplots.
# Empty cells are not contiguous: they can trail the numerical block (before
# the binary row starts) as well as trail the binary block. Counting back from
# the end of the grid would delete drawn binary plots and keep blank cells.
first_binary_position = start_row * n_cols
unused_positions = list(range(n_numerical, first_binary_position))
unused_positions.extend(range(first_binary_position + n_binary, n_rows * n_cols))
for i in unused_positions:
    row, col = divmod(i, n_cols)
    fig.delaxes(axes[row, col])

plt.tight_layout()
plt.subplots_adjust(top=0.95)  # Adjust to make room for the suptitle
plt.show()

# 6. Relationships between features and the target variable (Percentage Stacked Bar Charts)
# squeeze=False keeps `axes` two-dimensional even when n_rows is 1, so the
# axes[row, col] lookups used for this figure do not fail on a single-row grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(24, 5*n_rows), squeeze=False)  # Increased figure width
fig.suptitle('Relationships between Features and Target Variable (Percentage)', fontsize=16, fontweight='bold')

# Function to create percentage stacked bar chart
def percentage_stacked_bar(feature, ax):
    """Draw a stacked bar chart of the target split within each level of ``feature``.

    Reads the module-level ``df``. For every value of ``feature`` the share of
    each 'TargetVariable' value is computed and drawn as one stacked bar on
    ``ax``. Shares are scaled to 0-100 so the axis agrees with its
    'Percentage' label; a target value that never occurs within a group
    counts as 0 rather than NaN.

    Parameters
    ----------
    feature : str
        Column of ``df`` to group by.
    ax : matplotlib axes
        Axes to draw on.
    """
    # Share of each target value within every feature level, in percent
    percentages = (
        df.groupby(feature, observed=True)['TargetVariable']  # Replace 'TargetVariable' with your target column name
        .value_counts(normalize=True)
        .unstack(fill_value=0)
        .mul(100)
    )
    # Plot stacked bar chart
    percentages.plot(kind='bar', stacked=True, ax=ax, width=0.8)
    ax.set_ylim(0, 100)
    ax.set_ylabel('Percentage')
    ax.set_title(f'{feature}', fontweight='bold')
    ax.tick_params(axis='x', rotation=45)

    # Move legend outside the plot
    # NOTE(review): the labels are hard-coded and assume two target values ordered No, Yes - confirm
    ax.legend(title='Target Variable', labels=['No', 'Yes'], bbox_to_anchor=(1.05, 1), loc='upper left')  # Replace labels if needed

# Plot relationships for numerical features
for i, feature in enumerate(numerical_features):
    row, col = divmod(i, n_cols)
    percentage_stacked_bar(feature, axes[row, col])

# Plot relationships for binary features (they start on the first row after the numerical block)
start_row = -(-n_numerical // n_cols)  # Ceiling division
for i, feature in enumerate(binary_features):
    row, col = divmod(i, n_cols)
    row += start_row
    percentage_stacked_bar(feature, axes[row, col])

# Remove any unused subplots.
# Empty cells are not contiguous: they can trail the numerical block (before
# the binary row starts) as well as trail the binary block. Counting back from
# the end of the grid would delete drawn binary plots and keep blank cells.
first_binary_position = start_row * n_cols
unused_positions = list(range(n_numerical, first_binary_position))
unused_positions.extend(range(first_binary_position + n_binary, n_rows * n_cols))
for i in unused_positions:
    row, col = divmod(i, n_cols)
    fig.delaxes(axes[row, col])

plt.tight_layout()
plt.subplots_adjust(top=0.95, right=0.9)  # Adjust to make room for the suptitle and legends
plt.show()

# Categorical columns to test against the target
categorical_columns = ['Column8', 'Column9', 'Column10', 'Column11', 'Column12', 'Column13', 'Column14', 'Column15', 'Column16', 'Column17', 'Column18', 'Column19', 'Column20', 'Column21', 'Column22']  # Replace with your categorical columns

# Column holding the target variable
target = 'TargetVariable'  # Replace with your target column name

# Chi-square test per column on its cross-tabulation with the target
chi2_results = {}
for feature_name in categorical_columns:
    observed_counts = pd.crosstab(df[feature_name], df[target])
    statistic, p_value, _dof, _expected = chi2_contingency(observed_counts)
    chi2_results[feature_name] = {'chi2': statistic, 'p-value': p_value}

# One row per tested column
chi2_results_df = pd.DataFrame.from_dict(chi2_results, orient='index').reset_index()
chi2_results_df.columns = ['Feature', 'Chi2', 'p-value']

# Largest Chi2 first, ties broken by smallest p-value, then renumber the rows
chi2_results_df = (
    chi2_results_df
    .sort_values(by=['Chi2', 'p-value'], ascending=[False, True])
    .reset_index(drop=True)
)

# Display the results
print(chi2_results_df)
