Link to Datafile

In [2]:
# Raw GitHub URL of the employee-attrition CSV (split across two literals for readability)
datalink = (
    "https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/"
    "Datasets/Employee_Attrition_Data_Set.csv"
)

Data Load & Null Checks

In [3]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate


# Read the data file
df = pd.read_csv(datalink)

# Count missing values per column once and reuse the counts for the percentages
null_counts = df.isnull().sum()
null_percentages = 100 * null_counts / len(df)

# Combine counts and percentages into one table, columns with the most nulls first
null_table = pd.concat(
    [null_counts, null_percentages],
    axis=1,
    keys=['Null Count', 'Null Percentage'],
).sort_values('Null Count', ascending=False)

# Show only the columns that actually contain nulls; when there are none,
# print a single message instead of an "Empty DataFrame" dump followed by it.
columns_with_nulls = null_table[null_table['Null Count'] > 0]
if columns_with_nulls.empty:
    print("There are no null values in the dataset.")
else:
    print(columns_with_nulls)

# Quick summary of data
def summarize_dataframe(df):
    """Build a one-row-per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns
        'Data Type', 'Non-Null Count', 'Null Count', 'Unique Values',
        'First Value', 'Second Value', 'Third Value' and 'Null Percentage'
        (null share of the row count, rounded to one decimal).
        Sample-value columns are NaN for rows that do not exist, so frames
        with fewer than three rows are summarized instead of raising IndexError.
    """
    def sample_row(position):
        # df.iloc[position] raises IndexError past the last row; fall back to
        # an all-NaN row so short frames still produce a summary.
        if position < len(df):
            return df.iloc[position]
        return pd.Series(index=df.columns, dtype=object)

    summary = pd.DataFrame({
        'Data Type': df.dtypes,
        'Non-Null Count': df.notnull().sum(),
        'Null Count': df.isnull().sum(),
        'Unique Values': df.nunique(),
        'First Value': sample_row(0),
        'Second Value': sample_row(1),
        'Third Value': sample_row(2)
    })

    summary['Null Percentage'] = (100 * summary['Null Count'] / len(df)).round(1)

    return summary

# Descriptive statistics for every column (one row per column), extended with
# null counts, null percentages and dtypes taken from the frame itself.
describe_columns = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
summary_df = (
    df.describe(include='all')
    .transpose()
    .assign(**{
        'Null Count': df.isnull().sum(),
        'Null Percentage': (100 * df.isnull().sum() / len(df)).round(1),
        'Data Type': df.dtypes,
    })
)

# Keep the readable subset, dtype first and null information last
summary_df = summary_df[['Data Type', *describe_columns, 'Null Count', 'Null Percentage']]

print(summary_df)

Empty DataFrame
Columns: [Null Count, Null Percentage]
Index: []
There are no null values in the dataset.
                      Data Type    count       mean          std     min  \
Employee ID               int64  10000.0     5000.5   2886.89568     1.0   
Age                       int64  10000.0    40.5612    10.876483    22.0   
Gender                   object    10000        NaN          NaN     NaN   
Job Role                 object    10000        NaN          NaN     NaN   
Department               object    10000        NaN          NaN     NaN   
Monthly Income            int64  10000.0  8948.7503  3473.354793  3000.0   
Years at Company          int64  10000.0    14.3111     8.742572     0.0   
Number of Promotions      int64  10000.0     1.9583     1.426171     0.0   
Last Raise Percentage   float64  10000.0  10.022797     5.823696     0.0   
Distance from Office    float64  10000.0  26.632481    14.396393     1.0   
Job Satisfaction          int64  10000.0      5.407     2.

Drop Columns

In [4]:
# Drop the Employee ID column.
# errors='ignore' plus reassignment keeps this cell idempotent: re-running it
# after the column is already gone no longer raises KeyError.
columns_to_drop = ['Employee ID']
df = df.drop(columns=columns_to_drop, errors='ignore')

Renaming Columns for Ease of Use

In [None]:
# Rename specific columns (placeholder mapping of old name -> new name)
column_renames = {
    'old_name1': 'new_name1',
    'old_name2': 'new_name2',
}
df = df.rename(columns=column_renames)

Replacing Nulls with Mean of Column

In [None]:
# Define the columns you want to clean
columns_to_clean = ['COLUMN_NAME1', 'COLUMN_NAME2', 'COLUMN_NAME3']  # Replace with the names of the columns you want to clean

# Replace the nulls in each column with that column's mean.
# The filled Series is assigned back to the frame: calling fillna(inplace=True)
# on df[column] is chained assignment, which pandas 2.x warns about and which
# does not update df once Copy-on-Write is enabled.
for column in columns_to_clean:
    mean_value = df[column].mean()
    df[column] = df[column].fillna(mean_value)

Winsorize Columns with Extreme Outliers

In [None]:
# Columns to winsorize (placeholders: replace with your own column names)
columns_to_winsorize = [
    'COLUMN_NAME1',
    'COLUMN_NAME2',
    'COLUMN_NAME3',
]

# Value handed to winsorize() as `limits`, e.g. 0.05 for 5%
# NOTE(review): a single float presumably applies to both tails; confirm against the scipy docs
winsorize_limits = 0.05

# Overwrite each listed column with its winsorized values
for col_name in columns_to_winsorize:
    clipped_values = winsorize(df[col_name], limits=winsorize_limits)
    df[col_name] = clipped_values

Removing Nulls

In [None]:
# Columns that must be non-null (placeholders: replace with your own column names)
columns_to_clean = ['COLUMN_NAME1', 'COLUMN_NAME2', 'COLUMN_NAME3']

# Keep only the rows where every listed column has a value, overwriting df
rows_without_nulls = df[columns_to_clean].notna().all(axis=1)
df = df[rows_without_nulls]

Binning & Distribution

In [None]:
# Set plot style: apply seaborn's "whitegrid" theme before the plots below are drawn
sns.set_theme(style="whitegrid")

# Function to format y-axis labels with "K" for thousands
def format_yaxis(ax):
    """Relabel the y-axis ticks of ``ax`` in thousands with a ``K`` suffix.

    Whole thousands are shown as integers ('2K'); other ticks keep their
    fractional part ('0.5K', '1.5K') instead of being truncated, so
    neighbouring ticks such as 500 and 1000 no longer collapse to '0K' / '1K'
    duplicates.

    Parameters
    ----------
    ax : object exposing ``get_yticks``, ``set_yticks`` and ``set_yticklabels``
        The axes to relabel (a matplotlib Axes in this notebook).
    """
    def thousands_label(tick):
        thousands = tick / 1000
        if float(thousands).is_integer():
            return f'{int(thousands)}K'
        return f'{thousands:g}K'

    ylabels = ax.get_yticks()
    # Set the tick positions explicitly before replacing their labels
    ax.set_yticks(ylabels)
    ax.set_yticklabels([thousands_label(y) for y in ylabels])

# Function to bin data
def bin_data(data, bins, labels):
    """Cut ``data`` into the intervals given by ``bins`` and name them with ``labels``.

    The lowest bin edge is treated as inclusive (``include_lowest=True``).
    Returns whatever ``pd.cut`` produces for the given input.
    """
    binned = pd.cut(data, bins, labels=labels, include_lowest=True)
    return binned

# Bin GenHlth into labelled ranges.
# The first edge is -1 so that, with include_lowest=True in bin_data, only 0
# falls into the '0' bucket and the values 1-5 fall into '1-5'. With edges
# starting [0, 1, ...] the value 1 was filed under the '0' label.
df['GenHelth_binned'] = bin_data(df['GenHlth'],
                                 bins=[-1, 0, 5, 10, 15, 30],
                                 labels=['0', '1-5', '6-10', '11-15', '16+'])


# 1. Distribution of the target variable
plt.figure(figsize=(8, 6))
ax = sns.countplot(x='Diabetes_binary', data=df)  # Replace 'Diabetes_binary' with your target column name
format_yaxis(ax)
plt.title('Distribution of Target Variable', fontweight='bold')
plt.xlabel('Target Variable (Description)', fontweight='bold')  # Replace description with relevant information
plt.ylabel('Count', fontweight='bold')
plt.show()

# 2. Distribution of numerical features
numerical_features = ['BMI','PhysHlth','MentHlth']  # Replace with your numerical column names
df[numerical_features].hist(bins=15, figsize=(15, 10), layout=(3, 3))
plt.suptitle('Distribution of Numerical Features', fontweight='bold')
for ax in plt.gcf().axes:
    format_yaxis(ax)
plt.show()

# 3. Distribution of binary features
binary_features = ['Smoker', 'Fruits', 'Stroke']  # Replace with your binary column names
# Size the grid from the number of features instead of a fixed 4x3 layout,
# which left nine empty axes for three features.
binary_grid_cols = 3
binary_grid_rows = -(-len(binary_features) // binary_grid_cols)  # Ceiling division
# squeeze=False keeps `axes` two-dimensional even when the grid has a single row
fig, axes = plt.subplots(nrows=binary_grid_rows, ncols=binary_grid_cols,
                         figsize=(15, 4 * binary_grid_rows), squeeze=False)
for i, feature in enumerate(binary_features):
    row, col = divmod(i, binary_grid_cols)
    ax = sns.countplot(x=feature, data=df, ax=axes[row, col])
    format_yaxis(ax)
    axes[row, col].set_title(f'Distribution of {feature}', fontweight='bold')
# Remove grid cells left over when the feature count is not a multiple of the column count
for leftover_ax in axes.flat[len(binary_features):]:
    fig.delaxes(leftover_ax)
plt.suptitle('Distribution of Binary Features', fontweight='bold')
plt.tight_layout()
plt.show()

Further EDA

In [None]:
# 4. Correlation matrix
# Restrict the frame to its numeric columns and correlate them pairwise
corr_df = df.select_dtypes(include=[np.number])
numeric_columns = corr_df.columns
corr = corr_df.corr()

# Annotated heatmap of the correlation coefficients (two decimals per cell)
plt.figure(figsize=(15, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix', fontweight='bold')
plt.show()

# 5. Relationships between features and the target variable
# Update numerical_features list with binned versions
numerical_features = ['Column1_binned', 'Column2', 'Column2_binned', 'Column3_binned', 'Column4', 'Column5', 'Column6']  # Replace with your binned and other numerical columns
binary_features = ['Column7', 'Column8']  # Replace with your binary columns

# Calculate the number of rows and columns for the subplot grid.
# Numerical features fill the first rows; binary features start on a fresh row.
n_numerical = len(numerical_features)
n_binary = len(binary_features)
n_cols = 4
start_row = -(-n_numerical // n_cols)  # Ceiling division
n_rows = start_row + -(-n_binary // n_cols)  # Ceiling division

# squeeze=False keeps `axes` two-dimensional even when the grid has a single row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
fig.suptitle('Relationships between Features and Target Variable', fontsize=16, fontweight='bold')

# Work out the grid cell of every plot: (feature, row, col, rotate x tick labels?)
plot_layout = []
for i, feature in enumerate(numerical_features):
    row, col = divmod(i, n_cols)
    plot_layout.append((feature, row, col, True))
for i, feature in enumerate(binary_features):
    row, col = divmod(i, n_cols)
    plot_layout.append((feature, start_row + row, col, False))

# Draw one count plot per feature, split by the target variable
for feature, row, col, rotate_ticks in plot_layout:
    sns.countplot(x=feature, hue='TargetVariable', data=df, ax=axes[row, col])  # Replace 'TargetVariable' with your target column name
    axes[row, col].set_title(f'{feature}', fontweight='bold')
    axes[row, col].set_xlabel(feature)
    axes[row, col].set_ylabel('Number of Individuals')
    axes[row, col].legend(title='Target Variable', labels=['No', 'Yes'])  # Replace labels if needed
    if rotate_ticks:
        axes[row, col].tick_params(axis='x', rotation=45)
    format_yaxis(axes[row, col])  # Ensure you have the format_yaxis function defined

# Remove any unused subplots.
# Cells are tracked by (row, col) because the binary block starts on a new row:
# flat indices from n_numerical + n_binary onward would delete a plotted binary
# feature and leave the empty cell after the last numerical feature in place.
used_cells = {(row, col) for _, row, col, _ in plot_layout}
for row in range(n_rows):
    for col in range(n_cols):
        if (row, col) not in used_cells:
            fig.delaxes(axes[row, col])

plt.tight_layout()
plt.subplots_adjust(top=0.95)  # Adjust to make room for the suptitle
plt.show()

# 6. Relationships between features and the target variable (Percentage Stacked Bar Charts)
# squeeze=False keeps `axes` two-dimensional even when the grid has a single row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(24, 5*n_rows), squeeze=False)  # Increased figure width
fig.suptitle('Relationships between Features and Target Variable (Percentage)', fontsize=16, fontweight='bold')

# Function to create percentage stacked bar chart
def percentage_stacked_bar(feature, ax):
    """Draw, on ``ax``, the share of each target class within every level of ``feature``.

    Reads the module-level ``df``. Shares are proportions between 0 and 1
    (``value_counts(normalize=True)``), stacked per feature level.
    """
    # Calculate percentages
    percentages = df.groupby(feature, observed=True)['TargetVariable'].value_counts(normalize=True).unstack()  # Replace 'TargetVariable' with your target column name
    # Plot stacked bar chart
    percentages.plot(kind='bar', stacked=True, ax=ax, width=0.8)
    ax.set_ylim(0, 1)
    ax.set_ylabel('Percentage')
    ax.set_title(f'{feature}', fontweight='bold')
    ax.tick_params(axis='x', rotation=45)

    # Move legend outside the plot
    ax.legend(title='Target Variable', labels=['No', 'Yes'], bbox_to_anchor=(1.05, 1), loc='upper left')  # Replace labels if needed

# Grid cells that receive a plot; everything else is removed afterwards
used_cells = set()

# Plot relationships for numerical features
for i, feature in enumerate(numerical_features):
    row, col = divmod(i, n_cols)
    percentage_stacked_bar(feature, axes[row, col])
    used_cells.add((row, col))

# Plot relationships for binary features (they start on the row after the numerical ones)
start_row = -(-n_numerical // n_cols)  # Ceiling division
for i, feature in enumerate(binary_features):
    row, col = divmod(i, n_cols)
    row += start_row
    percentage_stacked_bar(feature, axes[row, col])
    used_cells.add((row, col))

# Remove any unused subplots.
# Cells are tracked by (row, col) because the binary block starts on a new row:
# flat indices from n_numerical + n_binary onward would delete a plotted binary
# feature and leave the empty cell after the last numerical feature in place.
for row in range(n_rows):
    for col in range(n_cols):
        if (row, col) not in used_cells:
            fig.delaxes(axes[row, col])

plt.tight_layout()
plt.subplots_adjust(top=0.95, right=0.9)  # Adjust to make room for the suptitle and legends
plt.show()

# Categorical columns to test against the target (placeholders: replace with your own)
categorical_columns = [
    'Column8', 'Column9', 'Column10', 'Column11', 'Column12',
    'Column13', 'Column14', 'Column15', 'Column16', 'Column17',
    'Column18', 'Column19', 'Column20', 'Column21', 'Column22',
]

# Target variable (placeholder: replace with your target column name)
target = 'TargetVariable'

# Chi-square test of each column's contingency table with the target;
# only the statistic and the p-value are kept.
chi2_results = {}
for column in categorical_columns:
    statistic, p_value, _, _ = chi2_contingency(pd.crosstab(df[column], df[target]))
    chi2_results[column] = {'chi2': statistic, 'p-value': p_value}

# One row per feature, largest Chi2 first (ties broken by smaller p-value)
chi2_results_df = pd.DataFrame.from_dict(chi2_results, orient='index').reset_index()
chi2_results_df.columns = ['Feature', 'Chi2', 'p-value']
chi2_results_df = (
    chi2_results_df
    .sort_values(by=['Chi2', 'p-value'], ascending=[False, True])
    .reset_index(drop=True)
)

# Display the results
print(chi2_results_df)

Chi-Squared Test

In [None]:
# Function to identify categorical columns
def get_categorical_columns(df):
    """Return the names of the ``object`` and ``category`` dtype columns of ``df`` as a list."""
    categorical_frame = df.select_dtypes(include=['object', 'category'])
    return list(categorical_frame.columns)

# Identify categorical columns in the DataFrame
categorical_columns = get_categorical_columns(df)

# Specify your target variable column name
target_variable = 'TargetVariable'  # Replace with your actual target variable column name

# Ensure the target variable is categorical.
# isinstance(..., pd.CategoricalDtype) replaces the deprecated
# pd.api.types.is_categorical_dtype check.
target_dtype = df[target_variable].dtype
if target_dtype != 'object' and not isinstance(target_dtype, pd.CategoricalDtype):
    df[target_variable] = df[target_variable].astype('category')

# Perform Chi-Square test for each categorical column against the target variable
chi2_results = {}
for column in categorical_columns:
    if column == target_variable:  # Skip the target variable itself
        continue
    contingency_table = pd.crosstab(df[column], df[target_variable])
    chi2, p, dof, ex = chi2_contingency(contingency_table)
    chi2_results[column] = {'chi2': chi2, 'p-value': p}

# Convert results to DataFrame.
# The column names are passed up front so that an empty result set (no
# categorical feature besides the target) yields an empty table instead of a
# ValueError from assigning three names to a one-column frame.
chi2_results_df = pd.DataFrame(
    [
        {'Feature': feature, 'Chi2': stats_row['chi2'], 'p-value': stats_row['p-value']}
        for feature, stats_row in chi2_results.items()
    ],
    columns=['Feature', 'Chi2', 'p-value'],
)

# Sort results by Chi2 score in descending order, then by p-value
chi2_results_df = (
    chi2_results_df
    .sort_values(by=['Chi2', 'p-value'], ascending=[False, True])
    .reset_index(drop=True)
)

# Display the results
print(chi2_results_df)

Convert Categorical to 0 and 1 (May not be Needed)

In [None]:
import pandas as pd

# The two examples below use their own small frames. They deliberately do NOT
# assign to `df`: doing so would replace the loaded dataset with toy data and
# break every later cell when the notebook is run top to bottom.

# Example 1: numeric column -> binary flag
numeric_example_df = pd.DataFrame({'Column1': [10, 5, 0, 8, 12]})

# Convert values > 0 to 1, and 0 or less to 0
numeric_example_df['Column1_categorical'] = (numeric_example_df['Column1'] > 0).astype(int)

print(numeric_example_df)


# Example 2: text column -> binary flag
text_example_df = pd.DataFrame({'text_column': ['apple', 'banana', 'cherry', 'apple']})

# List of values to match
values_to_match = ['apple', 'cherry']

# Convert text values to 1 if they match any value in the list, else 0
text_example_df['binary_column'] = text_example_df['text_column'].isin(values_to_match).astype(int)

print(text_example_df)

Feature Engineering

In [None]:
# Feature Engineered Columns
# Each line overwrites 'Column1' with the mean of four terms.
# NOTE(review): all four terms are the same placeholder column, so this is
# presumably a template for averaging four different columns; replace the
# names before relying on the result.
df['Column1'] = (df['Column1'] + df['Column1'] + df['Column1'] + df['Column1']) / 4
df['Column1'] = (df['Column1'] + df['Column1'] + df['Column1'] + df['Column1']) / 4



Modelling I

In [None]:
# Define feature columns and target variable
feature_columns = ['feature1', 'feature2', 'feature3'] # Replace with your actual feature columns
target_variable = 'Target Variable'

# Split the data (80% train / 20% test, fixed seed)
X = df[feature_columns]
y = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models.
# The tree ensembles are randomized; they get the same seed as the split so
# the reported metrics are identical from one run to the next.
models = {
    'Logistic Regression': LogisticRegression(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Initialize a list to store the results:
# one [name, accuracy, precision, recall, f1] row per model
results = []

# Train and evaluate each model; precision/recall/F1 are weighted averages over the classes
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    results.append([model_name, accuracy, precision, recall, f1])

# Create a DataFrame to display the results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Print the results in a tabular format
print(tabulate(results_df, headers='keys', tablefmt='psql'))

In [None]:
# Create a DataFrame to display the results.
# `results` holds five values per model after the weighted-average modelling
# cell and eight after the per-class "Full Output" cell. The header is chosen
# from the row width so this cell works after either one, instead of raising
# ValueError when eight names are applied to five-value rows.
summary_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
per_class_columns = ['Model', 'Accuracy', 'Precision_0', 'Recall_0', 'F1_0', 'Precision_1', 'Recall_1', 'F1_1']
has_per_class_metrics = bool(results) and len(results[0]) == len(per_class_columns)
results_columns = per_class_columns if has_per_class_metrics else summary_columns
results_df = pd.DataFrame(results, columns=results_columns)

# Print the results in a tabular format
print(tabulate(results_df, headers='keys', tablefmt='psql'))

Full Output

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns

# Load your data
# df = pd.read_csv('your_data.csv') # Uncomment and replace with your file path

# Define feature columns and target variable
feature_columns = ['feature1', 'feature2', 'feature3'] # Replace with your actual feature columns
target_variable = 'Target Variable'

# Split the data (80% train / 20% test, fixed seed)
X = df[feature_columns]
y = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models.
# The tree ensembles are randomized; they get the same seed as the split so
# the reported metrics are identical from one run to the next.
models = {
    'Logistic Regression': LogisticRegression(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Class labels used for the per-class metrics and the confusion-matrix axes
# NOTE(review): assumes the target is encoded as 0/1; confirm for your data
class_labels = [0, 1]

# Initialize a list to store the results (one eight-value row per model)
results = []

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision_0 = precision_score(y_test, y_pred, pos_label=0)
    precision_1 = precision_score(y_test, y_pred, pos_label=1)
    recall_0 = recall_score(y_test, y_pred, pos_label=0)
    recall_1 = recall_score(y_test, y_pred, pos_label=1)
    f1_0 = f1_score(y_test, y_pred, pos_label=0)
    f1_1 = f1_score(y_test, y_pred, pos_label=1)

    results.append([model_name, accuracy, precision_0, recall_0, f1_0, precision_1, recall_1, f1_1])

    # Print the confusion matrix.
    # Passing labels explicitly fixes the row/column order to match the tick
    # labels and keeps the matrix 2x2 even if a class is absent from the test split.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # Print the classification report
    print(f'{model_name} Classification Report:')
    print(classification_report(y_test, y_pred))

# Create a DataFrame to display the results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision_0', 'Recall_0', 'F1_0', 'Precision_1', 'Recall_1', 'F1_1'])

# Print the results in a tabular format
print(tabulate(results_df, headers='keys', tablefmt='psql'))

# Optionally, save the results to a CSV file
# results_df.to_csv('model_results.csv', index=False) # Uncomment to save results to a file


Just Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load your data
# df = pd.read_csv('your_data.csv') # Uncomment and replace with your file path

# Define feature columns and target variable
feature_columns = ['feature1', 'feature2', 'feature3'] # Replace with your actual feature columns
target_variable = 'Target Variable'

# Split the data (80% train / 20% test, fixed seed)
X = df[feature_columns]
y = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate class weights to give more weight to the minority class
class_counts = y_train.value_counts()
minority_class = class_counts.idxmin()
majority_class = class_counts.idxmax()

# Assign weights: every class starts at 1 and the minority class gets 2x.
# When the classes are perfectly balanced idxmin and idxmax return the same
# label, so no class is up-weighted in that case.
class_weight = {label: 1 for label in class_counts.index}
if class_counts.min() < class_counts.max():
    class_weight[minority_class] = 2

# Initialize and train the logistic regression model.
# The weights are passed to the estimator; previously they were computed but
# never used, so the model was trained unweighted.
model = LogisticRegression(class_weight=class_weight)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate metrics (per-class scores assume the target is encoded as 0/1)
accuracy = accuracy_score(y_test, y_pred)
precision_0 = precision_score(y_test, y_pred, pos_label=0)
precision_1 = precision_score(y_test, y_pred, pos_label=1)
recall_0 = recall_score(y_test, y_pred, pos_label=0)
recall_1 = recall_score(y_test, y_pred, pos_label=1)
f1_0 = f1_score(y_test, y_pred, pos_label=0)
f1_1 = f1_score(y_test, y_pred, pos_label=1)

# Print the results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision for class 0: {precision_0:.4f}')
print(f'Recall for class 0: {recall_0:.4f}')
print(f'F1 Score for class 0: {f1_0:.4f}')
print(f'Precision for class 1: {precision_1:.4f}')
print(f'Recall for class 1: {recall_1:.4f}')
print(f'F1 Score for class 1: {f1_1:.4f}')

# Print the confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Print the classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))

Logistic Regression with Threshold Adjustment

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load your data
# df = pd.read_csv('your_data.csv') # Uncomment and replace with your file path

# Placeholder feature / target names: replace with your actual columns
feature_columns = ['feature1', 'feature2', 'feature3']
target_variable = 'Target Variable'

# 80/20 train-test split with a fixed seed
X, y = df[feature_columns], df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Logistic regression with balanced class weights
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

# Custom decision threshold: predict 1 whenever column 1 of predict_proba
# reaches the threshold (adjust the value as needed)
threshold = 0.3
y_probs = model.predict_proba(X_test)[:, 1]
y_pred = (y_probs >= threshold).astype(int)

# Overall accuracy plus precision / recall / F1 with each of 0 and 1 as the positive label
accuracy = accuracy_score(y_test, y_pred)
precision_0, precision_1 = [precision_score(y_test, y_pred, pos_label=label) for label in (0, 1)]
recall_0, recall_1 = [recall_score(y_test, y_pred, pos_label=label) for label in (0, 1)]
f1_0, f1_1 = [f1_score(y_test, y_pred, pos_label=label) for label in (0, 1)]

# Print the metrics, one per line
metric_lines = [
    f'Accuracy: {accuracy:.4f}',
    f'Precision for class 0: {precision_0:.4f}',
    f'Recall for class 0: {recall_0:.4f}',
    f'F1 Score for class 0: {f1_0:.4f}',
    f'Precision for class 1: {precision_1:.4f}',
    f'Recall for class 1: {recall_1:.4f}',
    f'F1 Score for class 1: {f1_1:.4f}',
]
for line in metric_lines:
    print(line)

# Confusion matrix followed by the full classification report
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

print('Classification Report:')
print(classification_report(y_test, y_pred))
