# **Project Title: Predicting divorce**
by Kabir, Rodrigue and Sertac

**Research main objective**

This project aims to develop and validate predictive models of divorce using multidimensional determinants (demographic, socioeconomic, relational, and psychological characteristics), and to identify the key factors that most strongly contribute to marital dissolution. 

Specifically, it seeks at 

-	Training and comparing multiple supervised learning models predicting divorce
-	Identifiy the most important predictors of divorce


**Research questions**

-	Which supervised machine-learning model offers the most reliable and robust prediction of divorce?
-	Which factors contribute most to predicting divorce ?


**Methods**
-	Exploration of the dataset 
-	Preparation of the dataset
-	Training and comparison of supervised learning models
-	Identification of  the most important predictors of divorce


In [15]:
#Load libraries that will be used throughout the project (will be continuouysly updated)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Load the dataset
df = pd.read_csv('divorce.csv')
df.head()

Unnamed: 0,age_at_marriage,marriage_duration_years,num_children,education_level,employment_status,combined_income,religious_compatibility,cultural_background_match,communication_score,conflict_frequency,...,mental_health_issues,infidelity_occurred,counseling_attended,social_support,shared_hobbies_count,marriage_type,pre_marital_cohabitation,domestic_violence_history,trust_score,divorced
0,30,1,1,Bachelor,Full-time,64001,Different Religion,Yes,5.536016,3,...,No,No,No,8.428183,5,Love,Yes,No,6.262411,1
1,27,2,2,Master,Full-time,86221,Same Religion,Yes,5.810172,3,...,No,Yes,No,5.297221,1,Love,Yes,No,6.769384,1
2,31,6,0,High School,Part-time,69441,Same Religion,No,6.088146,3,...,No,No,No,5.887066,1,Arranged,Yes,No,5.532866,1
3,35,3,2,Bachelor,Full-time,69513,Not Religious,Yes,6.212046,3,...,No,No,No,5.263555,5,Love,Yes,No,3.491264,0
4,26,2,2,No Formal Education,Full-time,63986,Different Religion,Yes,4.826262,1,...,No,No,Yes,5.771259,4,Love,Yes,No,10.0,1


In [None]:
# Make a copy of the original dataset
df1 = df.copy()
df1.head()

Unnamed: 0,age_at_marriage,marriage_duration_years,num_children,education_level,employment_status,combined_income,religious_compatibility,cultural_background_match,communication_score,conflict_frequency,...,mental_health_issues,infidelity_occurred,counseling_attended,social_support,shared_hobbies_count,marriage_type,pre_marital_cohabitation,domestic_violence_history,trust_score,divorced
0,30,1,1,Bachelor,Full-time,64001,Different Religion,Yes,5.536016,3,...,No,No,No,8.428183,5,Love,Yes,No,6.262411,1
1,27,2,2,Master,Full-time,86221,Same Religion,Yes,5.810172,3,...,No,Yes,No,5.297221,1,Love,Yes,No,6.769384,1
2,31,6,0,High School,Part-time,69441,Same Religion,No,6.088146,3,...,No,No,No,5.887066,1,Arranged,Yes,No,5.532866,1
3,35,3,2,Bachelor,Full-time,69513,Not Religious,Yes,6.212046,3,...,No,No,No,5.263555,5,Love,Yes,No,3.491264,0
4,26,2,2,No Formal Education,Full-time,63986,Different Religion,Yes,4.826262,1,...,No,No,Yes,5.771259,4,Love,Yes,No,10.0,1


In [None]:
#Show the list of columns in the dataset
df1.columns

Index(['age_at_marriage', 'marriage_duration_years', 'num_children',
       'education_level', 'employment_status', 'combined_income',
       'religious_compatibility', 'cultural_background_match',
       'communication_score', 'conflict_frequency',
       'conflict_resolution_style', 'financial_stress_level',
       'mental_health_issues', 'infidelity_occurred', 'counseling_attended',
       'social_support', 'shared_hobbies_count', 'marriage_type',
       'pre_marital_cohabitation', 'domestic_violence_history', 'trust_score',
       'divorced'],
      dtype='object')

In [None]:
#Extract the dataset to be used for the project
df3 = df1.copy()
df3 = df1[['communication_score', 'financial_stress_level', 'mental_health_issues', 'infidelity_occurred', 'social_support', 'domestic_violence_history', 'trust_score', 'divorced']]
df3.head()

Unnamed: 0,communication_score,financial_stress_level,mental_health_issues,infidelity_occurred,social_support,domestic_violence_history,trust_score,divorced
0,5.536016,6.026355,No,No,8.428183,No,6.262411,1
1,5.810172,1.0,No,Yes,5.297221,No,6.769384,1
2,6.088146,3.199275,No,No,5.887066,No,5.532866,1
3,6.212046,4.893633,No,No,5.263555,No,3.491264,0
4,4.826262,9.431154,No,No,5.771259,No,10.0,1


In [None]:
# Produce summary statistics of the dataset
df3.describe()

Unnamed: 0,communication_score,financial_stress_level,social_support,trust_score,divorced
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,6.040507,5.07145,5.966853,6.030525,0.3982
std,1.963349,2.326322,1.965504,1.932922,0.489576
min,1.0,1.0,1.0,1.0,0.0
25%,4.661929,3.365562,4.597007,4.702663,0.0
50%,6.058246,5.062464,6.010716,6.034343,0.0
75%,7.426862,6.717954,7.328916,7.37283,1.0
max,10.0,10.0,10.0,10.0,1.0


In [None]:
# Generating frequency tables for binary variables
binary_columns = ['infidelity_occurred', 'domestic_violence_history', 'divorced', "mental_health_issues"]
for col in binary_columns:
    print(f"Frequency table for {col}:")
    print(df3[col].value_counts())
    print("\n")

Frequency table for infidelity_occurred:
infidelity_occurred
No     4255
Yes     745
Name: count, dtype: int64


Frequency table for domestic_violence_history:
domestic_violence_history
No     4748
Yes     252
Name: count, dtype: int64


Frequency table for divorced:
divorced
0    3009
1    1991
Name: count, dtype: int64


Frequency table for mental_health_issues:
mental_health_issues
No     3981
Yes    1019
Name: count, dtype: int64




In [None]:
# Check for missing values
df3.isnull().sum()

communication_score          0
financial_stress_level       0
mental_health_issues         0
infidelity_occurred          0
social_support               0
domestic_violence_history    0
trust_score                  0
divorced                     0
dtype: int64

**Pre-processing and preparation of the dataset for machine learning**

In [None]:
# Make train-test split
# Define feature matrix X and target vector y
X = df3[['communication_score', 'financial_stress_level', 'mental_health_issues', 'infidelity_occurred', 'social_support', 'domestic_violence_history', 'trust_score']]
y = df3['divorced']

# 2. Perform 80/20 train-test split 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

print(f"--- Dataset Split Results ---")
print(f"Total samples: {len(df3)}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train distribution:\n{y_train.value_counts(normalize=True)}")
print(f"y_test distribution:\n{y_test.value_counts(normalize=True)}")
print(f"-----------------------------")

--- Dataset Split Results ---
Total samples: 5000
X_train shape: (4000, 7)
X_test shape: (1000, 7)
y_train distribution:
divorced
0    0.604
1    0.396
Name: proportion, dtype: float64
y_test distribution:
divorced
0    0.593
1    0.407
Name: proportion, dtype: float64
-----------------------------


In [None]:
# 2. Create preprocessing pipelines for numerical and categorical features
numerical_features = ["communication_score", "financial_stress_level", "social_support", "trust_score"]
categorical_features = ["mental_health_issues", "infidelity_occurred", "domestic_violence_history"]
numerical_transformer = StandardScaler() # Scale continuous features
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # Convert categorical features to numerical

# 3. Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

**Training and comparison of supervised models**

In [None]:
# Define divorce_model1 (Logistic Regression)
divorce_model1 = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LogisticRegression(random_state=1))])
# Train the model
divorce_model1.fit(X_train, y_train) 
# Predict on the test set
y_pred = divorce_model1.predict(X_test) 
# Evaluation of the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\n--- Training Model (Logistic Regression Classifier) ---")

print("Making predictions for the following 5 couples:")
print(X.head())
print("The predictions are")
print(divorce_model1.predict(X.head()))

print("\n--- Model Evaluation Results---")
print(f"Accuracy on Test Set: {accuracy:.4f}")

print("\n--- Confusion Matrix ---")
print("    Predicted 0 | Predicted 1")
print(f"Actual 0: {conf_matrix[0][0]:>10} | {conf_matrix[0][1]:>10}")
print(f"Actual 1: {conf_matrix[1][0]:>10} | {conf_matrix[1][1]:>10}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))




--- Training Model (Logistic Regression Classifier) ---
Making predictions for the following 5 couples:
   communication_score  financial_stress_level mental_health_issues  \
0             5.536016                6.026355                   No   
1             5.810172                1.000000                   No   
2             6.088146                3.199275                   No   
3             6.212046                4.893633                   No   
4             4.826262                9.431154                   No   

  infidelity_occurred  social_support domestic_violence_history  trust_score  
0                  No        8.428183                        No     6.262411  
1                 Yes        5.297221                        No     6.769384  
2                  No        5.887066                        No     5.532866  
3                  No        5.263555                        No     3.491264  
4                  No        5.771259                        No    10.000

In [None]:
# Define divorce_model1 (Random Forest Classifier)
divorce_model1 = Pipeline(steps=[('preprocessor', preprocessor),('classifier', RandomForestClassifier(random_state=1))])
# Train the model
divorce_model1.fit(X_train, y_train) 
# Predict on the test set
y_pred = divorce_model1.predict(X_test) 
# Evaluation of the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\n--- Training Model (Random Forest Classifier) ---")

print("Making predictions for the following 5 couples:")
print(X.head())
print("The predictions are")
print(divorce_model1.predict(X.head()))

print("\n--- Model Evaluation Results---")
print(f"Accuracy on Test Set: {accuracy:.4f}")

print("\n--- Confusion Matrix ---")
print("    Predicted 0 | Predicted 1")
print(f"Actual 0: {conf_matrix[0][0]:>10} | {conf_matrix[0][1]:>10}")
print(f"Actual 1: {conf_matrix[1][0]:>10} | {conf_matrix[1][1]:>10}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



--- Training Model (Random Forest Classifier) ---
Making predictions for the following 5 couples:
   communication_score  financial_stress_level mental_health_issues  \
0             5.536016                6.026355                   No   
1             5.810172                1.000000                   No   
2             6.088146                3.199275                   No   
3             6.212046                4.893633                   No   
4             4.826262                9.431154                   No   

  infidelity_occurred  social_support domestic_violence_history  trust_score  
0                  No        8.428183                        No     6.262411  
1                 Yes        5.297221                        No     6.769384  
2                  No        5.887066                        No     5.532866  
3                  No        5.263555                        No     3.491264  
4                  No        5.771259                        No    10.000000  


In [None]:
# Define divorce_model1 (Xgboost Classifier)
divorce_model1 = Pipeline(steps=[('preprocessor', preprocessor),('classifier', XGBClassifier(random_state=1))])
# Train the model
divorce_model1.fit(X_train, y_train) 
# Predict on the test set
y_pred = divorce_model1.predict(X_test) 
# Evaluation of the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\n--- Training Model (XGBoost Classifier) ---")

print("Making predictions for the following 5 couples:")
print(X.head())
print("The predictions are")
print(divorce_model1.predict(X.head()))

print("\n--- Model Evaluation Results---")
print(f"Accuracy on Test Set: {accuracy:.4f}")

print("\n--- Confusion Matrix ---")
print("    Predicted 0 | Predicted 1")
print(f"Actual 0: {conf_matrix[0][0]:>10} | {conf_matrix[0][1]:>10}")
print(f"Actual 1: {conf_matrix[1][0]:>10} | {conf_matrix[1][1]:>10}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

AttributeError: 'super' object has no attribute '__sklearn_tags__'