In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load the dataset
file_path = 'OKCupid_profiles.csv' 
okcupid_data = pd.read_csv(file_path)

# Select relevant columns
relevant_columns = ['diet', 'drinks', 'drugs', 'smokes', 'status']
okcupid_data_subset = okcupid_data[relevant_columns].copy()  

# Map `status` to binary categories: `single` -> 0, `in a relationship` -> 1
status_mapping = {
    'single': 0,
    'available': 0,  # Assuming "available" also means single
    'seeing someone': 1,
    'married': 1,
    'unknown': None  # Treating "unknown" as missing
}
okcupid_data_subset['status'] = okcupid_data_subset['status'].map(status_mapping)

# Fill missing values in lifestyle habit columns with "unknown"
okcupid_data_subset.fillna({'diet': 'unknown', 'drinks': 'unknown', 
                            'drugs': 'unknown', 'smokes': 'unknown'}, inplace=True)

# Encode `drinks`
drinks_mapping = {
    'not at all': 0,
    'rarely': 1,
    'socially': 2,
    'often': 3,
    'very often': 4,
    'desperately': 5,
    'unknown': -1
}
okcupid_data_subset.loc[:, 'drinks'] = okcupid_data_subset['drinks'].map(drinks_mapping)

# Encode `drugs`
drugs_mapping = {
    'never': 0,
    'sometimes': 1,
    'often': 2,
    'unknown': -1
}
okcupid_data_subset.loc[:, 'drugs'] = okcupid_data_subset['drugs'].map(drugs_mapping)

# Encode `smokes`
smokes_mapping = {
    'no': 0,
    'trying to quit': 1,
    'when drinking': 2,
    'sometimes': 3,
    'yes': 4,
    'unknown': -1
}
okcupid_data_subset.loc[:, 'smokes'] = okcupid_data_subset['smokes'].map(smokes_mapping)

# Simplify and encode `diet`
diet_categories = {
    'vegetarian': 1, 'vegan': 2, 'kosher': 3, 'halal': 4, 'anything': 5,
    'other': 6, 'unknown': -1
}
okcupid_data_subset.loc[:, 'diet'] = okcupid_data_subset['diet'].apply(
    lambda x: diet_categories.get(next((key for key in diet_categories if key in str(x)), 'unknown'))
)

# Split data into features and target
X = okcupid_data_subset[['diet', 'drinks', 'drugs', 'smokes']]
y = okcupid_data_subset['status']

# Handle missing target values
X = X[y.notnull()]
y = y[y.notnull()]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

okcupid_data_subset.head()


Unnamed: 0,diet,drinks,drugs,smokes,status
0,5,2,0,3,0.0
1,6,3,1,0,0.0
2,5,2,-1,0,0.0
3,1,2,-1,0,0.0
4,-1,2,0,0,0.0


In [11]:
# Works only a little bit --------------------------------------------------------------------------------------




# Train Random Forest model with class weights to handle imbalance
rf_model_balanced = RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced')
rf_model_balanced.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred_balanced = rf_model_balanced.predict(X_test)
y_proba_balanced = rf_model_balanced.predict_proba(X_test)[:, 1]

# Generate classification report and ROC-AUC score
classification_report_result_balanced = classification_report(y_test, y_pred_balanced)
roc_auc_balanced = roc_auc_score(y_test, y_proba_balanced)

print("Classification Report:")
print(classification_report_result_balanced)
print("\nROC-AUC Score:", roc_auc_balanced)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.61      0.75     11513
         1.0       0.06      0.55      0.10       475

    accuracy                           0.61     11988
   macro avg       0.51      0.58      0.42     11988
weighted avg       0.93      0.61      0.72     11988


ROC-AUC Score: 0.6124556679634463


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the pipeline for modeling
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler()),                # Standardize the data
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))  # Logistic regression
])

# Perform 5-fold cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

# Calculate mean and standard deviation of cross-validation scores
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)

# Print results
print(f"Cross-validation mean accuracy: {cv_mean:.4f}")
print(f"Cross-validation accuracy standard deviation: {cv_std:.4f}")

Cross-validation mean accuracy: 0.9604
Cross-validation accuracy standard deviation: 0.0000


In [23]:
# Investigate class distribution in the target variable
class_distribution = y.value_counts(normalize=True) * 100  # Percentage distribution

# Display class distribution
class_distribution

status
0.0    96.039108
1.0     3.960892
Name: proportion, dtype: float64

In [25]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Apply SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit the pipeline on the resampled training data
pipeline.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC

# Evaluate the model using precision, recall, F1-score, and ROC-AUC
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print the metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

Precision: 0.0529
Recall: 0.6232
F1 Score: 0.0975
ROC-AUC: 0.6012


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Define the Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Fit the model on the oversampled training data
random_forest.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred_rf = random_forest.predict(X_test)
y_pred_proba_rf = random_forest.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC

# Evaluate the model using precision, recall, F1-score, and ROC-AUC
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf)

# Print the metrics
print(f"Random Forest Precision: {precision_rf:.4f}")
print(f"Random Forest Recall: {recall_rf:.4f}")
print(f"Random Forest F1 Score: {f1_rf:.4f}")
print(f"Random Forest ROC-AUC: {roc_auc_rf:.4f}")

Random Forest Precision: 0.0541
Random Forest Recall: 0.5516
Random Forest F1 Score: 0.0986
Random Forest ROC-AUC: 0.5928


In [32]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Reapply SMOTE to oversample the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define a parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],    # Minimum number of samples required to be at a leaf node
    'class_weight': ['balanced']      # Handle class imbalance
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='f1',  # Optimize for F1 score to balance precision and recall
    n_jobs=-1,  # Use all available processors
    verbose=2
)

# Fit the grid search to the oversampled training data
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

# Metrics for the best model
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)
roc_auc_best = roc_auc_score(y_test, y_pred_proba_best)

print(f"Precision: {precision_best:.4f}")
print(f"Recall: {recall_best:.4f}")
print(f"F1 Score: {f1_best:.4f}")
print(f"ROC-AUC: {roc_auc_best:.4f}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
Precision: 0.0543
Recall: 0.5537
F1 Score: 0.0989
ROC-AUC: 0.5946


In [36]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Add lifestyle score and interaction terms before resampling
weights = {
    'diet': 1.0,
    'drinks': 1.5,
    'drugs': 1.2,
    'smokes': 1.3
}

X_train['lifestyle_score'] = (
    weights['diet'] * X_train['diet'] +
    weights['drinks'] * X_train['drinks'] +
    weights['drugs'] * X_train['drugs'] +
    weights['smokes'] * X_train['smokes']
)

X_train['diet_drinks'] = X_train['diet'] * X_train['drinks']
X_train['diet_drugs'] = X_train['diet'] * X_train['drugs']
X_train['drinks_drugs'] = X_train['drinks'] * X_train['drugs']
X_train['smokes_drinks'] = X_train['smokes'] * X_train['drinks']

# Add the same features for the test set
X_test['lifestyle_score'] = (
    weights['diet'] * X_test['diet'] +
    weights['drinks'] * X_test['drinks'] +
    weights['drugs'] * X_test['drugs'] +
    weights['smokes'] * X_test['smokes']
)

X_test['diet_drinks'] = X_test['diet'] * X_test['drinks']
X_test['diet_drugs'] = X_test['diet'] * X_test['drugs']
X_test['drinks_drugs'] = X_test['drinks'] * X_test['drugs']
X_test['smokes_drinks'] = X_test['smokes'] * X_test['drinks']

# Select the updated feature set
features = ['diet', 'drinks', 'drugs', 'smokes', 'lifestyle_score',
            'diet_drinks', 'diet_drugs', 'drinks_drugs', 'smokes_drinks']
X_train_extended = X_train[features]

# Apply SMOTE on the extended feature set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_extended, y_train)

# Train the Random Forest model
best_model.fit(X_train_resampled, y_train_resampled)

# Evaluate on the test set
X_test_extended = X_test[features]
y_pred_extended = best_model.predict(X_test_extended)
y_pred_proba_extended = best_model.predict_proba(X_test_extended)[:, 1]

# Calculate metrics
precision_extended = precision_score(y_test, y_pred_extended)
recall_extended = recall_score(y_test, y_pred_extended)
f1_extended = f1_score(y_test, y_pred_extended)
roc_auc_extended = roc_auc_score(y_test, y_pred_proba_extended)

# Print results
print(f"Extended Features Precision: {precision_extended:.4f}")
print(f"Extended Features Recall: {recall_extended:.4f}")
print(f"Extended Features F1 Score: {f1_extended:.4f}")
print(f"Extended Features ROC-AUC: {roc_auc_extended:.4f}")

Extended Features Precision: 0.0543
Extended Features Recall: 0.5537
Extended Features F1 Score: 0.0990
Extended Features ROC-AUC: 0.5942


In [38]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to create a balanced dataset
smote = SMOTE(sampling_strategy='auto', random_state=42)  # 'auto' balances the dataset
X_balanced, y_balanced = smote.fit_resample(X_train_extended, y_train)

# Check the class distribution after balancing
from collections import Counter
print(f"Class distribution after SMOTE: {Counter(y_balanced)}")

# Train the Random Forest model on the balanced dataset
best_model.fit(X_balanced, y_balanced)

# Evaluate on the test set
y_pred_balanced = best_model.predict(X_test_extended)
y_pred_proba_balanced = best_model.predict_proba(X_test_extended)[:, 1]

# Calculate metrics
precision_balanced = precision_score(y_test, y_pred_balanced)
recall_balanced = recall_score(y_test, y_pred_balanced)
f1_balanced = f1_score(y_test, y_pred_balanced)
roc_auc_balanced = roc_auc_score(y_test, y_pred_proba_balanced)

# Print results
print(f"Balanced Features Precision: {precision_balanced:.4f}")
print(f"Balanced Features Recall: {recall_balanced:.4f}")
print(f"Balanced Features F1 Score: {f1_balanced:.4f}")
print(f"Balanced Features ROC-AUC: {roc_auc_balanced:.4f}")

Class distribution after SMOTE: Counter({0.0: 46049, 1.0: 46049})
Balanced Features Precision: 0.0543
Balanced Features Recall: 0.5537
Balanced Features F1 Score: 0.0990
Balanced Features ROC-AUC: 0.5942
