In [None]:
import pandas as pd

# Read the txt file
file_path = r"D:\MLNS_Project\facebook_combined.txt\facebook_combined.txt"

# Read the data (assumed to be a space-separated edge list)
df = pd.read_csv(file_path, sep=" ", header=None, names=["source", "target"])

# Add an 'edges' column with value 1
df["edges"] = 1

# Display the first few rows of data
print(df.head())  
print(df.info())

   source  target  edges
0       0       1      1
1       0       2      1
2       0       3      1
3       0       4      1
4       0       5      1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88234 entries, 0 to 88233
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   source  88234 non-null  int64
 1   target  88234 non-null  int64
 2   edges   88234 non-null  int64
dtypes: int64(3)
memory usage: 2.0 MB
None


In [None]:
import pandas as pd
import numpy as np

# Get all unique nodes
nodes = set(df["source"]).union(set(df["target"]))

# Store the set of positive sample edges (undirected edges, ensure (a, b) and (b, a) are not duplicated)
existing_edges = set(tuple(sorted(edge)) for edge in zip(df["source"], df["target"]))

# Generate strictly 1:1 negative samples
negative_samples = set()
while len(negative_samples) < len(df):
    u, v = np.random.choice(list(nodes), 2, replace=False)  # Randomly select two different nodes
    edge = tuple(sorted((u, v)))  # Ensure (u, v) and (v, u) are considered the same edge
    if edge not in existing_edges and edge not in negative_samples:
        negative_samples.add(edge)

# Create a DataFrame for negative samples
neg_df = pd.DataFrame(list(negative_samples), columns=["source", "target"])
neg_df["edges"] = 0  # Set edge value to 0 for negative samples

# Combine positive and negative samples
df_final = pd.concat([df, neg_df], ignore_index=True)

# Display DataFrame info
print(df_final.info())

# Preview the first 10 rows
print(df_final.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176468 entries, 0 to 176467
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   source  176468 non-null  int64
 1   target  176468 non-null  int64
 2   edges   176468 non-null  int64
dtypes: int64(3)
memory usage: 4.0 MB
None
   source  target  edges
0       0       1      1
1       0       2      1
2       0       3      1
3       0       4      1
4       0       5      1
5       0       6      1
6       0       7      1
7       0       8      1
8       0       9      1
9       0      10      1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Separate positive and negative samples
df_positive = df_final[df_final["edges"] == 1]  # Positive samples
df_negative = df_final[df_final["edges"] == 0]  # Negative samples

# Split the dataset in a 9:1 ratio (maintain 1:1 negative sampling)
train_pos, test_pos = train_test_split(df_positive, test_size=0.1, random_state=42)
train_neg, test_neg = train_test_split(df_negative, test_size=0.1, random_state=42)

# Combine training and test sets
df_train = pd.concat([train_pos, train_neg], ignore_index=True)
df_test = pd.concat([test_pos, test_neg], ignore_index=True)

# Print dataset split information
print(f"Training set samples: {len(df_train)} (Positive samples: {len(train_pos)}, Negative samples: {len(train_neg)})")
print(f"Test set samples: {len(df_test)} (Positive samples: {len(test_pos)}, Negative samples: {len(test_neg)})")

# Preview data
print("\nFirst 10 rows of the training set:")
print(df_train.head(10))

print("\nFirst 10 rows of the test set:")
print(df_test.head(10))


Training set samples: 158820 (Positive samples: 79410, Negative samples: 79410)
Test set samples: 17648 (Positive samples: 8824, Negative samples: 8824)

First 10 rows of the training set:
   source  target  edges
0    1945    2071      1
1     224     322      1
2    1833    1902      1
3    3226    3390      1
4    2835    2956      1
5    1055    1585      1
6    1600    1621      1
7    2873    3022      1
8    2409    2552      1
9     128     199      1

First 10 rows of the test set:
   source  target  edges
0    1188    1366      1
1    1496    1630      1
2    1958    2645      1
3    3204    3280      1
4     946    1181      1
5    1975    2403      1
6    3713    3867      1
7    2796    3434      1
8    2332    2399      1
9    1284    1587      1


In [5]:
# Check class distribution in the training set
train_edges_counts = df_train["edges"].value_counts()
print("\nTraining set class distribution:")
print(f"Positive samples (edges=1): {train_edges_counts.get(1, 0)}")
print(f"Negative samples (edges=0): {train_edges_counts.get(0, 0)}")

# Check class distribution in the test set
test_edges_counts = df_test["edges"].value_counts()
print("\nTest set class distribution:")
print(f"Positive samples (edges=1): {test_edges_counts.get(1, 0)}")
print(f"Negative samples (edges=0): {test_edges_counts.get(0, 0)}")



Training set class distribution:
Positive samples (edges=1): 79410
Negative samples (edges=0): 79410

Test set class distribution:
Positive samples (edges=1): 8824
Negative samples (edges=0): 8824


In [None]:
# Get all unique nodes in the training and test sets
train_nodes = set(df_train["source"]).union(set(df_train["target"]))
test_nodes = set(df_test["source"]).union(set(df_test["target"]))

# Compute nodes that appear in the test set but not in the training set
new_test_nodes = test_nodes - train_nodes

# Output the number of nodes in the test set but not in the training set
print(f"\nNodes in test set but not in train set: {len(new_test_nodes)}")

# If there are such nodes, list the first 10 examples
if len(new_test_nodes) > 0:
    print("\nExample of nodes in test set but not in train set:")
    print(list(new_test_nodes)[:10])


Nodes in test set but not in train set: 0


In [None]:
import networkx as nx

# Ensure all IDs are of the same type
df_train["source"] = df_train["source"].astype(int)
df_train["target"] = df_train["target"].astype(int)
df_test["source"] = df_test["source"].astype(int)
df_test["target"] = df_test["target"].astype(int)

# Build the NetworkX graph and ensure all nodes are added
G = nx.Graph()
edges = df_train[df_train["edges"] == 1][["source", "target"]].values.tolist()
G.add_edges_from(edges)

# Add all nodes from the training set (to avoid isolated nodes not being included)
all_train_nodes = set(df_train["source"]).union(set(df_train["target"]))
G.add_nodes_from(all_train_nodes)  # Ensure all nodes appearing in the training set are added

# Compute the degree of all nodes
degree_dict = dict(G.degree())

# Check if there are still nodes from the test set not in G
missing_nodes_in_G = [node for node in df_test["source"].unique() if node not in G] + \
                     [node for node in df_test["target"].unique() if node not in G]

print(f"\nNodes in test set but not in G: {len(missing_nodes_in_G)}")

# If there are still missing nodes, add them manually
if len(missing_nodes_in_G) > 0:
    G.add_nodes_from(missing_nodes_in_G)
    print("Missing nodes have been added to the graph.")





Nodes in test set but not in G: 0


In [None]:
import networkx as nx
import numpy as np
import pandas as pd

# Compute the number of common friends (Common Neighbors)
def common_neighbors(u, v):
    if G.has_node(u) and G.has_node(v):
        return len(list(nx.common_neighbors(G, u, v)))
    return 0

# Compute the density among common neighbors
def common_neighbors_density(u, v):
    common = set(G.neighbors(u)) & set(G.neighbors(v))
    if not common:
        return 0  # No common neighbors
    # Compute the connection density among common neighbors
    total_possible_edges = len(common) * (len(common) - 1) / 2
    actual_edges = sum(1 for node in common for neighbor in G.neighbors(node) if neighbor in common) / 2
    return actual_edges / total_possible_edges if total_possible_edges > 0 else 0

# Compute the density among the union of neighbors
def union_neighbors_density(u, v):
    union = set(G.neighbors(u)) | set(G.neighbors(v))
    if not union:
        return 0  # No neighbors
    total_possible_edges = len(union) * (len(union) - 1) / 2
    actual_edges = sum(1 for node in union for neighbor in G.neighbors(node) if neighbor in union) / 2
    return actual_edges / total_possible_edges if total_possible_edges > 0 else 0

# Compute a global measure of Jaccard similarity
def jaccard_global_index(u, v):
    deg_u = degree_dict.get(u, 0)
    deg_v = degree_dict.get(v, 0)
    return 1 / (deg_u + deg_v - 1) if (deg_u + deg_v - 1) > 0 else 0

# Define feature engineering function
def add_new_features(df):
    df["common_neighbors"] = df.apply(lambda row: common_neighbors(row["source"], row["target"]), axis=1)
    df["common_neighbors_density"] = df.apply(lambda row: common_neighbors_density(row["source"], row["target"]), axis=1)
    df["union_neighbors_density"] = df.apply(lambda row: union_neighbors_density(row["source"], row["target"]), axis=1)
    df["jaccard_global_index"] = df.apply(lambda row: jaccard_global_index(row["source"], row["target"]), axis=1)
    return df

# Apply to training and test sets
df_train = add_new_features(df_train)
df_test = add_new_features(df_test)

# View new features
print("\nUpdated training set with new features:")
print(df_train.head())

print("\nUpdated test set with new features:")
print(df_test.head())




Updated training set with new features:
   source  target  edges  common_neighbors  common_neighbors_density  \
0    1945    2071      1                79                  0.684843   
1     224     322      1                11                  0.618182   
2    1833    1902      1                73                  0.616058   
3    3226    3390      1                 6                  0.866667   
4    2835    2956      1                28                  0.613757   

   union_neighbors_density  jaccard_global_index  
0                 0.503060              0.004274  
1                 0.271228              0.011628  
2                 0.408445              0.003436  
3                 0.487551              0.010989  
4                 0.431563              0.007692  

Updated test set with new features:
   source  target  edges  common_neighbors  common_neighbors_density  \
0    1188    1366      1                21                  0.642857   
1    1496    1630      1               

# Model training - Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Split training & validation sets
X = df_train.drop(columns=["source", "target", "edges"])  # Features
y = df_train["edges"]  # Target variable

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% training, 20% validation

# Train Random Forest
clf = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees
clf.fit(X_train, y_train)

# Predict on validation set
y_val_pred = clf.predict(X_val)
y_val_pred_proba = clf.predict_proba(X_val)[:, 1]  # Prediction probabilities (for ROC-AUC)

# Compute validation metrics
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred)
recall_val = recall_score(y_val, y_val_pred)
f1_val = f1_score(y_val, y_val_pred)


In [None]:
# Predict on the test set
X_test = df_test.drop(columns=["source", "target", "edges"])
y_test = df_test["edges"]

y_test_pred = clf.predict(X_test)
y_test_pred_proba = clf.predict_proba(X_test)[:, 1]

# Compute test set metrics
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print metrics for validation and test sets
print("\n **Validation Set Metrics:**")
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"Accuracy: {accuracy_val:.4f}")
print(f"Precision: {precision_val:.4f}")
print(f"Recall: {recall_val:.4f}")
print(f"F1 Score: {f1_val:.4f}")

print("\n **Test Set Metrics:**")
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1 Score: {f1_test:.4f}")



 **Validation Set Metrics:**
ROC-AUC Score: 0.9926
Accuracy: 0.9720
Precision: 0.9665
Recall: 0.9775
F1 Score: 0.9720

 **Test Set Metrics:**
ROC-AUC Score: 0.9909
Accuracy: 0.9686
Precision: 0.9639
Recall: 0.9737
F1 Score: 0.9688


# Model training - XGboost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Extract features and labels
X = df_train.drop(columns=["source", "target", "edges"])  # Features
y = df_train["edges"]

X_test = df_test.drop(columns=["source", "target", "edges"])
y_test = df_test["edges"]

# Split validation set from training data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize and train the model
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate on the validation set
y_val_pred = model.predict(X_val)
y_val_proba = model.predict_proba(X_val)[:, 1]

print("\n**Validation Set Metrics:**")
print(f"ROC-AUC: {roc_auc_score(y_val, y_val_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred):.4f}")
print(f"F1 Score: {f1_score(y_val, y_val_pred):.4f}")



**Validation Set Metrics:**
ROC-AUC: 0.9948
Accuracy: 0.9742
Precision: 0.9718
Recall: 0.9769
F1 Score: 0.9743


In [None]:
# Predict and evaluate on the test set
y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, 1]

print("\n**Test Set Metrics:**")
print(f"ROC-AUC: {roc_auc_score(y_test, y_test_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_test_pred):.4f}")



**Test Set Metrics:**
ROC-AUC: 0.9948
Accuracy: 0.9751
Precision: 0.9740
Recall: 0.9762
F1 Score: 0.9751


# Add Circle-aware features

In [None]:
import os

# Set the path to the `facebook` directory
folder_path = r"D:\MLNS_Project\facebook"

# Store all users' circle information
circle_dict = {}

# Iterate over all `*.circles` files in the `facebook` directory
for file in os.listdir(folder_path):
    if file.endswith(".circles"):  # Only process `.circles` files
        ego_id = file.split(".")[0]  # Get the Ego ID from the file name
        file_path = os.path.join(folder_path, file)
        
        # Read the contents of the `circles` file
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split()
                circle_name = f"{ego_id}_{parts[0]}"  # **Ensure the circle name is unique**
                members = set(map(int, parts[1:]))  # The rest are members of the circle
                
                # Record circle information
                for member in members:
                    if member not in circle_dict:
                        circle_dict[member] = set()
                    circle_dict[member].add(circle_name)  # Store the unique circle name

print(f"Successfully loaded {len(circle_dict)} users' circle information!")


Successfully loaded 2884 users' circle information!


In [14]:
from pprint import pprint

pprint(circle_dict)

{0: {'107_circle3'},
 1: {'0_circle15'},
 2: {'0_circle10'},
 3: {'0_circle15'},
 5: {'0_circle16'},
 6: {'0_circle15'},
 7: {'0_circle15'},
 9: {'0_circle15', '0_circle16'},
 10: {'0_circle15'},
 12: {'0_circle21'},
 13: {'0_circle11'},
 14: {'0_circle10'},
 15: {'0_circle15'},
 16: {'0_circle15'},
 17: {'0_circle19', '0_circle6'},
 20: {'0_circle19', '0_circle6'},
 21: {'0_circle15'},
 22: {'0_circle15'},
 23: {'0_circle15', '0_circle5'},
 24: {'0_circle16'},
 25: {'0_circle15'},
 26: {'0_circle15'},
 28: {'0_circle23'},
 29: {'0_circle0'},
 31: {'0_circle15'},
 32: {'0_circle6'},
 33: {'0_circle19'},
 34: {'348_circle11', '348_circle4', '0_circle16'},
 35: {'0_circle6'},
 36: {'0_circle15', '0_circle16'},
 37: {'0_circle16'},
 38: {'0_circle15'},
 39: {'0_circle15'},
 40: {'0_circle15'},
 41: {'0_circle19', '0_circle6'},
 42: {'0_circle10'},
 44: {'0_circle19'},
 45: {'0_circle15'},
 46: {'0_circle7'},
 47: {'0_circle15'},
 48: {'0_circle16'},
 49: {'0_circle15'},
 50: {'0_circle15'

In [None]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict

# Calculate the number of shared circles between users u and v
def get_circle_overlap(u, v):
    """Calculate the number of shared social circles between users u and v"""
    if u in circle_dict and v in circle_dict:
        return len(circle_dict[u] & circle_dict[v])  # Compute the number of shared circles
    return 0

# Calculate the number of circles each user belongs to
user_circle_count = {user: len(circles) for user, circles in circle_dict.items()}

# Calculate the size of each circle
circle_size = defaultdict(int)
for user, circles in circle_dict.items():
    for circle in circles:
        circle_size[circle] += 1

# Calculate the Jaccard similarity between users u and v based on their circles
def circle_jaccard_similarity(u, v):
    """Compute the Jaccard similarity between users u and v (based on social circles)"""
    if u in circle_dict and v in circle_dict:
        intersection = len(circle_dict[u] & circle_dict[v])
        union = len(circle_dict[u] | circle_dict[v])
        return intersection / union if union > 0 else 0
    return 0

# Determine whether two users are in the same "core" circle
def has_shared_core_circle(u, v):
    """Check whether u and v are in the same core circle (core circle: size > 10)"""
    if u in circle_dict and v in circle_dict:
        shared_circles = circle_dict[u] & circle_dict[v]
        return int(any(circle_size.get(circle, 0) > 10 for circle in shared_circles))  # Check circle size directly
    return 0

# Calculate the Adamic-Adar Index between two users (based on circles)
def circle_adamic_adar_index(u, v):
    """Compute the Adamic-Adar Index between u and v"""
    if u in circle_dict and v in circle_dict:
        shared_circles = circle_dict[u] & circle_dict[v]
        return sum(1 / np.log(circle_size.get(circle, 2)) for circle in shared_circles if circle_size.get(circle, 2) > 1)
    return 0


In [None]:
# Define a function to add circle-based features
def add_circle_features(df):
    df["circle_overlap"] = df.apply(lambda row: get_circle_overlap(row["source"], row["target"]), axis=1)
    df["circle_jaccard_similarity"] = df.apply(lambda row: circle_jaccard_similarity(row["source"], row["target"]), axis=1)
    df["has_shared_core_circle"] = df.apply(lambda row: has_shared_core_circle(row["source"], row["target"]), axis=1)
    df["circle_adamic_adar_index"] = df.apply(lambda row: circle_adamic_adar_index(row["source"], row["target"]), axis=1)
    df["source_circle_count"] = df["source"].map(user_circle_count).fillna(0)
    df["target_circle_count"] = df["target"].map(user_circle_count).fillna(0)
    return df

# Apply feature engineering to training and test sets
df_train = add_circle_features(df_train)
df_test = add_circle_features(df_test)

# Print information of the feature-enhanced datasets
print("\nUpdated training set:")
print(df_train.head())

print("\nUpdated test set:")
print(df_test.head())



Updated training set:
   source  target  edges  common_neighbors  common_neighbors_density  \
0    1945    2071      1                79                  0.684843   
1     224     322      1                11                  0.618182   
2    1833    1902      1                73                  0.616058   
3    3226    3390      1                 6                  0.866667   
4    2835    2956      1                28                  0.613757   

   union_neighbors_density  jaccard_global_index  circle_overlap  \
0                 0.503060              0.004274               1   
1                 0.271228              0.011628               1   
2                 0.408445              0.003436               1   
3                 0.487551              0.010989               1   
4                 0.431563              0.007692               1   

   circle_jaccard_similarity  has_shared_core_circle  \
0                        0.5                       1   
1                      

# Model training - Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Split training & validation sets
X = df_train.drop(columns=["source", "target", "edges"])  # Features
y = df_train["edges"]  # Target variable

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% training, 20% validation

# Train Random Forest
clf = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees
clf.fit(X_train, y_train)

# Predict on validation set
y_val_pred = clf.predict(X_val)
y_val_pred_proba = clf.predict_proba(X_val)[:, 1]  # Predicted probabilities (for ROC-AUC)

# Compute validation metrics
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred)
recall_val = recall_score(y_val, y_val_pred)
f1_val = f1_score(y_val, y_val_pred)


In [None]:
# Predict on the test set
X_test = df_test.drop(columns=["source", "target", "edges"])
y_test = df_test["edges"]

y_test_pred = clf.predict(X_test)
y_test_pred_proba = clf.predict_proba(X_test)[:, 1]

# Compute test set metrics
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print metrics for validation and test sets
print("\n **Validation Set Metrics:**")
print(f"ROC-AUC Score: {roc_auc_val:.4f}")
print(f"Accuracy: {accuracy_val:.4f}")
print(f"Precision: {precision_val:.4f}")
print(f"Recall: {recall_val:.4f}")
print(f"F1 Score: {f1_val:.4f}")

print("\n **Test Set Metrics:**")
print(f"ROC-AUC Score: {roc_auc_test:.4f}")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1 Score: {f1_test:.4f}")



 **Validation Set Metrics:**
ROC-AUC Score: 0.9941
Accuracy: 0.9748
Precision: 0.9689
Recall: 0.9805
F1 Score: 0.9747

 **Test Set Metrics:**
ROC-AUC Score: 0.9929
Accuracy: 0.9717
Precision: 0.9659
Recall: 0.9779
F1 Score: 0.9718


# Model training - XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

#  Extract features and labels
X = df_train.drop(columns=["source", "target", "edges"])  # Features
y = df_train["edges"]

X_test = df_test.drop(columns=["source", "target", "edges"])
y_test = df_test["edges"]

#  Split validation set from training data (e.g., 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

#  Initialize and train the model
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

#  Predict and evaluate on the validation set
y_val_pred = model.predict(X_val)
y_val_proba = model.predict_proba(X_val)[:, 1]

print("\n**Validation Set Metrics:**")
print(f"ROC-AUC: {roc_auc_score(y_val, y_val_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.4f}")
print(f"Recall: {recall_score(y_val, y_val_pred):.4f}")
print(f"F1 Score: {f1_score(y_val, y_val_pred):.4f}")




**Validation Set Metrics:**
ROC-AUC: 0.9955
Accuracy: 0.9751
Precision: 0.9710
Recall: 0.9795
F1 Score: 0.9752


In [None]:
# Predict and evaluate on the test set
y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, 1]

print("\n**Test Set Metrics:**")
print(f"ROC-AUC: {roc_auc_score(y_test, y_test_proba):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_test_pred):.4f}")




**Test Set Metrics:**
ROC-AUC: 0.9957
Accuracy: 0.9757
Precision: 0.9731
Recall: 0.9785
F1 Score: 0.9758
