<a href="https://colab.research.google.com/github/Sujnan06/Complete-Python-3-Bootcamp/blob/master/Simulation%20for%20train%20test%20split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

def generate_dataset(n_samples, beta, beta_0=0):
    """
    Generate dataset (x, y) using a logit model.

    Parameters:
        n_samples (int): Number of samples to generate.
        beta (list or np.array): Coefficients [beta1, beta2, beta3, beta4, beta5].
        beta_0 (float): Intercept term beta_0.

    Returns:
        pd.DataFrame: Dataset containing predictor variables x1, x2, x3, x4, x5 and binary outcome y.
    """
    # Ensure beta is a numpy array for easier manipulation
    beta = np.array(beta)

    # Generate predictors independently according to specified distributions
    x1 = np.random.normal(loc=0, scale=1, size=n_samples)        # Standard normal distribution
    x2 = np.random.uniform(low=0, high=1, size=n_samples)        # Uniform distribution between 0 and 1
    x3 = np.random.exponential(scale=1, size=n_samples)          # Exponential distribution with rate = 1
    x4 = np.random.binomial(n=1, p=0.5, size=n_samples)          # Bernoulli distribution with p = 0.5
    x5 = np.random.poisson(lam=3, size=n_samples)                # Poisson distribution with lambda = 3

    # Combine predictors into a matrix X
    X = np.column_stack((x1, x2, x3, x4, x5))

    # Compute logit(p) = beta_0 + beta1*x1 + beta2*x2 + beta3*x3 + beta4*x4 + beta5*x5
    logit_p = beta_0 + np.dot(X, beta)

    # Convert logit to probability using the sigmoid function: p = 1 / (1 + exp(-logit_p))
    p = 1 / (1 + np.exp(-logit_p))

    # Generate binary outcome y from Bernoulli distribution with success probability p
    y = np.random.binomial(n=1, p=p)

    # Create a DataFrame for better readability
    dataset = pd.DataFrame({
        'x1': x1,
        'x2': x2,
        'x3': x3,
        'x4': x4,
        'x5': x5,
        'y': y
    })

    return dataset

# Parameters
n_samples = 1000
beta = [5, 4, 3, 2, 1]  # Coefficients for predictors
beta_0 = 0              # Intercept term

# Generate the dataset
dataset = generate_dataset(n_samples, beta, beta_0=beta_0)

# Display the first few rows of the dataset
print(dataset.head())


         x1        x2        x3  x4  x5  y
0  0.960627  0.188248  0.675273   1   2  1
1  1.715057  0.280350  0.407794   1   4  1
2  1.241315  0.940921  0.706215   0   4  1
3 -1.147444  0.589229  0.909610   0   3  1
4  0.460867  0.777761  0.126164   0   4  1


In [13]:
import numpy as np
import pandas as pd

def generate_dataset(n_samples, beta, beta_0=0, imbalance_ratio=(0.65, 0.35)):
    """
    Generate dataset (x, y) using a logit model with a specified imbalance ratio.

    Parameters:
        n_samples (int): Number of samples to generate.
        beta (list or np.array): Coefficients [beta1, beta2, beta3, beta4, beta5].
        beta_0 (float): Intercept term beta_0.
        imbalance_ratio (tuple): Proportion of majority to minority class (e.g., (0.65, 0.35)).

    Returns:
        pd.DataFrame: Dataset containing predictor variables x1, x2, x3, x4, x5 and binary outcome y.
    """
    # Ensure beta is a numpy array for easier manipulation
    beta = np.array(beta)

    # Generate predictors independently according to specified distributions
    x1 = np.random.normal(loc=0, scale=1, size=n_samples)        # Standard normal distribution
    x2 = np.random.uniform(low=0, high=1, size=n_samples)        # Uniform distribution between 0 and 1
    x3 = np.random.exponential(scale=1, size=n_samples)          # Exponential distribution with rate = 1
    x4 = np.random.binomial(n=1, p=0.5, size=n_samples)          # Bernoulli distribution with p = 0.5
    x5 = np.random.poisson(lam=3, size=n_samples)                # Poisson distribution with lambda = 3

    # Combine predictors into a matrix X
    X = np.column_stack((x1, x2, x3, x4, x5))

    # Compute logit(p) = beta_0 + beta1*x1 + beta2*x2 + beta3*x3 + beta4*x4 + beta5*x5
    logit_p = beta_0 + np.dot(X, beta)

    # Convert logit to probability using the sigmoid function: p = 1 / (1 + exp(-logit_p))
    p = 1 / (1 + np.exp(-logit_p))

    # Generate binary outcome y from Bernoulli distribution with success probability p
    y = np.random.binomial(n=1, p=p)

    # Adjust for imbalance ratio
    majority_class = 0 if imbalance_ratio[0] > imbalance_ratio[1] else 1
    minority_class = 1 - majority_class

    majority_count = int(n_samples * imbalance_ratio[0])
    minority_count = n_samples - majority_count

    # Select samples for the majority and minority classes
    majority_indices = np.where(y == majority_class)[0][:majority_count]
    minority_indices = np.where(y == minority_class)[0][:minority_count]

    selected_indices = np.concatenate([majority_indices, minority_indices])
    np.random.shuffle(selected_indices)

    y = y[selected_indices]
    X = X[selected_indices]

    # Create a DataFrame for better readability
    dataset = pd.DataFrame({
        'x1': X[:, 0],
        'x2': X[:, 1],
        'x3': X[:, 2],
        'x4': X[:, 3],
        'x5': X[:, 4],
        'y': y
    })

    return dataset

# Parameters
n_samples = 1000
beta = [5, 4, 3, 2, 1]  # Coefficients for predictors
beta_0 = 0              # Intercept term
imbalance_ratio = (0.65, 0.35)  # 65:35 imbalance ratio

# Generate the dataset
dataset = generate_dataset(n_samples, beta, beta_0=beta_0, imbalance_ratio=imbalance_ratio)

# Display the first few rows of the dataset
print(dataset.head())


         x1        x2        x3   x4   x5  y
0  2.054284  0.831488  1.846417  1.0  2.0  1
1  0.699180  0.454819  1.039166  1.0  4.0  1
2  0.153038  0.281539  0.140066  0.0  6.0  1
3 -1.286509  0.592376  0.073071  1.0  2.0  0
4 -0.892417  0.632338  0.927070  1.0  7.0  1


In [14]:
dataset['y'].value_counts(normalize=True) * 100

# Calculate and print the relative frequency distribution (percentage)
print(dataset['y'].value_counts(normalize=True) * 100)
# Count the number of values for each class in the 'y' column
class_counts = dataset['y'].value_counts()

# Print the results
class_counts

y
1    80.275229
0    19.724771
Name: proportion, dtype: float64


Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
1,350
0,86


In [16]:
# prompt: i want to do stratified train test split

from sklearn.model_selection import train_test_split

# Assuming 'dataset' is your DataFrame with 'y' as the target variable
X = dataset.drop('y', axis=1)
y = dataset['y']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Print the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Check class distribution in training and testing sets
print("\nClass distribution in y_train:")
print(y_train.value_counts(normalize=True) * 100)

print("\nClass distribution in y_test:")
print(y_test.value_counts(normalize=True) * 100)

X_train shape: (348, 5)
X_test shape: (88, 5)
y_train shape: (348,)
y_test shape: (88,)

Class distribution in y_train:
y
1    80.172414
0    19.827586
Name: proportion, dtype: float64

Class distribution in y_test:
y
1    80.681818
0    19.318182
Name: proportion, dtype: float64


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Step 1: Fit logistic regression model on training data
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 2: Make predictions (probabilities) on test data
pred_probs = model.predict_proba(X_test)[:, 1]

# Step 3: Convert probabilities to class labels (using 0.5 threshold)
pred_labels = (pred_probs >= 0.5).astype(int)

# Step 4: Create a confusion matrix
conf_matrix = confusion_matrix(y_test, pred_labels)
print("Confusion Matrix:\n", conf_matrix)

# Extract values from the confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Calculate accuracy, sensitivity, and specificity
accuracy = accuracy_score(y_test, pred_labels)
sensitivity = TP / (TP + FN)  # Recall
specificity = TN / (TN + FP)
specificity = TN / (TN + FP)
gmean_val = gmean([sensitivity, specificity])

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"G-mean: {gmean_val:.4f}")

Confusion Matrix:
 [[14  3]
 [ 4 67]]
Accuracy: 92.05%
Recall (Sensitivity): 0.9437
Specificity: 82.35%
Precision: 0.9571
F1-Score: 0.9504
ROC-AUC: 0.9693
G-mean: 0.8816


In [23]:
# prompt: use the above code but for SVC

from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
from scipy.stats import gmean

# Step 1: Fit SVC model on training data
model = SVC(probability=True) #Enable probability estimates for ROC-AUC
model.fit(X_train, y_train)

# Step 2: Make predictions (probabilities) on test data
pred_probs = model.predict_proba(X_test)[:, 1]

# Step 3: Convert probabilities to class labels (using 0.5 threshold)
pred_labels = (pred_probs >= 0.5).astype(int)

# Step 4: Evaluate the model
conf_matrix = confusion_matrix(y_test, pred_labels)
print("Confusion Matrix:\n", conf_matrix)

accuracy = accuracy_score(y_test, pred_labels)
precision = precision_score(y_test, pred_labels)
recall = recall_score(y_test, pred_labels) #Sensitivity
f1 = f1_score(y_test, pred_labels)
roc_auc = roc_auc_score(y_test, pred_probs)

TN, FP, FN, TP = conf_matrix.ravel()
specificity = TN / (TN + FP)
gmean_val = gmean([recall, specificity])


print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"G-mean: {gmean_val:.4f}")

Confusion Matrix:
 [[14  3]
 [ 5 66]]
Accuracy: 90.91%
Recall (Sensitivity): 0.9296
Specificity: 82.35%
Precision: 0.9565
F1-Score: 0.9429
ROC-AUC: 0.9619
G-mean: 0.8749


In [25]:
# prompt: same code but for KNN

from sklearn.neighbors import KNeighborsClassifier

# ... (Your existing code for data generation and splitting) ...

# Step 1: Fit KNN model on training data
model = KNeighborsClassifier(n_neighbors=5)  # You can adjust n_neighbors
model.fit(X_train, y_train)

# Step 2: Make predictions on test data
pred_labels = model.predict(X_test)

# Step 3: Evaluate the model
conf_matrix = confusion_matrix(y_test, pred_labels)
print("Confusion Matrix:\n", conf_matrix)

accuracy = accuracy_score(y_test, pred_labels)
precision = precision_score(y_test, pred_labels)
recall = recall_score(y_test, pred_labels)
f1 = f1_score(y_test, pred_labels)

TN, FP, FN, TP = conf_matrix.ravel()
specificity = TN / (TN + FP)
gmean_val = gmean([recall, specificity])

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"G-mean: {gmean_val:.4f}")

# You can also get predicted probabilities if your KNN model supports it
# (some implementations might not provide probabilities directly)
try:
    pred_probs = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, pred_probs)
    print(f"ROC-AUC: {roc_auc:.4f}")
except AttributeError:
    print("ROC-AUC not available for this KNN implementation.")

Confusion Matrix:
 [[13  4]
 [ 4 67]]
Accuracy: 90.91%
Recall (Sensitivity): 0.9437
Specificity: 76.47%
Precision: 0.9437
F1-Score: 0.9437
G-mean: 0.8495
ROC-AUC: 0.9408


In [26]:
# prompt: same code for decision tree

from sklearn.tree import DecisionTreeClassifier

# ... (Your existing code for data generation and splitting) ...

# Step 1: Fit Decision Tree model on training data
model = DecisionTreeClassifier(random_state=42)  # You can adjust hyperparameters
model.fit(X_train, y_train)

# Step 2: Make predictions on test data
pred_labels = model.predict(X_test)

# Step 3: Evaluate the model
conf_matrix = confusion_matrix(y_test, pred_labels)
print("Confusion Matrix:\n", conf_matrix)

accuracy = accuracy_score(y_test, pred_labels)
precision = precision_score(y_test, pred_labels)
recall = recall_score(y_test, pred_labels)
f1 = f1_score(y_test, pred_labels)

TN, FP, FN, TP = conf_matrix.ravel()
specificity = TN / (TN + FP)
gmean_val = gmean([recall, specificity])

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity * 100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"G-mean: {gmean_val:.4f}")

# Get predicted probabilities for ROC-AUC
pred_probs = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, pred_probs)
print(f"ROC-AUC: {roc_auc:.4f}")

Confusion Matrix:
 [[14  3]
 [10 61]]
Accuracy: 85.23%
Recall (Sensitivity): 0.8592
Specificity: 82.35%
Precision: 0.9531
F1-Score: 0.9037
G-mean: 0.8412
ROC-AUC: 0.8413
