In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

df= pd.read_csv("data_modified.csv")
Y = df.iloc[:, 1] 
X = df.drop(df.columns[[0,1]], axis=1)

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   radius_mean              569 non-null    float64
 1   texture_mean             569 non-null    float64
 2   perimeter_mean           569 non-null    float64
 3   area_mean                569 non-null    float64
 4   smoothness_mean          569 non-null    float64
 5   compactness_mean         569 non-null    float64
 6   concavity_mean           569 non-null    float64
 7   concave points_mean      569 non-null    float64
 8   symmetry_mean            569 non-null    float64
 9   fractal_dimension_mean   569 non-null    float64
 10  radius_se                569 non-null    float64
 11  texture_se               569 non-null    float64
 12  perimeter_se             569 non-null    float64
 13  area_se                  569 non-null    float64
 14  smoothness_se            5

In [5]:
Y

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64

In [None]:
def calculate_metrics(predictions, Y):
    TP = ((predictions == 1) & (Y == 1)).sum()
    FP = ((predictions == 1) & (Y == 0)).sum()
    TN = ((predictions == 0) & (Y == 0)).sum()
    FN = ((predictions == 0) & (Y == 1)).sum()

    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = TP / (TP + FN) if TP + FN != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    return accuracy, precision, recall, f1

  # Forward Selection

In [102]:
def RM1(X, Y):
    variables = X.columns.tolist()
    selected_variables = []
    p_values = np.ones(len(variables))

    for _ in range(5): 
        best_pval = 1 
        best_variable = None

        for i, candidate_variable in enumerate(variables):
            if candidate_variable in selected_variables:
                continue

            X_selected = X[selected_variables + [candidate_variable]]
            X_selected = np.column_stack((np.ones(len(X_selected)), X_selected))

            beta = np.linalg.inv(X_selected.T @ X_selected) @ X_selected.T @ Y

            error = Y - X_selected @ beta

            residual_variance = np.sum(error**2) / (len(Y) - len(beta))

            beta_variance = np.linalg.inv(X_selected.T @ X_selected) * residual_variance

            t_stat = beta[1] / np.sqrt(beta_variance[1, 1])
            p_value = 2 * (1 - stats.t.cdf(np.abs(t_stat), len(Y) - len(beta)))

            if p_value < best_pval:
                best_pval = p_value
                best_variable = candidate_variable

        selected_variables.append(best_variable)
        p_values[i] = best_pval

    X_final = X[selected_variables]
    X_final = np.column_stack((np.ones(len(X_final)), X_final))
    beta_final = np.linalg.inv(X_final.T @ X_final) @ X_final.T @ Y
    predictions = X_final @ beta_final
    predictions = (predictions >= 0.5).astype(int)

    accuracy, precision, recall, f1 = calculate_metrics(predictions, Y)

    return beta_final, selected_variables, accuracy, precision, recall, f1



result_RM1 = RM1(X, Y)
print("Beta coefficients:", result_RM1[0])
print("Selected Variables:", result_RM1[1])
print("Accuracy:", result_RM1[2])
print("Precision:", result_RM1[3])
print("Recall:", result_RM1[4])
print("F1-Score:", result_RM1[5])


Beta coefficients: [-1.66384367  0.06730898  0.0218041   5.67940239 -0.01612008  1.3440788 ]
Selected Variables: ['radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean']
Accuracy: 0.9226713532513181
Precision: 0.9827586206896551
Recall: 0.8066037735849056
F1-Score: 0.8860103626943006


# Backward Elimination

In [95]:

def RM2(X, Y):
    variables = X.columns.tolist()
    p_values = np.ones(len(variables)) 

    X_full = X.copy()
    X_full = np.column_stack((np.ones(len(X_full)), X_full))
    beta = np.linalg.inv(X_full.T @ X_full) @ X_full.T @ Y

    while True:  
        worst_pval = 0 
        worst_variable = None
        p_values_updated = np.zeros(len(variables)) + 1  

        for i, variable in enumerate(variables):
            X_temp = X_full.copy()
            X_temp = np.delete(X_temp, i+1, axis=1)  

            beta_temp = np.linalg.inv(X_temp.T @ X_temp) @ X_temp.T @ Y

            error = Y - X_temp @ beta_temp

            residual_variance = np.sum(error**2) / (len(Y) - len(beta_temp))

            beta_variance = np.linalg.inv(X_temp.T @ X_temp) * residual_variance

            t_stat = beta_temp[i] / np.sqrt(beta_variance[i, i])
            p_value = 2 * (1 - stats.t.cdf(np.abs(t_stat), len(Y) - len(beta_temp)))

            p_values_updated[i] = p_value

            if p_value > 0.05 and p_value > worst_pval:
                worst_pval = p_value
                worst_variable = variable

        if worst_variable is None: 
            break

        if np.all(p_values_updated <= 0.05):  
            break

        X_full = np.delete(X_full, variables.index(worst_variable) + 1, axis=1)
        p_values = np.delete(p_values, variables.index(worst_variable))
        variables.remove(worst_variable)

    beta_final = np.linalg.inv(X_full.T @ X_full) @ X_full.T @ Y
    predictions = X_full @ beta_final
    predictions = (predictions >= 0.5).astype(int)

    accuracy, precision, recall, f1 = calculate_metrics(predictions, Y)

    return beta_final, variables, accuracy, precision, recall, f1



result_RM2 = RM2(X, Y)

print("Beta coefficients:", result_RM2[0])
print("Selected Variables:", result_RM2[1])
print("Accuracy:", result_RM2[2])
print("Precision:", result_RM2[3])
print("Recall:", result_RM2[4])
print("F1-Score:", result_RM2[5])


Beta coefficients: [-1.94125565e+00  3.97765094e-01  2.23771827e-02 -4.17450476e-02
 -9.38140495e-04  1.43733490e+00  7.85638119e+00  1.06892162e+00]
Selected Variables: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'concave points_mean', 'symmetry_mean']
Accuracy: 0.9402460456942003
Precision: 0.973404255319149
Recall: 0.8632075471698113
F1-Score: 0.915


In [4]:
def RM3(X, Y):
    variables = X.columns.tolist()
    errors = np.zeros(len(variables))  

    X_model = np.column_stack((np.ones(len(X)), X))
    beta = np.linalg.inv(X_model.T @ X_model) @ X_model.T @ Y

    predictions = X_model @ beta
    predictions = (predictions >= 0.5).astype(int)
    accuracy, precision, recall, f1 = calculate_metrics(predictions, Y)

    while len(variables) > 5: 
        errors = []
        for i, variable in enumerate(variables):
            X_temp = X.drop(columns=[variable])
            X_temp_model = np.column_stack((np.ones(len(X_temp)), X_temp))
            beta_temp = np.linalg.inv(X_temp_model.T @ X_temp_model) @ X_temp_model.T @ Y

            error = Y - X_temp_model @ beta_temp
            residual_variance = np.sum(error**2) / (len(Y) - len(beta_temp))

            error = residual_variance
            errors.append(error)

        max_error_index = np.argmax(errors)
        removed_variable = variables.pop(max_error_index)

        X_model = np.column_stack((np.ones(len(X)), X.loc[:, variables]))
        beta = np.linalg.inv(X_model.T @ X_model) @ X_model.T @ Y

        predictions = X_model @ beta
        predictions = (predictions >= 0.5).astype(int)
        accuracy, precision, recall, f1 = calculate_metrics(predictions, Y)

    X_final = np.column_stack((np.ones(len(X)), X.loc[:, variables]))
    beta_final = np.linalg.inv(X_final.T @ X_final) @ X_final.T @ Y
    predictions_final = X_final @ beta_final
    predictions_final = (predictions_final >= 0.5).astype(int)

    accuracy_final, precision_final, recall_final, f1_final = calculate_metrics(predictions_final, Y)

    errors_formatted = ["{:.20f}".format(e) for e in errors]

    return beta_final, variables, errors_formatted, accuracy_final, precision_final, recall_final, f1_final


result_RM3 = RM3(X, Y)

print("Beta coefficients:", result_RM3[0])
print("Selected Variables:", result_RM3[1])
print("Errors:", result_RM3[2])
print("Accuracy:", result_RM3[3])
print("Precision:", result_RM3[4])
print("Recall:", result_RM3[5])
print("F1-Score:", result_RM3[6])


Beta coefficients: [  0.8464616   10.91991043   1.98812073 -38.13542014   0.96717527
   1.90303784]
Selected Variables: ['smoothness_mean', 'symmetry_mean', 'fractal_dimension_mean', 'compactness_se', 'compactness_worst']
Errors: ['0.05569145799314622475', '0.05569325503317724274', '0.05569127923740923441', '0.05569475869494606990', '0.05569136819526819548', '0.05569445695099326660']
Accuracy: 0.8840070298769771
Precision: 0.8882978723404256
Recall: 0.7877358490566038
F1-Score: 0.835


In [5]:
def RM4(X, Y):
    variables = X.columns.tolist()
    selected_variables = []  
    errors = np.zeros(len(variables))  
    alpha = 1e-5  


    while len(selected_variables) < 5:  
        errors = []
        for i, variable in enumerate(variables):
            selected_variables_temp = selected_variables + [variable]
            X_temp = X.loc[:, selected_variables_temp]
            X_temp_model = np.column_stack((np.ones(len(X_temp)), X_temp))
            beta_temp = np.linalg.pinv(X_temp_model.T @ X_temp_model + alpha * np.eye(X_temp_model.shape[1])) @ X_temp_model.T @ Y


            error = Y - X_temp_model @ beta_temp


            residual_variance = np.sum(error**2) / (len(Y) - len(beta_temp))

            error = residual_variance
            errors.append(error)

        max_error_index = np.argmax(errors)
        added_variable = variables.pop(max_error_index)  
        selected_variables.append(added_variable)

        X_model = np.column_stack((np.ones(len(X)), X.loc[:, selected_variables]))
        beta = np.linalg.pinv(X_model.T @ X_model + alpha * np.eye(X_model.shape[1])) @ X_model.T @ Y

        predictions = X_model @ beta
        predictions = (predictions >= 0.5).astype(int)
        accuracy, precision, recall, f1 = calculate_metrics(predictions, Y)

    X_final = np.column_stack((np.ones(len(X)), X.loc[:, selected_variables]))
    beta_final = np.linalg.pinv(X_final.T @ X_final + alpha * np.eye(X_final.shape[1])) @ X_final.T @ Y
    predictions_final = X_final @ beta_final
    predictions_final = (predictions_final >= 0.5).astype(int)

    accuracy_final, precision_final, recall_final, f1_final = calculate_metrics(predictions_final, Y)

    errors_formatted = ["{:.20f}".format(e) for e in errors]

    return beta_final, selected_variables, errors_formatted, accuracy_final, precision_final, recall_final, f1_final

# Example of usage
result_RM4 = RM4(X, Y)

# Print the results
print("Beta coefficients:", result_RM4[0])
print("Selected Variables:", result_RM4[1])
print("Errors:", result_RM4[2])
print("Accuracy:", result_RM4[3])
print("Precision:", result_RM4[4])
print("Recall:", result_RM4[5])
print("F1-Score:", result_RM4[6])


Beta coefficients: [ 8.08909090e-01  2.32359589e-01 -3.90852667e-03 -7.15154065e+00
 -1.74392461e+01  3.57179804e+01]
Selected Variables: ['symmetry_se', 'texture_se', 'fractal_dimension_mean', 'smoothness_se', 'fractal_dimension_se']
Errors: ['0.09794496480453095122', '0.18567533200294955575', '0.09726823590372976613', '0.10809661032147580007', '0.18320649473824718312', '0.10854021797970433982', '0.10459733523212616513', '0.08832782175581098405', '0.19622142771977690856', '0.14989124026715500149', '0.15343055373104669270', '0.15961325786696312079', '0.20061536120613551737', '0.21381656618907221579', '0.18313706637885834860', '0.23059722870082310919', '0.08434512152275065733', '0.17021634390816522964', '0.08512548817296522463', '0.10260849039368725999', '0.16292189762730399649', '0.12347694565004965761', '0.11463970235402133613', '0.07775361650766808574', '0.17366469706889411939', '0.16263911254475738044']
Accuracy: 0.616871704745167
Precision: 0.35
Recall: 0.0330188679245283
F1-Score:

In [6]:
def RM5(X, Y):
    correlations = np.abs(X.corrwith(Y))

    ranked_variables = correlations.sort_values(ascending=False)

    selected_variables = ranked_variables.index[:5].tolist()
    X_final = X.loc[:, selected_variables]
    X_final_model = np.column_stack((np.ones(len(X_final)), X_final))
    beta_final = np.linalg.pinv(X_final_model.T @ X_final_model) @ X_final_model.T @ Y
    predictions_final = X_final_model @ beta_final
    predictions_final = (predictions_final >= 0.5).astype(int)

    accuracy_final, precision_final, recall_final, f1_final = calculate_metrics(predictions_final, Y)

    return beta_final, selected_variables, accuracy_final, precision_final, recall_final, f1_final

result_RM5 = RM5(X, Y)

print("Beta coefficients:", result_RM5[0])
print("Selected Variables:", result_RM5[1])
print("Accuracy:", result_RM5[2])
print("Precision:", result_RM5[3])
print("Recall:", result_RM5[4])
print("F1-Score:", result_RM5[5])


Beta coefficients: [-0.53579971  2.96988832 -0.00666432  2.2848403   0.10656923 -0.00611867]
Selected Variables: ['concave points_worst', 'perimeter_worst', 'concave points_mean', 'radius_worst', 'perimeter_mean']
Accuracy: 0.945518453427065
Precision: 0.994535519125683
Recall: 0.8584905660377359
F1-Score: 0.9215189873417722


In [105]:
data = {
    'RM1': result_RM1[1],
    'RM2': result_RM2[1],
    'RM3': result_RM3[1],
    'RM4': result_RM4[1],
    'RM5': result_RM5[1]
}

df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in data.items()]))

print(df)


                RM1                  RM2                     RM3  \
0       radius_mean          radius_mean         smoothness_mean   
1      texture_mean         texture_mean           symmetry_mean   
2   smoothness_mean       perimeter_mean  fractal_dimension_mean   
3  compactness_mean            area_mean          compactness_se   
4    concavity_mean      smoothness_mean       compactness_worst   
5               NaN  concave points_mean                     NaN   
6               NaN        symmetry_mean                     NaN   

                      RM4                   RM5  
0             symmetry_se  concave points_worst  
1              texture_se       perimeter_worst  
2  fractal_dimension_mean   concave points_mean  
3           smoothness_se          radius_worst  
4    fractal_dimension_se        perimeter_mean  
5                     NaN                   NaN  
6                     NaN                   NaN  


In [106]:

data = {
    'RM1': result_RM1[-4:],
    'RM2': result_RM2[-4:],
    'RM3': result_RM3[-4:],
    'RM4': result_RM4[-4:],
    'RM5': result_RM5[-4:]
}
df = pd.DataFrame(data, index=['Accuracy', 'Precision', 'Recall', 'F1-Score'])

print(df)


                RM1       RM2       RM3       RM4       RM5
Accuracy   0.922671  0.940246  0.884007  0.616872  0.945518
Precision  0.982759  0.973404  0.888298  0.350000  0.994536
Recall     0.806604  0.863208  0.787736  0.033019  0.858491
F1-Score   0.886010  0.915000  0.835000  0.060345  0.921519
