In [6]:
import pandas as pd
df = pd.read_csv("balanced.csv")
df

Unnamed: 0,GenHlth,HighBP,Age,HighChol,DiffWalk,Income,HeartDiseaseorAttack,Membership,Diabetes_012,BMI,PhysHlth
0,1,0,3,0,0,5,0,0,0,26,0
1,1,0,3,0,0,5,0,0,0,26,0
2,1,0,3,0,0,5,0,0,0,23,15
3,1,0,3,0,0,5,0,0,0,30,0
4,1,0,3,0,0,5,0,0,0,19,0
...,...,...,...,...,...,...,...,...,...,...,...
15995,4,1,4,1,1,1,1,16,0,37,28
15996,4,1,4,1,0,5,0,16,0,34,0
15997,3,1,4,1,1,2,1,16,0,20,0
15998,3,1,4,1,1,1,0,16,1,34,0


In [9]:
df.columns

Index(['GenHlth', 'HighBP', 'Age', 'HighChol', 'DiffWalk', 'Income',
       'HeartDiseaseorAttack', 'Membership', 'Diabetes_012', 'BMI',
       'PhysHlth'],
      dtype='object')

In [10]:
# Seperate predictor and response
X = df[['HighBP', 'HighChol', 'DiffWalk', 'HeartDiseaseorAttack', 'BMI', 'PhysHlth', 'GenHlth', 'Age', 'Income', 'Membership']]
y = df['Diabetes_012'].astype(int)

In [11]:
################ Unsupervised outlier detection: Interquartile Range (IQR) ####################

def remove_outliers_IQR(group):
    # For BMI
    Q1_BMI = group['BMI'].quantile(0.25)
    Q3_BMI = group['BMI'].quantile(0.75)
    IQR_BMI = Q3_BMI - Q1_BMI
    lower_bound_BMI = Q1_BMI - 1.5 * IQR_BMI
    upper_bound_BMI = Q3_BMI + 1.5 * IQR_BMI
    
    # For PhysHlth
    Q1_PhysHlth = group['PhysHlth'].quantile(0.25)
    Q3_PhysHlth = group['PhysHlth'].quantile(0.75)
    IQR_PhysHlth = Q3_PhysHlth - Q1_PhysHlth
    lower_bound_PhysHlth = Q1_PhysHlth - 1.5 * IQR_PhysHlth
    upper_bound_PhysHlth = Q3_PhysHlth + 1.5 * IQR_PhysHlth

    # Filter the group
    return group[(group['BMI'] >= lower_bound_BMI) & (group['BMI'] <= upper_bound_BMI) &
                 (group['PhysHlth'] >= lower_bound_PhysHlth) & (group['PhysHlth'] <= upper_bound_PhysHlth)]

# Apply the function to each group and concatenate the results
cleaned_X = X.groupby('Membership').apply(remove_outliers_IQR).reset_index(drop=True)

original_count = X.shape[0]
cleaned_count = cleaned_X.shape[0]
outliers_count = original_count - cleaned_count
print(f"Number of outliers removed: {outliers_count}")

Number of outliers removed: 2129


In [2]:
# Drop the 'Membership' column
df = df.drop(columns=['Membership'])
df.columns

Index(['GenHlth', 'HighBP', 'Age', 'HighChol', 'DiffWalk', 'Income',
       'HeartDiseaseorAttack', 'Diabetes_012', 'BMI', 'PhysHlth'],
      dtype='object')

In [3]:
# Seperate predictor and response
X = df[['HighBP', 'HighChol', 'DiffWalk', 'HeartDiseaseorAttack', 'BMI', 'PhysHlth', 'GenHlth', 'Age', 'Income']]
y = df['Diabetes_012'].astype(int)

In [10]:
################ Unsupervised outlier detection: OneClassSVM ####################

from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# categorical and numerical features
categorical_features = ['GenHlth', 'HighBP', 'Age', 'HighChol', 'DiffWalk', 'Income','HeartDiseaseorAttack']
numerical_features = ['BMI','PhysHlth']

# preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# pipeline: preprocessing & modeling
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', OneClassSVM(kernel='rbf', gamma='auto'))
])

# fit
model.fit(X)

# predict outliers
outliers = model.predict(X)

# results
outlier_results = pd.DataFrame({'Outlier': outliers}, index=X.index)
print(outlier_results)

       Outlier
0            1
1            1
2           -1
3            1
4           -1
...        ...
15995       -1
15996       -1
15997       -1
15998       -1
15999       -1

[16000 rows x 1 columns]


In [11]:
# count outlier (-1): no way, half of the data aree outliers -- One class SVM is not very good for this dataset
outlier_results.value_counts()

Outlier
 1         8003
-1         7997
Name: count, dtype: int64

In [17]:
################ Unsupervised outlier detection: DBScan ####################

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN

# categorical and numerical features
categorical_features = ['GenHlth', 'HighBP', 'Age', 'HighChol', 'DiffWalk', 'Income', 'HeartDiseaseorAttack']
numerical_features = ['BMI', 'PhysHlth']

# preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])
X_processed = preprocessor.fit_transform(X)

# DBSCAN model
dbscan = DBSCAN(eps=1, min_samples=5, metric='euclidean')

# fit and predict clusters
clusters = dbscan.fit_predict(X_processed)

# find outlier indices (outliers labeled as -1)
outlier_indices = pd.Series(clusters == -1, index=X.index)

# results
print("Outlier indices:")
print(outlier_indices[outlier_indices].index)

Outlier indices:
Index([   26,   162,   221,   443,   537,   742,   843,  1007,  1086,  1108,
       ...
       15955, 15958, 15961, 15964, 15968, 15971, 15978, 15979, 15989, 15990],
      dtype='int64', length=2953)


In [19]:
# count outlier (-1): 2953 are outliers -- DBScan is not very good for this dataset
outlier_indices.sum()

2953

In [4]:
################ Unsupervised outlier detection: Isolation Forest ####################

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# categorical and numerical features
categorical_features = ['GenHlth', 'HighBP', 'Age', 'HighChol', 'DiffWalk', 'Income', 'HeartDiseaseorAttack']
numerical_features = ['BMI', 'PhysHlth']

# preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# pipeline: preprocessing & modeling
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', IsolationForest(n_estimators=100, random_state=42, contamination='auto'))
])

# fit
pipeline.fit(X)

# predict outliers
outliers = pipeline.predict(X)
outlier_predictions = pd.DataFrame(outliers, index=X.index, columns=['Outlier'])

# Count the outliers
outlier_count = (outlier_predictions['Outlier'] == -1).sum()
print(f"Number of outliers detected: {outlier_count}")

# Merge with original data for better analysis (optional)
#detailed_results = pd.concat([X, outlier_predictions], axis=1)
#print(detailed_results)

Number of outliers detected: 6831
