In [None]:
import pandas as pd

file_path = '/content/sample_data/heart_disease_uci.csv'
df = pd.read_csv(file_path)

columns_with_missing_values = ['trestbps', 'chol', 'thalch', 'oldpeak','fbs', 'restecg', 'ca', 'slope', 'thal']

df.dropna(subset=columns_with_missing_values, inplace=True)

new_file_path = '/content/sample_data/missing.csv'
df.to_csv(new_file_path, index=False)


In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
file_path = '/content/sample_data/missing.csv'
df = pd.read_csv(file_path)
label_encoder = LabelEncoder()
columns=['sex','cp','fbs','restecg','exang','slope','thal']
for column in columns:
    df[column] = label_encoder.fit_transform(df[column])

new_file_path = '/content/sample_data/encoded.csv'
df.to_csv(new_file_path, index=False)

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
file_path = '/content/sample_data/encoded.csv'
df = pd.read_csv(file_path)

columns_tonormalize = ['age','cp','trestbps', 'chol', 'restecg', 'thalch', 'oldpeak', 'slope', 'ca', 'thal']
scaler = MinMaxScaler()
df[columns_tonormalize] = scaler.fit_transform(df[columns_tonormalize])

new_file_path = '/content/sample_data/normalized.csv'
df.to_csv(new_file_path, index=False)

In [None]:
from sklearn.ensemble import IsolationForest
import pandas as pd
file_path = '/content/sample_data/normalized.csv'
df = pd.read_csv(file_path)

isolation_forest = IsolationForest(contamination=0.1, random_state=42)
outliers = isolation_forest.fit_predict(df)

df_filtered = df[outliers != -1]

new_file_path = '/content/sample_data/outlier.csv'
df_filtered.to_csv(new_file_path, index=False)

In [None]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
file_path = '/content/sample_data/outlier.csv'
df = pd.read_csv(file_path)

X = df.drop('num', axis=1)
y = df['num']


smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['num'] = y_resampled
resampled_df = resampled_df.sample(frac=1)
new_file_path = '/content/sample_data/smote.csv'
resampled_df.to_csv(new_file_path, index=False)


In [None]:
from sklearn.ensemble import IsolationForest
import pandas as pd
file_path = '/content/sample_data/smote.csv'
df = pd.read_csv(file_path)

isolation_forest = IsolationForest(contamination=0.1, random_state=42)
outliers = isolation_forest.fit_predict(df)

df_filtered = df[outliers != -1]

new_file_path = '/content/sample_data/outlier.csv'
df_filtered.to_csv(new_file_path, index=False)

In [None]:
import pandas as pd
from math import log2

def calculate_entropy(data):

    value_counts = data.value_counts(normalize=True)
    entropy = -sum(p * log2(p) for p in value_counts)
    return entropy

def calculate_joint_entropy(x, y):

    joint_data = pd.concat([x, y], axis=1)
    joint_entropy = calculate_entropy(joint_data)
    return joint_entropy

def calculate_su(x, y):

    entropy_x = calculate_entropy(x)
    entropy_y = calculate_entropy(y)
    joint_entropy_xy = calculate_joint_entropy(x, y)
    su = 2 * (entropy_x - joint_entropy_xy) / (entropy_x + entropy_y)
    return su

def feature_selection_by_su(data, target_column, threshold=0.7):

    su_values = {} #  su between feature and target
    for column in data.columns:
        if column != target_column:
            su_values[column] = calculate_su(data[column], data[target_column])

    selected_columns = []
#    to remove less correlated features
    sorted_su_values = sorted(su_values.items(), key=lambda x: x[1], reverse=True)
    last_item_su_value = sorted_su_values[-1][1]
    if -0.02 <= last_item_su_value <= 0:
        column_to_drop = sorted_su_values[-1][0]
        data.drop(columns=[column_to_drop], inplace=True)

#    su between feature to feature
    su_values = {}
    for column in data.columns:
        if column != target_column:
            su_values[column] = []
            for other_column in data.columns:
                if other_column != column and other_column != target_column:
                    su_value = calculate_su(data[column], data[other_column])
                    su_values[column].append((other_column, su_value))

    print('Target correlation')
    print(sorted_su_values)
#    for redundant features removal
    tosort = []
    highly_correlated_features = []

    for feature, correlations in su_values.items():
        for correlation in correlations:
            other_feature, su_value = correlation
            tosort.append((feature, other_feature, su_value))

#     unique_tosort = set(tosort)
#     print(len(unique_tosort))
    sorted_unique_tosort = sorted(tosort, key=lambda x: abs(x[2]))
    print('Overlapping of features')
    print(sorted_unique_tosort)

    index_map = {feature: i for i, (feature, _) in enumerate(sorted_su_values)}

    features_to_remove = []

    for i in range(-3, 0):
        f1, f2, _ = sorted_unique_tosort[i]
        if f1 in [item[0] for item in sorted_su_values] and f2 in [item[0] for item in sorted_su_values]:
            f1_index = index_map[f1]
            f2_index = index_map[f2]
            feature_to_remove = f1 if f1_index > f2_index else f2
#             print(feature_to_remove)
            features_to_remove.append(feature_to_remove)


    if features_to_remove:
        max_index = max(index_map[feature] for feature in features_to_remove)
        feature_to_remove = next(feature for feature, index in index_map.items() if index == max_index)
        data.drop(columns=[feature_to_remove], inplace=True)



    selected_columns.extend(data.columns)
    return data, selected_columns


file_path = '/content/sample_data/smoteagain.csv'
data = pd.read_csv(file_path)
target_column = 'num'
# print(data.columns)
updated_data, selected_columns = feature_selection_by_su(data, target_column)
new_file_path = '/content/sample_data/fcbfdata.csv'
updated_data.to_csv(new_file_path, index=False)

print("Columns selected by SU feature selection method:")
print(selected_columns)



Target correlation
[('chol', -0.013593902300911086), ('thalch', -0.021985098843714235), ('age', -0.04193520158417837), ('oldpeak', -0.05294047079278687), ('trestbps', -0.053183258567897254), ('ca', -0.21501819608645567), ('thal', -0.4901766309415492), ('cp', -0.5767915480527931), ('restecg', -0.6593061012316944), ('slope', -0.6795691295407513), ('exang', -1.2827238980468072), ('sex', -1.735960382385049), ('fbs', -1.9835528936757367)]
Overlapping of features
[('chol', 'fbs', -0.0006249783293560339), ('thalch', 'fbs', -0.00103305780782925), ('age', 'fbs', -0.001050643189835077), ('trestbps', 'fbs', -0.0012955393460958053), ('oldpeak', 'fbs', -0.0016603698369147498), ('chol', 'sex', -0.0022034778988785134), ('ca', 'fbs', -0.004293183637767436), ('thalch', 'sex', -0.005484189219011797), ('age', 'sex', -0.006693012022777328), ('thal', 'fbs', -0.007041948734600569), ('chol', 'thal', -0.00896177533245242), ('chol', 'restecg', -0.009180538900846729), ('chol', 'exang', -0.009489248362598532), (

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

file_path = '/content/sample_data/fcbfdata.csv'
data = pd.read_csv(file_path)

target_column = "num"

X = data.drop(columns=[target_column])
y = data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

et_classifier = ExtraTreesClassifier(n_estimators=100, random_state=42)


et_classifier.fit(X_train, y_train)

y_pred = et_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.9637681159420289


In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

file_path = r'C:\Users\THASNEEM FATHIMA\Downloads\heart_disease_uci.csv'
df = pd.read_csv(file_path)
columns_with_missing_values = ['trestbps', 'chol', 'thalch', 'oldpeak','fbs', 'restecg', 'ca', 'slope', 'thal']
# Drop missing values from specified columns
df.dropna(subset=columns_with_missing_values, inplace=True)
# Encoding string to numerical
label_encoder = LabelEncoder()
columns=['sex','cp','fbs','restecg','exang','slope','thal']
for column in columns:
    df[column] = label_encoder.fit_transform(df[column])
# Normalization
columns_tonormalize = ['age','cp','trestbps', 'chol', 'restecg', 'thalch', 'oldpeak', 'slope', 'ca', 'thal']
scaler = MinMaxScaler()
df[columns_tonormalize] = scaler.fit_transform(df[columns_tonormalize])
# Outlier removal
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
outliers = isolation_forest.fit_predict(df)
df_filtered = df[outliers != -1]
# SMOTE - Oversampling
X = df.drop('num', axis=1)
y = df['num']
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['num'] = y_resampled
resampled_df = resampled_df.sample(frac=1)
new_file_path = r'C:\Users\THASNEEM FATHIMA\Downloads\resampled_dataset.csv'
resampled_df.to_csv(new_file_path, index=False)
print(resampled_df)

          age  sex        cp  trestbps      chol  fbs  restecg    thalch  \
67   0.520833    1  0.666667  0.528302  0.284483    0  0.00000  0.717557   
137  0.125000    1  0.000000  0.245283  0.211207    0  0.50000  0.450382   
461  0.504182    0  0.000000  0.599141  0.467896    0  0.77509  0.436989   
578  0.452121    1  0.000000  0.150943  0.208647    0  0.00000  0.293233   
17   0.520833    1  0.000000  0.433962  0.299569    0  0.50000  0.679389   
..        ...  ...       ...       ...       ...  ...      ...       ...   
760  0.748302    0  0.000000  0.528302  0.285379    0  0.00000  0.340686   
765  0.614863    1  0.000000  0.501108  0.252101    0  0.00000  0.475486   
751  0.593214    1  0.000000  0.482587  0.318808    0  0.00000  0.515777   
626  0.484631    1  0.000000  0.314878  0.373222    0  0.50000  0.402161   
160  1.000000    1  0.000000  0.292453  0.439655    0  0.00000  0.694656   

     exang   oldpeak  slope        ca      thal  num  
67       0  0.258065    1.0  0.0

In [None]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.
