In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split , GridSearchCV , cross_validate, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier , GradientBoostingClassifier , AdaBoostClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score ,f1_score, confusion_matrix , classification_report, recall_score, precision_score

import pickle 

: 

In [None]:
# Load the arrhythmia dataset from a CSV in the working directory; `data` is the raw frame used by the cells below.
data = pd.read_csv('arrhythmia.csv')

In [None]:
# Display (n_rows, n_columns) of the raw frame.
data.shape

In [None]:
# Keep the raw column index. NOTE: `column` is rebound to a hard-coded list of four names in a later cell.
column=data.columns

In [None]:
# Show how many rows carry each value of the `class` target.
data['class'].value_counts()

In [None]:
# Human-readable diagnosis names, kept in their original order.
# NOTE(review): presumably entry i names class code i + 1 of data['class'] — confirm against the dataset description.
cols = [
    'Normal',
    'Ischemic changes (Coronary Artery Disease)',
    'Old Anterior Myocardial Infarction',
    'Old Inferior Myocardial Infarction',
    'Sinus tachycardy',
    'Sinus bradycardy',
    'Ventricular Premature Contraction (PVC)',
    'Supraventricular Premature Contraction',
    'Left bundle branch block',
    'Right bundle branch block',
    '1. degree AtrioVentricular block',
    '2. degree AV block',
    '3. degree AV block',
    'Left ventricule hypertrophy',
    'Atrial Fibrillation or Flutter',
    'Others',
]

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Display summary statistics of the raw frame.
data.describe()

In [None]:
# Target vector: the `class` column.
y = data['class']

In [None]:
# Feature matrix: every column except the `class` target.
X = data.drop('class',axis=1)

In [None]:
# Names of the feature columns that contain at least one missing value.
a = [col for col in X.columns if X[col].isnull().sum()]

In [None]:
# Report, for every column with missing values, the share of rows that are NaN.
n_rows = X.shape[0]
for i in a:
    nan_count = data[i].isna().sum()
    print(f"Column {i} has {nan_count/n_rows*100} % NaN values")

In [None]:
# Column 'J' is dropped because most of its values are missing (about 83% according to the author);
# the remaining columns with gaps are filled later with a SimpleImputer.
X_new = X.drop(columns='J')

In [None]:
# Columns whose distributions are plotted and whose nulls are re-checked below (rebinds `column`).
# NOTE(review): presumably these are the remaining columns listed in `a` after dropping 'J' — confirm.
column = ['T','P','QRST','heartrate']

In [None]:
# Display the list of columns that contain missing values.
a

In [None]:
# Plot the distribution of each selected column together with its mean and median.
for col in column:
    # Exactly one figure per column. The original opened a second, larger figure
    # first, which was left empty and rendered as a blank output on every iteration.
    plt.figure(figsize=(8, 6))

    # Filled KDE curve; `fill=True` replaces the deprecated `shade=True` keyword.
    sns.kdeplot(X_new[col], fill=True)

    # Vertical reference lines for mean and median (pandas skips NaN by default).
    mean_val = X_new[col].mean()
    median_val = X_new[col].median()
    plt.axvline(x=mean_val, color='skyblue', linestyle='--', label='Mean')
    plt.axvline(x=median_val, color='orange', linestyle='--', label='Median')

    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.title(f'Distribution of {col} with Mean and Median')
    plt.legend()

    plt.show()

In [None]:
# Imputer that replaces missing values with the column mean.
Si = SimpleImputer(strategy='mean')

In [None]:
# Fit the imputer on all rows of X_new and fill the gaps.
# NOTE(review): fitting happens before any train/test split, so the fill values use every row.
X_transform = Si.fit_transform(X_new)

In [None]:
# Wrap the imputed values in a DataFrame carrying the column names of X_new.
X_t= pd.DataFrame(X_transform , columns=X_new.columns)

In [None]:
# Preview the first rows of the imputed feature frame.
X_t.head()

In [None]:
# Sanity check: print the number of missing values left in each selected column of the imputed frame.
for col in column:
    remaining = X_t[col].isna().sum()
    print(remaining)

In [None]:
# Per-class target counts (100 each) for a resampler.
# NOTE(review): no visible cell passes this dict to a sampler — confirm whether it is still needed.
sampling_strategy = {
    label: 100
    for label in (2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 15, 16)
}

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the imputed features X_t and target y (seeded with random_state=42).
# This resamples the full dataset, i.e. it is not restricted to a training portion.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_t, y)

In [None]:
# Display (n_rows, n_columns) after oversampling.
X_resampled.shape

In [None]:
# Hold out the test set BEFORE oversampling. Splitting an already oversampled
# frame puts copies of the same minority rows in both train and test, which
# leaks test data into training and inflates every reported score.
# NOTE(review): X_t was imputed with statistics from all rows; moving the
# imputer into the pipelines would remove that remaining (smaller) leak.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_t, y, test_size=0.2, random_state=42
)

# Balance only the training portion; the test set keeps the original class distribution.
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train_raw, y_train_raw)

In [None]:
# Scores of the models trained on the balanced split, keyed by model name (filled by train_models).
metrics = {}

In [None]:
def train_models(pipeline,name):
    """Fit ``pipeline`` on the balanced training split and evaluate it on the test split.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Stores accuracy and weighted precision/recall/F1 in the global ``metrics``
    dict under ``name``, prints a classification report and shows a confusion
    matrix heatmap.

    Parameters
    ----------
    pipeline : sklearn Pipeline whose final step is named ``'classifier'``.
    name : key under which the scores are stored in ``metrics``.

    Returns
    -------
    tuple of (the global ``metrics`` dict, the fitted ``pipeline``).
    """
    pipeline.fit(X_train, y_train)

    # Score the held-out predictions.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test,y_pred,average='weighted')

    metrics[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall , 'f1':f1}

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Use one explicit label set for both the matrix and the tick labels.
    # Without `labels=`, confusion_matrix only covers labels present in
    # y_test/y_pred, which can be fewer than classifier.classes_ and would
    # silently shift the heatmap tick labels.
    labels = np.union1d(pipeline.named_steps['classifier'].classes_, y_test)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels,
                yticklabels=labels)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    return metrics , pipeline

In [None]:
# Display the test-set target values.
y_test

In [None]:
# Display (n_rows, n_columns) of the test features.
X_test.shape

Algorithms on the Balanced Dataset

In [None]:
# Standardise the features, then fit a logistic regression on the balanced split.
logistic_clf1 = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
])

logistic_metrics, logistic_model = train_models(logistic_clf1, "Logistic Regression")

# Persist the fitted pipeline; the context manager closes the file handle.
filename = 'logistc_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(logistic_model, model_file)

In [None]:
# Standardise the features, then fit a support vector classifier on the balanced split.
svc_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])

svc_metrics , svc_model=train_models(svc_clf , "svc_model")

# Persist the fitted pipeline (svc_model is the same object as svc_clf).
filename1 = 'svc_model.pkl'
with open(filename1, 'wb') as model_file:
    pickle.dump(svc_model, model_file)

# Round-trip check: reload the file that was just written and predict with it.
# Only unpickle files you created yourself; pickle can execute arbitrary code.
with open(filename1, 'rb') as model_file:
    load_model = pickle.load(model_file)

y_pred = load_model.predict(X_test)

In [None]:
# Standardise the features, then fit a random forest on the balanced split.
random_clf =Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())
    ])

randmon_metrics , random_model = train_models(random_clf , "Random_Forest")

# Persist the fitted pipeline; the context manager closes the file handle.
filename = 'random_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(random_model, model_file)


In [None]:
# Standardise the features, then fit a k-nearest-neighbours classifier on the balanced split.
knn =Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier())
    ])

knnmetrics , knn_model = train_models(knn , "Knn")

# Persist the fitted pipeline; the context manager closes the file handle.
filename = 'knn.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(knn_model, model_file)


Algorithms on the Imbalanced Dataset

In [None]:
# Split the imputed data without any resampling (imbalanced classes).
# A fixed random_state makes the split, and every score derived from it, reproducible.
X_train1,X_test1,y_train1,y_test1=train_test_split(X_t,y,test_size=0.2,random_state=42)

In [None]:
# Scores of the models trained on the imbalanced split, keyed by model name (filled by train_models1).
metrics1 = {}

In [None]:
def train_models1(pipeline,name):
    """Fit ``pipeline`` on the imbalanced training split and evaluate it on its test split.

    Reads the notebook globals ``X_train1``, ``y_train1``, ``X_test1`` and
    ``y_test1``. Stores accuracy and weighted precision/recall/F1 in the global
    ``metrics1`` dict under ``name``, prints a classification report and shows
    a confusion matrix heatmap.

    Parameters
    ----------
    pipeline : sklearn Pipeline whose final step is named ``'classifier'``.
    name : key under which the scores are stored in ``metrics1``.

    Returns
    -------
    tuple of (the global ``metrics1`` dict, the fitted ``pipeline``).
    """
    pipeline.fit(X_train1, y_train1)

    # Score the held-out predictions.
    y_pred = pipeline.predict(X_test1)
    accuracy = accuracy_score(y_test1, y_pred)
    precision = precision_score(y_test1, y_pred, average='weighted')
    recall = recall_score(y_test1, y_pred, average='weighted')
    f1 = f1_score(y_test1,y_pred,average='weighted')

    metrics1[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall , 'f1':f1}

    print("Classification Report:")
    print(classification_report(y_test1, y_pred))

    # Use one explicit label set for both the matrix and the tick labels.
    # On the imbalanced split rare classes are easily missing from the test
    # rows, so the default matrix can be smaller than classifier.classes_ and
    # the heatmap tick labels would be shifted.
    labels = np.union1d(pipeline.named_steps['classifier'].classes_, y_test1)
    cm = confusion_matrix(y_test1, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels,
                yticklabels=labels)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # Return the dict this function actually fills (the original returned `metrics`).
    return metrics1 , pipeline

In [None]:
# Standardise the features, then fit a logistic regression on the imbalanced split.
logistic_clf2 = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
])

# NOTE: this rebinds `logistic_model`, which previously held the balanced-data pipeline.
logistic_metrics1, logistic_model = train_models1(logistic_clf2, "Logistic Regression")

# Persist the fitted pipeline; the context manager closes the file handle.
filename = 'logistc_model1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(logistic_model, model_file)

In [None]:
# Standardise the features, then fit a support vector classifier on the imbalanced split.
svc_clf1 = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])

svc_metrics1 , svc_model1=train_models1(svc_clf1 , "svc_model")

# Persist the fitted pipeline; the context manager closes the file handle.
filename1 = 'svc_model1.pkl'
with open(filename1, 'wb') as model_file:
    pickle.dump(svc_model1, model_file)


In [None]:
# Standardise the features, then fit a random forest on the imbalanced split.
random_clf1 =Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())
    ])

# Train the pipeline built in this cell. The original passed `random_clf`,
# which refitted the balanced-data pipeline and left `random_clf1` unused.
randmon_metrics1 , random_model1 = train_models1(random_clf1 , "Random_Forest")

# Persist the fitted pipeline; the context manager closes the file handle.
filename = 'random_model1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(random_model1, model_file)

In [None]:
# Standardise the features, then fit a k-nearest-neighbours classifier on the imbalanced split.
knn1 =Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier())
    ])

# Train the pipeline built in this cell. The original passed `knn`,
# which refitted the balanced-data pipeline and left `knn1` unused.
knnmetrics1 , knn_model1 = train_models1(knn1 , "Knn")

# Persist the fitted pipeline; the context manager closes the file handle.
filename = 'knn1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(knn_model1, model_file)

# Comparison of algorithms when the data is balanced

In [None]:
# Gather each score across the models stored in `metrics`, in insertion order.
algos = list(metrics)
accuracy_values, precision_values, recall_values, f1_values = (
    [metrics[algo][key] for algo in algos]
    for key in ('Accuracy', 'Precision', 'Recall', 'f1')
)

# One line chart per score on a 2x2 grid; title and legend label are the same text.
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
panels = [
    (ax[0, 0], accuracy_values, 'b', 'Accuracy'),
    (ax[0, 1], precision_values, 'g', 'Precision'),
    (ax[1, 0], recall_values, 'r', 'Recall'),
    (ax[1, 1], f1_values, 'orange', 'F1 Score'),
]
for panel, values, colour, title in panels:
    panel.plot(algos, values, marker='o', color=colour, label=title)
    panel.set_title(title)
    panel.legend()

plt.tight_layout()
plt.show()

# Comparison of algorithms when the data is imbalanced

In [None]:
# Gather each score across the models stored in `metrics1`, in insertion order.
algos = list(metrics1)
accuracy_values, precision_values, recall_values, f1_values = (
    [metrics1[algo][key] for algo in algos]
    for key in ('Accuracy', 'Precision', 'Recall', 'f1')
)

# One line chart per score on a 2x2 grid; title and legend label are the same text.
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
panels = [
    (ax[0, 0], accuracy_values, 'b', 'Accuracy'),
    (ax[0, 1], precision_values, 'g', 'Precision'),
    (ax[1, 0], recall_values, 'r', 'Recall'),
    (ax[1, 1], f1_values, 'orange', 'F1 Score'),
]
for panel, values, colour, title in panels:
    panel.plot(algos, values, marker='o', color=colour, label=title)
    panel.set_title(title)
    panel.legend()

plt.tight_layout()
plt.show()

Conclusion:
Random Forest and Logistic Regression perform well on the imbalanced dataset and continue to outperform the other algorithms after the dataset is balanced. This suggests that they are well suited to imbalanced data.

HYBRID MODEL 

In [None]:
## The two best models are Random Forest and Logistic Regression

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble of random forest and logistic regression on the balanced split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf = RandomForestClassifier(n_estimators=100, random_state=1)
lr = LogisticRegression(random_state=1)

hybrid_model = VotingClassifier(estimators=[('rf', rf), ('lr', lr)], voting='soft')
hybrid_model.fit(X_train_scaled, y_train)

# Persist the model AFTER fitting. The original dumped it before `fit`,
# so 'hybrid.pkl' contained an unfitted estimator that could not predict.
filename = 'hybrid.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(hybrid_model, model_file)

y_pred = hybrid_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Hybrid Model Accuracy:", accuracy)

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test,y_pred,average='weighted')

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Use one explicit label set for the matrix and the tick labels so they cannot drift apart.
labels = np.union1d(hybrid_model.classes_, y_test)
cm = confusion_matrix(y_test, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels,
            yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Soft-voting ensemble of random forest and logistic regression on the imbalanced split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train1)
X_test_scaled = scaler.transform(X_test1)

rf = RandomForestClassifier(n_estimators=100, random_state=1)
lr = LogisticRegression(random_state=1)

hybrid_model = VotingClassifier(estimators=[('rf', rf), ('lr', lr)], voting='soft')
hybrid_model.fit(X_train_scaled, y_train1)

# Persist the model AFTER fitting. The original dumped it before `fit`,
# so 'hybrid1.pkl' contained an unfitted estimator that could not predict.
filename = 'hybrid1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(hybrid_model, model_file)

y_pred = hybrid_model.predict(X_test_scaled)

# Scores for the imbalanced run live in the *1 names, so the balanced
# `accuracy` value is no longer overwritten here.
accuracy1 = accuracy_score(y_test1, y_pred)
print("Hybrid Model Accuracy:", accuracy1)

precision1 = precision_score(y_test1, y_pred, average='weighted')
recall1 = recall_score(y_test1, y_pred, average='weighted')
f11 = f1_score(y_test1,y_pred,average='weighted')

print("Classification Report:")
print(classification_report(y_test1, y_pred))

# Use one explicit label set for the matrix and the tick labels; rare classes
# can be missing from the test rows, which would otherwise shift the tick labels.
labels = np.union1d(hybrid_model.classes_, y_test1)
cm = confusion_matrix(y_test1, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels,
            yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

## The hybrid model performs better on the balanced dataset than on the imbalanced dataset

# Training the hybrid model on the best 7 features

In [None]:
# Refit a scaler on the balanced training features (all columns) and scale the test features with it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fresh, unfitted base estimators with fixed seeds.
rf = RandomForestClassifier(n_estimators=100, random_state=1)
lr = LogisticRegression(random_state=1)

# Unfitted soft-voting ensemble; it is fitted in the following cell.
hybrid_model = VotingClassifier(estimators=[('rf', rf), ('lr', lr)], voting='soft')


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 7 features with the highest ANOVA F-score (the original comment said 10, the code uses k=7).
selector = SelectKBest(score_func=f_classif, k=7)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Names of the selected features, taken from the training frame's columns.
selected_feature_names = X_train.columns[selector.get_support()]

# Train the ensemble on the selected features only.
hybrid_model.fit(X_train_selected, y_train)

y_pred = hybrid_model.predict(X_test_selected)

# Evaluate on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Selected Features:", selected_feature_names)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Use one explicit label set for the matrix and the tick labels so they cannot drift apart.
labels = np.union1d(hybrid_model.classes_, y_test)
cm = confusion_matrix(y_test, y_pred, labels=labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels,
            yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Persist the scaler; the context manager closes the file handle.
# NOTE(review): `scaler` is whichever StandardScaler was fitted last — confirm it matches the saved model.
filename = 'scaler.pkl'
with open(filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Persist the current hybrid model; the context manager closes the file handle.
# NOTE(review): if this model was fitted on SelectKBest output, the selector (or the
# selected column names) must be saved too, otherwise the pickle cannot be used on raw features.
filename = 'hybrid_features.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(hybrid_model, model_file)

In [None]:
# Display the most recently computed precision, recall, F1 and accuracy as a tuple.
precision , recall ,f1 ,accuracy

In [None]:
# Accuracy of every model stored in `metrics`, in insertion order.
# Reading the dict directly avoids relying on `algos`, which an earlier cell
# last bound to the keys of a different dict (`metrics1`).
accuracy_values = [scores['Accuracy'] for scores in metrics.values()]

In [None]:
# Display the scores collected for the balanced-data models.
metrics

In [None]:
# Display names used on the x-axis of the comparison bar charts.
al = [
    "Logistic Regression ",
    "SVM",
    "Random Forest",
    "KNN",
    "Hybrid Model",
]

In [None]:
# Precision, recall and F1 of every model stored in `metrics`, in insertion order.
# Reading the dict directly avoids relying on `algos`, which an earlier cell
# last bound to the keys of a different dict (`metrics1`).
precision_values = [scores['Precision'] for scores in metrics.values()]
recall_values = [scores['Recall'] for scores in metrics.values()]
f1_values = [scores['f1'] for scores in metrics.values()]

In [None]:
# Display the per-model precision list.
precision_values


In [None]:
# Display the per-model recall list.
recall_values

In [None]:
# Display the per-model F1 list.
f1_values

In [None]:
# Display the per-model accuracy list.
accuracy_values


In [None]:
# Hard-coded accuracy per bar of the comparison chart (five entries, same length as `al`).
# NOTE(review): presumably copied from an earlier run; these are not recomputed from `metrics` — confirm they are current.
acc_vals = [
    0.9748822605965463,
    0.9748822605965463,
    0.9921507064364207,
    0.9434850863422292,
    0.989010989010989,
]

In [None]:
# Hard-coded weighted precision per bar of the comparison chart (five entries, same length as `al`).
# NOTE(review): presumably copied from an earlier run; these are not recomputed from `metrics` — confirm they are current.
prec_vals = [
    0.9775669609794883,
    0.9754963215614182,
    0.9925072069644185,
    0.9404076722784118,
    0.9896211231745446,
]

In [None]:
# Hard-coded weighted recall per bar of the comparison chart (five entries, same length as `al`).
# NOTE(review): presumably copied from an earlier run; these are not recomputed from `metrics` — confirm they are current.
rec_vals = [
    0.9748822605965463,
    0.9748822605965463,
    0.9921507064364207,
    0.9434850863422292,
    0.989010989010989,
]

In [None]:
# Hard-coded weighted F1 per bar of the comparison chart (five entries, same length as `al`).
# NOTE(review): presumably copied from an earlier run; these are not recomputed from `metrics` — confirm they are current.
f1_vals = [
    0.9734627547590529,
    0.9745478173570239,
    0.9920554087809106,
    0.9376609456335065,
    0.9887950346737808,
]

# COMPARISON OF ALL MODELS

## FOR BALANCED DATASET  

In [None]:
# Bar chart of accuracy per algorithm; the y-axis is zoomed to 0.9–1.0.
plt.figure(figsize=(10, 6))
plt.bar(al, acc_vals, color='skyblue')
plt.gca().set(
    xlabel='Algorithms',
    ylabel='Accuracy',
    title='Accuracy of Different Algorithms',
    ylim=(0.9, 1.0),
)
plt.show()

In [None]:
# Bar chart of recall per algorithm; the y-axis is zoomed to 0.9–1.0.
plt.figure(figsize=(10, 6))
plt.bar(al, rec_vals, color='green')
plt.gca().set(
    xlabel='Algorithms',
    ylabel='Recall',
    title='Recall of Different Algorithms',
    ylim=(0.9, 1.0),
)
plt.show()

In [None]:
# Bar chart of F1 score per algorithm; the y-axis is zoomed to 0.9–1.0.
plt.figure(figsize=(10, 6))
plt.bar(al, f1_vals, color='orange')
plt.gca().set(
    xlabel='Algorithms',
    ylabel='F1 Score',
    title='F1 score of Different Algorithms',
    ylim=(0.9, 1.0),
)
plt.show()

In [None]:
# Bar chart of precision per algorithm; the y-axis is zoomed to 0.9–1.0.
plt.figure(figsize=(10, 6))
plt.bar(al, prec_vals, color='brown')
plt.gca().set(
    xlabel='Algorithms',
    ylabel='Precision',
    title='Precision of Different Algorithms',
    ylim=(0.9, 1.0),
)
plt.show()