In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Load the first prepared dataset and preview its first rows.
df1 = pd.read_csv(
    '../Disease-Prediction-Machine-Learning/data/new_dataset1.csv'
)
df1.head()

In [None]:
# Load the second prepared dataset and preview its first rows.
df2 = pd.read_csv(
    '../Disease-Prediction-Machine-Learning/data/new_dataset2.csv'
)
df2.head()


# 2.4 - Välja modell
Välj 3-5 maskininlärningsmodeller, gärna så olika som möjligt. För varje dataset som vi skapade i uppgift 2.3
gör följande:

- train|validation|test split
- skala datasetet med feature standardization och normalization (de görs inte samtidigt, utan i olika omgångar)
- definiera hyperparametrar (param_grids) att testa för varje modell
- använda GridSearchCV() och välja lämplig evalueringsmetric
- gör prediction på valideringsdata
- beräkna och spara evaluation score för ditt valda metric
- checka bästa parametrarna för respektive modell



### Chosen models are:
- 1. Logistic Regression
- 2. KNN 
- 3. Decision Tree
- 4. Random Forest
- 5. Gaussian Naive Bayes

### 1. Logistic Regression: 
###### https://en.wikipedia.org/wiki/Logistic_regression
- Logistic regression is a statistical model used for binary classification problems, which means it predicts the probability of an event taking place based on a set of input features.
- Based on the data, it predicts whether or not a patient has a disease from features such as age, gender, blood pressure, BMI, etc. The logistic function models the relationship between the input features and the output variable.
- It is a widely used algorithm in various fields such as medicine, social sciences and economics.

### 2. K-Nearest Neighbors (KNN):
###### https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
- KNN  is a non-parametric supervised machine learning algorithm used for both classification and regression tasks.
- In this data, KNN can be used for predicting whether a patient has cardiovascular disease or not based on their health metrics.
- It identifies the k closest data points (neighbors) in the training set to the given input data point. The algorithm then assigns the output value of the input data point based on the majority class of the k nearest neighbors.
- It is a widely used algorithm in various fields such as medicine, facial recognition, finance, text mining and recommendation systems.

### 3. Decision Tree:
###### https://en.wikipedia.org/wiki/Decision_tree
- A decision tree is a supervised machine learning algorithm which is easy to understand, as it creates a tree-like structure that can be interpreted and visualized.
- It can handle both categorical and numerical data and can be used for both classification (i.e., predicting a binary outcome such as "disease" or "no disease") and regression (i.e., predicting a continuous outcome such as blood pressure or cholesterol levels).
- It is a widely used algorithm in various fields such as engineering, fraud detection, credit risk analysis and medical diagnosis.

### 4. Random Forest:
###### https://en.wikipedia.org/wiki/Random_forest
- Random forest is a machine learning algorithm used for classification, regression, and other tasks. 
- It is an ensemble method that creates multiple decision trees and combines their outputs to make a final prediction.
- In this algorithm, each tree is trained on a random subset of the original dataset and a random subset of the features, which helps to reduce overfitting and increase the model's accuracy.
- It is a widely used algorithm in various fields such as medicine, finance, marketing, and image and speech recognition.

### 5. Gaussian Naive Bayes:
###### https://en.wikipedia.org/wiki/Naive_Bayes_classifier
- Gaussian Naive Bayes is a probabilistic classification algorithm based on Bayes' theorem, which describes the probability of a hypothesis based on prior knowledge and new evidence.
- It is a widely used algorithm in various fields such as text classification, spam filtering, image recognition and medical diagnosis.

## For first data df1:

- train|validation|test split

In [None]:
# Report the (rows, columns) shape of dataset 1.
print(f'Number of rows and columns of first dataframe: {df1.shape}')

In [None]:
from testing_models import split_data

# Split dataset 1 into train / validation / test parts with "cardio" as the
# target column; the fixed random_state keeps the split repeatable.
(
    X_train1, X_val1, X_test1,
    y_train1, y_val1, y_test1,
) = split_data(df=df1, target_col="cardio", test_size=0.2, random_state=42)

# Show the shape of every part as a quick sanity check of the split.
print(
    f"{X_train1.shape = }\n"
    f"{X_val1.shape = }\n"
    f"{X_test1.shape = }\n"
    f"{y_train1.shape = }\n"
    f"{y_val1.shape = }\n"
    f"{y_test1.shape = }\n"
)


- skala datasetet med feature standardization och normalization

#### Feature scaling is standard

In [None]:
from testing_models import scale_features

# Standard scaling for dataset 1. The result is iterated below as a mapping
# of model name -> pipeline, so the names must match the keys of param_grids.
pipelines = scale_features(scale_type="standard")
pipelines

- define hyperparameters for the chosen models

In [None]:

# Hyperparameter grids, one per model. Keys use the "<step>__<parameter>" form;
# the step prefixes (LR, KNN, DT, RF) presumably match the step names of the
# pipelines returned by scale_features -- verify against testing_models.

# Logistic Regression: regularisation strength, penalty, solver, iteration cap
log_param_grid = [
    {
        'LR__C': [0.01, 0.1, 1, 10, 100],
        'LR__penalty': ['l2'],
        'LR__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
        'LR__max_iter': [1000, 5000, 10000],
    }
]

# KNN: odd neighbour counts from 3 to 11 and both weighting schemes
knn_param_grid = [
    {
        'KNN__n_neighbors': list(range(3, 12, 2)),
        'KNN__weights': ['uniform', 'distance'],
    }
]

# Decision Tree: tree depth and minimum samples needed to split a node
tree_param_grid = [
    {
        'DT__max_depth': [5, 10, 20],
        'DT__min_samples_split': [2, 5, 10],
    }
]

# Random Forest: number of trees plus the same depth / split limits as above
forest_param_grid = [
    {
        'RF__n_estimators': [10, 50, 100, 200],
        'RF__max_depth': [5, 10, 20],
        'RF__min_samples_split': [2, 5, 10],
    }
]

# GaussianNB grid is currently disabled (kept here for reference):
#Gaussian_param_grid = [{'NB__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}]

In [None]:
# Lookup table: model name -> hyperparameter grid. The tuning loops index this
# mapping with the keys of the pipelines mapping, so the names must agree.
param_grids = dict(
    [
        ('Logistic Regression', log_param_grid),
        ('K-Nearest Neighbor', knn_param_grid),
        ('Decision Tree', tree_param_grid),
        ('Random Forest', forest_param_grid),
        # Disabled together with its grid definition:
        # ('Gaussian Naive Bayes', Gaussian_param_grid),
    ]
)

In [8]:
from testing_models import grid_search, evaluate_classification

# File handed to grid_search() for the scores (presumably where it records
# them -- verify in testing_models). Defined once, outside the loop, because
# it never changes and the later tuning cells reuse this variable.
score_file='results/accuracy_scores.txt'

# Tune every standard-scaled pipeline on dataset 1 and evaluate the
# predictions that grid_search() returns against the validation labels.
for model_name, pipeline in pipelines.items():
    param_grid = param_grids[model_name]
    print('=============================================\n')
    print(f'{model_name:}\n')
    y_pred=grid_search(pipeline, param_grid, X_train1, y_train1, X_val1, y_val1,'dataset1_standard',score_file)
    evaluate_classification(model_name,y_val1, y_pred)
    

### Feature scaling is minmax

In [None]:
# Min-max scaling for dataset 1. This rebinds `pipelines`, replacing the
# standard-scaled pipelines created earlier.
pipelines = scale_features(scale_type="minmax")
pipelines

In [None]:
from testing_models import grid_search, evaluate_classification

# Tune every min-max-scaled pipeline on dataset 1.
# `score_file` is the scores path defined in the standard-scaling tuning cell.
for model_name, pipeline in pipelines.items():
    param_grid = param_grids[model_name]
    print('==============================================\n')
    print(f'{model_name:}\n')
    # Keep the predictions returned for THIS model and scaling; without the
    # assignment the evaluation would score a stale y_pred left over from an
    # earlier cell.
    y_pred = grid_search(pipeline, param_grid, X_train1, y_train1, X_val1, y_val1,'dataset1_minmax',score_file)
    evaluate_classification(model_name,y_val1, y_pred)
    

## For second data df2:

- train|validation|test split

In [None]:
# Report the (rows, columns) shape of dataset 2.
print(f'Number of rows and columns of second dataframe: {df2.shape}')

In [None]:
from testing_models import split_data, scale_features

# Split dataset 2 into train / validation / test parts with "cardio" as the
# target column; same settings as used for dataset 1.
(
    X_train2, X_val2, X_test2,
    y_train2, y_val2, y_test2,
) = split_data(df=df2, target_col="cardio", test_size=0.2, random_state=42)

# Show the shape of every part as a quick sanity check of the split.
print(
    f"{X_train2.shape = }\n"
    f"{X_val2.shape = }\n"
    f"{X_test2.shape = }\n"
    f"{y_train2.shape = }\n"
    f"{y_val2.shape = }\n"
    f"{y_test2.shape = }\n"
)


#### Feature scaling is standard

In [None]:
# Standard scaling for dataset 2: mapping of model name -> pipeline.
pipelines_stand = scale_features(scale_type="standard")
pipelines_stand

In [None]:

# Tune every standard-scaled pipeline on dataset 2.
# `score_file` is the scores path defined in the dataset-1 tuning cell.
for name, pipeline in pipelines_stand.items():
    param_grid = param_grids[name]
    print(' ===============================================\n')
    print(f'{name:}\n')
    # Keep the predictions returned for THIS model on dataset 2's validation
    # split; without the assignment y_pred would still hold predictions made
    # for dataset 1 and would be scored against y_val2.
    y_pred = grid_search(pipeline, param_grid, X_train2, y_train2, X_val2, y_val2,'dataset2_standard',score_file)
    evaluate_classification(name,y_val2, y_pred)
    

#### Feature scaling is minmax


In [None]:
# Min-max scaling for dataset 2: mapping of model name -> pipeline.
pipelines_max = scale_features(scale_type="minmax")
pipelines_max

In [None]:

# Tune every min-max-scaled pipeline on dataset 2.
# `score_file` is the scores path defined in the dataset-1 tuning cell.
for name, pipeline in pipelines_max.items():
    param_grid = param_grids[name]
    print(' ==================================================\n')
    print(f'{name:}\n')
    # Keep the predictions returned for THIS model; without the assignment the
    # evaluation would score a stale y_pred from an earlier cell.
    # The run label follows the same naming as 'dataset1_minmax'.
    y_pred = grid_search(pipeline, param_grid, X_train2, y_train2, X_val2, y_val2,'dataset2_minmax',score_file)
    evaluate_classification(name,y_val2, y_pred)
    

Vilket dataset väljer du och vilken modell väljer du? Använd den modellen du valt och träna på all data förutom testdatan.


###### https://www.analyticsvidhya.com/blog/2019/08/11-important-model-evaluation-error-metrics/
- Accuracy is a common metric used to measure overall model performance, but it can be misleading if the dataset is imbalanced, which means one class is much more prevalent than the other. In such cases precision, recall, and F1 score can be more informative.
- So here both F1 score and accuracy are considered when measuring model performance.

- Based on the results saved in the accuracy scores text file, the Random Forest classifier appears to be the best model because it achieved the highest F1 score and accuracy on both datasets.

- Among the datasets, dataset2_standard appears to perform better than dataset1_standard, as it achieved higher F1 and accuracy scores across all models.

In [None]:
# Final model for 2.4: Random Forest on dataset 2 with standard scaling.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix,ConfusionMatrixDisplay


# Pipeline and hyperparameters for Random Forest. The forest is seeded so that
# re-running the notebook reproduces the same model and scores.
random_pipeline = Pipeline(
    [("scaler", StandardScaler()), ("rf", RandomForestClassifier(random_state=42))]
)
# Single-value grid: only the selected parameter combination is fitted.
param_grid_rf = {
    "rf__max_depth": [10],
    "rf__min_samples_split": [10],
    "rf__n_estimators": [100],
}

# NOTE: this rebinds the name `grid_search`, which earlier cells import from
# testing_models as a helper function. After this cell has run, re-running a
# tuning cell that does not re-import the helper itself will fail until the
# import is executed again.
grid_search = GridSearchCV(
    estimator=random_pipeline,
    param_grid=param_grid_rf,
    scoring="f1",
    cv=3,
    verbose=1,
    error_score="raise",
)


# The task asks to train on all data except the test data, so the train and
# validation splits are combined before fitting. Wrapping in DataFrame/Series
# accepts both pandas objects and plain arrays (assumes y is one-dimensional).
X_train_val2 = pd.concat([pd.DataFrame(X_train2), pd.DataFrame(X_val2)])
y_train_val2 = pd.concat([pd.Series(y_train2), pd.Series(y_val2)])
grid_search.fit(X_train_val2, y_train_val2)


# Predictions on the held-out test data, scored with the two chosen metrics.
y_pred = grid_search.predict(X_test2)
print(f"Test accuracy: {accuracy_score(y_test2, y_pred):.3f}")
print(f"Test F1 score: {f1_score(y_test2, y_pred):.3f}")



# 2.5 Ensemble
Använd VotingClassifier() på datasetet som du valt och lägg in de bästa parametrarna för respektive
modell

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Voting classifier over the 5 models with their best parameters.
# Logistic Regression and KNN are sensitive to feature scale, so each gets its
# own StandardScaler step; otherwise the ensemble would fit them on raw
# features although the selected dataset variant is the standard-scaled one.
# Estimators that use randomness are seeded so the result is reproducible.
vote_clf = VotingClassifier(
    [
        (
            "LR",
            Pipeline(
                [
                    ("scaler", StandardScaler()),
                    ("model", LogisticRegression(C=100, max_iter=1000, penalty='l2', solver='saga', random_state=42)),
                ]
            ),
        ),
        (
            "KNN",
            Pipeline(
                [
                    ("scaler", StandardScaler()),
                    ("model", KNeighborsClassifier(n_neighbors=11, weights='distance')),
                ]
            ),
        ),
        ("DT", DecisionTreeClassifier(max_depth=10, min_samples_split=2, random_state=42)),
        ("RF", RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=100, random_state=42)),
        ("NB", GaussianNB(var_smoothing=1e-09)),
    ],
    # Hard voting: the predicted class is the majority vote of the members.
    voting="hard",
)


# Train the voting classifier
vote_clf.fit(X_train2, y_train2)

# Evaluate the accuracy of the voting classifier on the test set
y_pred_clf = vote_clf.predict(X_test2)
accuracy = accuracy_score(y_test2, y_pred_clf)
print(f'Accuracy of the voting classifier: {accuracy*100:.3f}%')


# 2.6 Evalueringar
Gör confusion matrices och classification reports för 2.4 och 2.5.

In [None]:
def evaluate_model(model):
    """Fit `model` on dataset 2's training split and report test performance.

    Reads the notebook-level variables X_train2, y_train2, X_test2 and
    y_test2. Prints a classification report and draws a confusion matrix for
    the predictions on the test split.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so pass a fresh or cloned estimator to keep a fitted one intact.
    """
    model.fit(X_train2, y_train2)
    y_pred = model.predict(X_test2)

    print(classification_report(y_test2, y_pred))
    cm = confusion_matrix(y_test2, y_pred)
    # confusion_matrix orders rows/columns by sorted label value, so the first
    # label shown must belong to the smallest class value.
    # Assumes cardio is encoded 0 = no disease, 1 = disease -- confirm against the data.
    ConfusionMatrixDisplay(cm, display_labels=["No", "Yes"]).plot()

In [None]:
# For dataset 2: evaluate the tuned, standard-scaled Random Forest pipeline
# selected in 2.4 rather than a default, unscaled RandomForestClassifier.
# clone() yields an unfitted copy with the same hyperparameters, so the refit
# inside evaluate_model() does not touch the fitted search object.
from sklearn.base import clone

evaluate_model(clone(grid_search.best_estimator_))

In [None]:

# Classification report and confusion matrix for the 2.5 voting ensemble
evaluate_model(vote_clf)


# 2.7 "Deploy" - spara modell
Börja med att plocka ut 100 slumpmässigt valda rader från ditt dataset. Exportera dessa 100 samples i
test_samples.csv. Därefter tar du den bästa modellen och träna på all data vi har förutom de 100
datapunkterna du plockade ut. Spara därefter modellen i en .pkl-fil med hjälp av joblib.dump(). För
modellen kan du behöva använda argumentet compress för att komprimera om filstorleken för stor.

In [None]:
# Hold out 100 randomly chosen rows of dataset 2 (fixed seed, so the same rows
# are drawn on every run) and export them without the index column.
random_samples = df2.sample(n=100, random_state=42)
random_samples.to_csv('data/test_samples.csv', index=False)


# Everything that was not held out is the training data:
# `y` is the "cardio" target column, `X` holds all remaining columns.
df = df2.drop(random_samples.index)
y = df['cardio']
X = df.drop('cardio', axis=1)

