# Prediction Analysis

# Import the necessary packages

In [3]:
# Import the necessary packages
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np


# Prepare the dataset

In [4]:
# Load the merged dataset. A failed load is reported and then re-raised:
# everything below depends on `data`, so carrying on after a failure would
# only resurface as an unrelated NameError on the next statement.
try:
    data = pd.read_csv('merged_dataset_NIH.csv')
    print("Data loaded successfully.")
except FileNotFoundError:
    print("File not found. Please check the file path.")
    raise
except pd.errors.EmptyDataError:
    print("No data found. Please check the file.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

# Quick look at the loaded frame
print(data.head())
# info() writes its report to stdout itself and returns None, so it is
# called directly instead of being wrapped in print() (which printed "None").
data.info()
print(data.describe())

# Feature matrix: drop the identifier and the target, then keep numeric
# columns only. Non-numeric columns are discarded here, not encoded.
X = data.drop(['ID', 'NIH'], axis=1)
X = X.select_dtypes(include=[np.number])

# Target variable y
y = data['NIH']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Data loaded successfully.
                    ID sex  interview_age  NIH race_group   V1     V2    V3  \
0  sub-NDARINVRL863G1R   F            155    1  rg5_asian  0.0  297.0  46.0   
1  sub-NDARINV18YX7994   F            149    1  rg5_asian  0.0   34.0  37.0   
2  sub-NDARINV58VGXCL7   M            154    1  rg1_white  0.0  138.0  10.0   
3  sub-NDARINVYYBL28AM   M            149    1  rg1_white  0.0  122.0   9.0   
4  sub-NDARINVNNA7JH41   M            148    1  rg1_white  0.0  181.0  23.0   

      V4    V5  ...  V3732  V3733   V3734   V3735  V3736  V3737  V3738  V3739  \
0   43.0   0.0  ...    0.0    0.0   786.0  3213.0   16.0    0.0  198.0    0.0   
1   48.0  38.0  ...    0.0    0.0   499.0  2878.0    0.0    0.0    0.0    0.0   
2  140.0  22.0  ...    0.0   19.0  1181.0  4171.0   63.0    0.0  393.0    0.0   
3   71.0  16.0  ...   12.0    0.0   150.0  1838.0    0.0    0.0   78.0    0.0   
4   33.0  16.0  ...    6.0    0.0   311.0  2704.0    6.0    0.0  200.0    0.0   

    V3740  V

# Define the parameters

In [5]:
# Tunable settings used by the pipelines in this notebook; adjust them to suit your dataset.

# How many principal components PCA keeps
n_components = 50
# How many trees the Random Forest grows
n_estimators = 100
# Cut-off passed to the feature selector; "median" retains about half of the features
threshold = "median"


# SVM with linear kernel

In [None]:
# Define the pipeline for SVM with linear kernel.
# - Features are standardized first: PCA is variance-driven, so without scaling
#   the components are dominated by the columns with the largest raw magnitudes.
# - Every stochastic step gets a fixed random_state (same seed as the
#   train/test split) so re-running the cell reproduces the same result.
# - Feature selection runs after PCA, so it selects among principal
#   components, not among the original columns.
pipeline_svm_lin = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Zero mean / unit variance per feature
    ('pca', PCA(n_components=n_components, random_state=42)),  # PCA for dimensionality reduction
    ('feature_selection', SelectFromModel(
        RandomForestClassifier(n_estimators=n_estimators, random_state=42),
        threshold=threshold)),  # Keep components whose importance reaches the threshold
    ('classifier', SVC(kernel='linear', probability=True, random_state=42))  # SVM with linear kernel
])

# Define the parameter grid for SVM with linear kernel
param_grid_SVMlin = {
    'classifier__C': [0.01, 0.05, 0.1, 1, 10, 100]
}

# Perform grid search for SVM with linear kernel (10-fold CV, scored on accuracy)
grid_search_svm_lin = GridSearchCV(pipeline_svm_lin, param_grid_SVMlin, cv=10, scoring='accuracy', verbose=2)
grid_search_svm_lin.fit(X_train, y_train)

# Print the best parameters
print("Best parameters for SVM with linear kernel: ", grid_search_svm_lin.best_params_)

# Get the best model (refit on the full training set by GridSearchCV)
best_model_svm_lin = grid_search_svm_lin.best_estimator_

# Predict the test set
y_pred_svm_lin = best_model_svm_lin.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_svm_lin))

# Calculate and print the accuracy
accuracy_svm_lin = accuracy_score(y_test, y_pred_svm_lin)
print("Accuracy for SVM with linear kernel:", accuracy_svm_lin)


# Logistic Regression (L1/LASSO and L2/Ridge penalties)

In [6]:
# Define the pipeline for Logistic Regression.
# - Features are standardized first: PCA is variance-driven, so without scaling
#   the components are dominated by the columns with the largest raw magnitudes.
# - Every stochastic step gets a fixed random_state (same seed as the
#   train/test split) so re-running the cell reproduces the same result.
# - Feature selection runs after PCA, so it selects among principal
#   components, not among the original columns.
pipeline_logreg = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Zero mean / unit variance per feature
    ('pca', PCA(n_components=n_components, random_state=42)),  # PCA for dimensionality reduction
    ('feature_selection', SelectFromModel(
        RandomForestClassifier(n_estimators=n_estimators, random_state=42),
        threshold=threshold)),  # Keep components whose importance reaches the threshold
    ('classifier', LogisticRegression(solver='liblinear', max_iter=10000, random_state=42))  # Logistic Regression
])

# Define the parameter grid for Logistic Regression
param_grid_logistic = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'classifier__penalty': ['l1', 'l2']  # L1_Lasso, L2_Ridge
}

# Perform grid search for Logistic Regression (10-fold CV, scored on accuracy)
grid_search_logreg = GridSearchCV(pipeline_logreg, param_grid_logistic, cv=10, scoring='accuracy', verbose=2)
grid_search_logreg.fit(X_train, y_train)

# Print the best parameters
print("Best parameters for Logistic Regression: ", grid_search_logreg.best_params_)

# Get the best model (refit on the full training set by GridSearchCV)
best_model_logreg = grid_search_logreg.best_estimator_

# Predict the test set
y_pred_logreg = best_model_logreg.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_logreg))

# Calculate and print the accuracy
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Accuracy for Logistic Regression:", accuracy_logreg)


Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.5s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.4s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.1s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.1s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.2s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.8s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.0s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.2s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.6s
[CV] END ........classifier__C=0.001, classifier__penalty=l1; total time=   1.2s
[CV] END ........classifier__C=0.001, classifier__penalty=l2; total time=   1.3s
[CV] END ........classifier__C=0.001, classifi

# SVM with RBF kernel

In [None]:
# Define the pipeline for SVM with RBF kernel.
# - Features are standardized first: PCA is variance-driven, so without scaling
#   the components are dominated by the columns with the largest raw magnitudes.
# - Every stochastic step gets a fixed random_state (same seed as the
#   train/test split) so re-running the cell reproduces the same result.
# - Feature selection runs after PCA, so it selects among principal
#   components, not among the original columns.
pipeline_svm_rbf = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Zero mean / unit variance per feature
    ('pca', PCA(n_components=n_components, random_state=42)),  # PCA for dimensionality reduction
    ('feature_selection', SelectFromModel(
        RandomForestClassifier(n_estimators=n_estimators, random_state=42),
        threshold=threshold)),  # Keep components whose importance reaches the threshold
    ('classifier', SVC(kernel='rbf', probability=True, random_state=42))  # SVM with RBF kernel
])

# Define the parameter grid for SVM with RBF kernel
param_grid_SVMRBF = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10, 100]
}

# Perform grid search for SVM with RBF kernel (10-fold CV, scored on accuracy)
grid_search_svm_rbf = GridSearchCV(pipeline_svm_rbf, param_grid_SVMRBF, cv=10, scoring='accuracy', verbose=2)
grid_search_svm_rbf.fit(X_train, y_train)

# Print the best parameters
print("Best parameters for SVM with RBF kernel: ", grid_search_svm_rbf.best_params_)

# Get the best model (refit on the full training set by GridSearchCV)
best_model_svm_rbf = grid_search_svm_rbf.best_estimator_

# Predict the test set
y_pred_svm_rbf = best_model_svm_rbf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_svm_rbf))

# Calculate and print the accuracy
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
print("Accuracy for SVM with RBF kernel:", accuracy_svm_rbf)


# SVM with Polynomial kernel

In [None]:
# Define the pipeline for SVM with Polynomial kernel.
# - Features are standardized first: PCA is variance-driven, so without scaling
#   the components are dominated by the columns with the largest raw magnitudes.
# - Every stochastic step gets a fixed random_state (same seed as the
#   train/test split) so re-running the cell reproduces the same result.
# - Feature selection runs after PCA, so it selects among principal
#   components, not among the original columns.
pipeline_svm_poly = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Zero mean / unit variance per feature
    ('pca', PCA(n_components=n_components, random_state=42)),  # PCA for dimensionality reduction
    ('feature_selection', SelectFromModel(
        RandomForestClassifier(n_estimators=n_estimators, random_state=42),
        threshold=threshold)),  # Keep components whose importance reaches the threshold
    ('classifier', SVC(kernel='poly', probability=True, random_state=42))  # SVM with Polynomial kernel
])

# Define the parameter grid for SVM with Polynomial kernel
# NOTE: 4 * 4 * 6 * 4 = 384 candidates, i.e. 3840 fits with cv=10.
param_grid_Poly = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__degree': [2, 3, 4, 5],
    'classifier__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10],
    'classifier__coef0': [0.0, 0.1, 0.5, 1.0]  # Independent term in kernel function
}

# Perform grid search for SVM with Polynomial kernel (10-fold CV, scored on accuracy)
grid_search_svm_poly = GridSearchCV(pipeline_svm_poly, param_grid_Poly, cv=10, scoring='accuracy', verbose=2)
grid_search_svm_poly.fit(X_train, y_train)

# Print the best parameters
print("Best parameters for SVM with Polynomial kernel: ", grid_search_svm_poly.best_params_)

# Get the best model (refit on the full training set by GridSearchCV)
best_model_svm_poly = grid_search_svm_poly.best_estimator_

# Predict the test set
y_pred_svm_poly = best_model_svm_poly.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_svm_poly))

# Calculate and print the accuracy
accuracy_svm_poly = accuracy_score(y_test, y_pred_svm_poly)
print("Accuracy for SVM with Polynomial kernel:", accuracy_svm_poly)
