In [None]:
# To upload our datasets from our working directory we need to mount our drive contents to the colab environment.
# For the code to do so you can search “mount” in code snippets or use the code given below.
# Our entire drive contents are now mounted on colab at the location “/gdrive”.

from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
trainfile = r'/gdrive/My Drive/Hepatitis-Train.csv'
trainData = pd.read_csv(trainfile)  #creates a dataframe
testfile = r'/gdrive/My Drive/Hepatitis-Test.csv'
testData = pd.read_csv(testfile)  #creates a dataframe
print(trainData.shape)
print(testData.shape)

(134, 20)
(11, 20)


In [None]:
trainData.head()
#print("=======")
testData.head()

Unnamed: 0,Age,Sex,Steroid,Antivirals,Fatigue,Malaise,Anorexia,Liver Big,Liver Firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,ALK Phosphate,SGOT,Albumin,PROTIME,Histology,TARGET
0,54,Male,no,yes,no,no,yes,yes,yes,no,yes,no,yes,3.9,120,28,3.5,43.0,yes,1
1,49,Male,no,yes,no,no,yes,yes,yes,no,no,yes,yes,1.4,85,70,3.5,35.0,yes,1
2,45,Male,yes,yes,no,no,no,yes,yes,yes,no,no,yes,1.9,104,114,2.4,62.16,yes,1
3,41,Male,yes,yes,no,yes,yes,yes,no,no,no,yes,no,4.2,65,120,3.4,62.16,yes,1
4,46,Male,yes,yes,no,no,no,yes,yes,yes,no,no,no,7.6,104,242,3.3,50.0,yes,1


In [None]:
#Extract Target Column before doing missing value substitutions and one-hot encoding======
Target_Train_Cols = trainData["TARGET"]#make copy of target column
trainData = trainData.drop(["TARGET"], axis=1) #extracting training data without the target column

print(trainData.shape)

(134, 19)


In [None]:
Target_Test_Cols = testData["TARGET"]#make copy of target column
testData = testData.drop(["TARGET"], axis=1) #extracting training data without the target column

print(testData.shape)

(11, 19)


In [None]:
# Drop rows with missing values
#trainData.dropna(inplace=True)
#testData.dropna(inplace=True)

In [None]:
# Drop rows with missing values
trainData.dropna(inplace=True)
testData.dropna(inplace=True)

# One-Hot Encoding for Categorical Variables
trainData_encoded = pd.get_dummies(trainData)
testData_encoded = pd.get_dummies(testData)

# Print the columns after one-hot encoding
print("Columns after One-Hot Encoding for Training Data:")
print(trainData_encoded.columns)

print("\nColumns after One-Hot Encoding for Testing Data:")
print(testData_encoded.columns)

# Align the columns in the training and testing datasets after one-hot encoding
common_columns = trainData_encoded.columns.intersection(testData_encoded.columns)

X_train = trainData_encoded[common_columns]
X_test = testData_encoded[common_columns]



Columns after One-Hot Encoding for Training Data:
Index(['Age', 'Bilirubin', 'ALK Phosphate', 'SGOT', 'Albumin', 'PROTIME', 'Sex_Female', 'Sex_Male', 'Steroid_no', 'Steroid_yes', 'Antivirals_no', 'Antivirals_yes', 'Fatigue_no', 'Fatigue_yes', 'Malaise_no', 'Malaise_yes', 'Anorexia_no', 'Anorexia_yes', 'Liver Big_no', 'Liver Big_yes', 'Liver Firm_no', 'Liver Firm_yes', 'Spleen Palpable_no', 'Spleen Palpable_yes', 'Spiders_no', 'Spiders_yes', 'Ascites_no', 'Ascites_yes', 'Varices_no', 'Varices_yes', 'Histology_no', 'Histology_yes'], dtype='object')

Columns after One-Hot Encoding for Testing Data:
Index(['Age', 'Bilirubin', 'ALK Phosphate', 'SGOT', 'Albumin', 'PROTIME', 'Sex_Male', 'Steroid_no', 'Steroid_yes', 'Antivirals_yes', 'Fatigue_no', 'Fatigue_yes', 'Malaise_no', 'Malaise_yes', 'Anorexia_no', 'Anorexia_yes', 'Liver Big_no', 'Liver Big_yes', 'Liver Firm_no', 'Liver Firm_yes', 'Spleen Palpable_no', 'Spleen Palpable_yes', 'Spiders_no', 'Spiders_yes', 'Ascites_no', 'Ascites_yes', 'Var

In [None]:
# Split the data into features and target
#X_train = trainData
#y_train = Target_Train_Cols
#X_test = testData
#y_test = Target_Test_Cols

In [None]:
# LinearSVC
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
linear_svc_predictions = linear_svc.predict(X_test)



In [None]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
decision_tree_predictions = decision_tree.predict(X_test)

In [None]:
# Random Forest
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_forest_predictions = random_forest.predict(X_test)

In [None]:
# K-Nearest Neighbor
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)

In [None]:
# (2) Record average accuracy, precision, recall, and F1 score on the test set for each classifier

classifiers = [linear_svc, decision_tree, random_forest, knn]
classifier_names = ['LinearSVC', 'Decision Tree', 'Random Forest', 'K-Nearest Neighbor']

results = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

for i, classifier in enumerate(classifiers):
    predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    results = results.append({
        'Classifier': classifier_names[i],
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }, ignore_index=True)

# Display results
print(results)

# Save the results to a CSV file in your Google Drive
results.to_csv('/gdrive/MyDrive/classification_results.csv', index=False)


  results = results.append({
  results = results.append({
  results = results.append({


           Classifier  Accuracy  Precision  Recall  F1 Score
0           LinearSVC  0.818182   1.000000     0.6  0.750000
1       Decision Tree  0.909091   0.833333     1.0  0.909091
2       Random Forest  0.727273   0.666667     0.8  0.727273
3  K-Nearest Neighbor  0.454545   0.454545     1.0  0.625000


  results = results.append({


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define the Random Forest classifier
random_forest = RandomForestClassifier()

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=random_forest,
    param_distributions=param_grid,
    n_iter=10,  # Number of parameter settings that are sampled
    scoring='accuracy',  # You can choose other metrics based on your problem
    cv=5,  # Number of folds for cross-validation
    random_state=42
)

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Use the best estimator for predictions
best_random_forest = random_search.best_estimator_
random_forest_predictions = best_random_forest.predict(X_test)

# Evaluate the performance of the best model
accuracy = accuracy_score(y_test, random_forest_predictions)
precision = precision_score(y_test, random_forest_predictions)
recall = recall_score(y_test, random_forest_predictions)
f1 = f1_score(y_test, random_forest_predictions)

# Display the evaluation metrics
print("\nEvaluation Metrics for Best Random Forest Model:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 20}

Evaluation Metrics for Best Random Forest Model:
Accuracy: 0.7272727272727273
Precision: 0.625
Recall: 1.0
F1 Score: 0.7692307692307693


In [None]:
# Retrieve the best hyperparameters found
best_params = random_search.best_params_

# Describe the best parameters
print("Best Hyperparameters Found:")
print("Number of Trees (n_estimators):", best_params['n_estimators'])
print("Maximum Depth of Tree (max_depth):", best_params['max_depth'])
print("Minimum Samples Split (min_samples_split):", best_params['min_samples_split'])
print("Minimum Samples Leaf (min_samples_leaf):", best_params['min_samples_leaf'])

# Construct a Random Forest classifier with the best parameters
best_random_forest = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf']
)

# Fit the best model to the training data
best_random_forest.fit(X_train, y_train)

# Make predictions on the test set
best_random_forest_predictions = best_random_forest.predict(X_test)

# Evaluate the performance of the best model
accuracy = accuracy_score(y_test, best_random_forest_predictions)
precision = precision_score(y_test, best_random_forest_predictions)
recall = recall_score(y_test, best_random_forest_predictions)
f1 = f1_score(y_test, best_random_forest_predictions)

# Display the evaluation metrics
print("\nEvaluation Metrics for Best Random Forest Model:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Best Hyperparameters Found:
Number of Trees (n_estimators): 200
Maximum Depth of Tree (max_depth): 20
Minimum Samples Split (min_samples_split): 5
Minimum Samples Leaf (min_samples_leaf): 4

Evaluation Metrics for Best Random Forest Model:
Accuracy: 0.5454545454545454
Precision: 0.5
Recall: 0.8
F1 Score: 0.6153846153846154


In [None]:
# Record the evaluation metrics for the hyperparameter-tuned Random Forest classifier
tuned_random_forest_metrics = {
    'Classifier': 'Tuned Random Forest',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
}

# Append the metrics to the existing results DataFrame
results = results.append(tuned_random_forest_metrics, ignore_index=True)

# Display the updated results
print(results)

# Save the updated results to a CSV file in your Google Drive
results.to_csv('/gdrive/MyDrive/classification_results1.csv', index=False)


            Classifier  Accuracy  Precision  Recall  F1 Score
0            LinearSVC  0.818182   1.000000     0.6  0.750000
1        Decision Tree  0.909091   0.833333     1.0  0.909091
2        Random Forest  0.727273   0.666667     0.8  0.727273
3   K-Nearest Neighbor  0.454545   0.454545     1.0  0.625000
4  Tuned Random Forest  0.545455   0.500000     0.8  0.615385


  results = results.append(tuned_random_forest_metrics, ignore_index=True)


In [None]:
# Print the results DataFrame
print(results)

# Identify the row index for the Random Forest results
rf_default_index = results[results['Classifier'] == 'Random Forest'].index[0]
rf_tuned_index = results[results['Classifier'] == 'Tuned Random Forest'].index[0]

# Extract metrics for the Random Forest (default and tuned)
rf_default_metrics = results.loc[rf_default_index, ['Accuracy', 'Precision', 'Recall', 'F1 Score']]
rf_tuned_metrics = results.loc[rf_tuned_index, ['Accuracy', 'Precision', 'Recall', 'F1 Score']]

# Print the comparison
print("\nComparison between Default and Tuned Random Forest:")
print("Default Metrics:", rf_default_metrics.values)
print("Tuned Metrics:", rf_tuned_metrics.values)

            Classifier  Accuracy  Precision  Recall  F1 Score
0            LinearSVC  0.818182   1.000000     0.6  0.750000
1        Decision Tree  0.909091   0.833333     1.0  0.909091
2        Random Forest  0.727273   0.666667     0.8  0.727273
3   K-Nearest Neighbor  0.454545   0.454545     1.0  0.625000
4  Tuned Random Forest  0.545455   0.500000     0.8  0.615385

Comparison between Default and Tuned Random Forest:
Default Metrics: [0.7272727272727273 0.6666666666666666 0.8 0.7272727272727272]
Tuned Metrics: [0.5454545454545454 0.5 0.8 0.6153846153846154]


In [None]:
# Assuming you have the 'best_random_forest' model trained on your data

# Get feature importances from the trained Random Forest model
feature_importances = best_random_forest.feature_importances_

# Create a DataFrame to associate features with their importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the top 5 features
top_features = feature_importance_df.head(5)
print("Top 5 Features:")
print(top_features)


Top 5 Features:
        Feature  Importance
4       Albumin    0.168479
1     Bilirubin    0.121402
5       PROTIME    0.089891
25  Ascites_yes    0.061158
27  Varices_yes    0.060003


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming you have trained classifiers and corresponding test sets
classifiers = [linear_svc, decision_tree, random_forest, knn]
X_test_classifiers = [X_test, X_test, X_test, X_test]

# Fit the RandomForestClassifier before making predictions
random_forest.fit(X_train, y_train)

# Get predictions from each classifier
predictions = [classifier.predict(X_test_classifier) for classifier, X_test_classifier in zip(classifiers, X_test_classifiers)]

# Create a DataFrame to store individual classifier predictions
ensemble_predictions_df = pd.DataFrame({
    'LinearSVC': predictions[0],
    'DecisionTree': predictions[1],
    'RandomForest': predictions[2],
    'KNN': predictions[3]
})

# Initialize the MLP classifier for stacking
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# Fit the MLP classifier on the individual classifier predictions
mlp_classifier.fit(ensemble_predictions_df, y_test)

# Make predictions using the ensemble model
ensemble_predictions = mlp_classifier.predict(ensemble_predictions_df)

# Evaluate the performance of the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
ensemble_precision = precision_score(y_test, ensemble_predictions)
ensemble_recall = recall_score(y_test, ensemble_predictions)
ensemble_f1 = f1_score(y_test, ensemble_predictions)

# Display the evaluation metrics for the ensemble model
print("\nEnsemble Model Evaluation Metrics:")
print("Accuracy:", ensemble_accuracy)
print("Precision:", ensemble_precision)
print("Recall:", ensemble_recall)
print("F1 Score:", ensemble_f1)




Ensemble Model Evaluation Metrics:
Accuracy: 0.9090909090909091
Precision: 0.8333333333333334
Recall: 1.0
F1 Score: 0.9090909090909091
