# Step 1: Data Collection and Exploration


In [None]:
# Import necessary libraries
import pandas as pd

# Load the dataset from a CSV file. The path is relative, so it is resolved
# against the kernel's current working directory.
file_path = 'malaria_data.csv'
df = pd.read_csv(file_path)

# Preview the first rows (DataFrame.head() returns 5 rows by default); as the
# last expression in the cell, the result is rendered as the cell output.
df.head()

# Step 2: Data Preprocessing


In [None]:
# Count the null entries in every column and report them
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

# NOTE(review): nothing below imputes or drops rows, so the rest of the
# notebook relies on the counts printed above all being zero -- confirm
# against the actual output before proceeding.

# No scaling/normalization is applied either: the features are assumed to
# already be in a suitable range.

# Preview the dataset (unchanged by this cell)
df.head()

# Step 3: Feature Selection (Correlation Analysis)


In [None]:
# Calculate the pairwise (Pearson, the pandas default) correlation matrix.
#
# Only numeric and boolean columns are passed to corr(). Since pandas 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises if the frame
# contains a column that cannot be converted to float (e.g. text labels).
# Selecting the columns explicitly keeps this cell working on any pandas
# version, and gives the same result as before when every column is numeric.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()

# Display the correlation matrix as the cell output
correlation_matrix

# Step 4: Train-Test Split


In [None]:
from sklearn.model_selection import train_test_split

# Target column is 'Malaria_Presence'; every other column is used as a feature
y = df['Malaria_Presence']
X = df.drop(columns='Malaria_Presence')

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split is reproducible across runs.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# two subsets are not guaranteed to match those of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the four resulting pieces as the cell output
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Step 5: Model Training


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: Logistic Regression with library-default hyperparameters
# (only random_state is set). fit() returns the estimator itself, so
# construction and training are chained into a single expression.
logreg_classifier = LogisticRegression(random_state=42).fit(X_train, y_train)

# Step 6: Model Evaluation


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Score the baseline model on the held-out test set.
# y_pred holds hard class labels; the second predict_proba column holds the
# estimated probability of the class listed at classes_[1].
y_pred = logreg_classifier.predict(X_test)
second_class_proba = logreg_classifier.predict_proba(X_test)[:, 1]

# Label-based metrics are computed from y_pred ...
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# ... while ROC AUC is computed from the probability scores
roc_auc = roc_auc_score(y_test, second_class_proba)

# Report everything, scalar metrics rounded to four decimals
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)

# Step 7: Fine-Tuning and Optimization


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C (inverse regularization strength), spanning six
# orders of magnitude
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Base estimator; GridSearchCV clones it for every candidate/fold combination
logreg_classifier_optimized = LogisticRegression(random_state=42)

# Exhaustive search over param_grid using 5-fold cross-validation on the
# training set, selecting the candidate with the best mean accuracy
grid_search = GridSearchCV(logreg_classifier_optimized,
                           param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# With refit=True (the GridSearchCV default), the search has already retrained
# a clone of the base estimator with the best parameters on the whole training
# set and exposed it as best_estimator_. Reuse that model instead of
# constructing and fitting an identical one a second time.
logreg_classifier_final = grid_search.best_estimator_

# Step 8: Interpretation of Results


In [None]:
# coef_ is 2-D; row 0 is the single coefficient vector used here, with one
# entry per training column, in the same order as X_train.columns.
coefficients = logreg_classifier_final.coef_[0]
feature_names = X_train.columns

# Pair each feature with its coefficient and order from the largest to the
# smallest coefficient value
coefficients_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .sort_values(by='Coefficient', ascending=False)
)

# Show the sorted table as the cell output
coefficients_df

# Visualization: Bar plot of Coefficients


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the coefficients, one bar per feature, in the row
# order of coefficients_df. barplot draws on the current figure's axes and
# returns them, so the labels and legend are set on that Axes object directly.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=coefficients_df,
    x='Coefficient',
    y='Feature',
    hue='Feature',
    palette='viridis',
    dodge=False,
)
ax.set_title('Logistic Regression Coefficients - Impact on Disease Presence')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature')
ax.legend(title=None)  # (re)build the legend without a title
plt.show()