# Solution 1: Creatinine Prediction
## Objective: Predict serum creatinine levels using selected features

### Features Used:
- Age
- Albumin (urine)
- RBC (Red Blood Cells - nominal)
- Pus Cell (nominal)
- Bacteria (nominal)
- Urine pH

### Models:
1. Perceptron (baseline; implemented as a single-unit, linear-activation `MLPRegressor`)
2. XGBoost (ensemble)
3. CatBoost (ensemble)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

## 1. Data Loading and Initial Exploration

In [None]:
# Load the Chronic Kidney Disease dataset (ARFF format) from the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00336/chronic_kidney_disease.arff"

# Since it's an ARFF file, we'll parse it with scipy's ARFF reader
from scipy.io import arff
import io
import requests

# Download the file. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() reports an HTTP failure (e.g. 404)
# here rather than as a confusing ARFF parse error on an HTML error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# loadarff matches each line against str regular expressions, so it must be
# given a text stream; a bytes stream fails with a TypeError.
data, meta = arff.loadarff(io.StringIO(response.content.decode('utf-8')))
df = pd.DataFrame(data)

# Nominal attributes arrive as byte strings; decode them to regular strings
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].str.decode('utf-8')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head()

In [None]:
# Structural overview of the raw dataframe: dtypes and non-null counts first,
# followed by the plain list of column names
print("Dataset Info:")
df.info()
print("\nColumn Names:", df.columns.tolist(), sep="\n")

In [None]:
# Display statistical summary
# describe() is deliberately the last bare expression of the cell so that the
# notebook displays the resulting summary table as the cell output.
print("Statistical Summary:")
df.describe()

## 2. Feature Selection and Data Preprocessing

In [None]:
# Restrict the dataset to the columns named in the specification.
#   Predictors: age, al (urine albumin), rbc, pc (pus cell), ba (bacteria), ph
#   Response:   sc (serum creatinine)
# NOTE(review): confirm every name below exists in df.columns (see the column
# list printed earlier) - 'ph' in particular should be verified against the
# dataset's attribute list; a missing name raises KeyError on the selection.
selected_features = ['age', 'al', 'rbc', 'pc', 'ba', 'ph']
target = 'sc'

# Independent copy, so later cleaning never modifies the raw dataframe
df_work = df[[*selected_features, target]].copy()

# Count the gaps once and reuse the result for both reports
missing_counts = df_work.isnull().sum()

print("Working Dataset Shape:", df_work.shape)
print("\nMissing Values:")
print(missing_counts)
print("\nMissing Values Percentage:")
print(missing_counts / len(df_work) * 100)

In [None]:
# List the distinct values of every text (object-dtype) column so placeholder
# tokens standing in for missing data can be spotted by eye
print("Checking for '?' or '\\t?' values:")
text_columns = [name for name in df_work.columns if df_work[name].dtype == object]
for name in text_columns:
    print(f"\n{name}: {df_work[name].unique()}")

In [None]:
# Replace missing value indicators with NaN.
# Text cells are stripped of surrounding whitespace first, so that ANY padded
# variant of the '?' placeholder is caught (not just a hand-written list of
# variants) and padded category labels collapse onto their clean spelling
# instead of surviving as separate categories.
for col in df_work.columns:
    if df_work[col].dtype == object:
        # Non-string cells (e.g. NaN) are passed through untouched
        df_work[col] = df_work[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

# '' covers cells that contained nothing but whitespace
df_work = df_work.replace(['?', ''], np.nan)

# Convert numeric columns to appropriate types; anything unparseable becomes NaN
numeric_cols = ['age', 'al', 'ph', 'sc']
for col in numeric_cols:
    df_work[col] = pd.to_numeric(df_work[col], errors='coerce')

print("Missing Values After Conversion:")
print(df_work.isnull().sum())
print("\nData Types:")
print(df_work.dtypes)

In [None]:
# Handle missing values
#   - rows with a missing target: dropped
#   - numeric features: imputed with the median
#   - categorical features: imputed with the mode

from sklearn.impute import SimpleImputer

# A row without a measured target cannot be used to train or score a
# regressor; filling it with the median would invent labels and distort every
# reported metric. Drop those rows instead. The original index is kept so rows
# can still be matched back to the raw dataframe.
rows_before = len(df_work)
df_work = df_work.dropna(subset=[target]).copy()
print(f"Dropped {rows_before - len(df_work)} rows with missing target '{target}'")

# Numeric imputation (features only - the target is never imputed)
numeric_feature_cols = [col for col in numeric_cols if col != target]
numeric_imputer = SimpleImputer(strategy='median')
df_work[numeric_feature_cols] = numeric_imputer.fit_transform(df_work[numeric_feature_cols])

# Categorical imputation
categorical_cols = ['rbc', 'pc', 'ba']
categorical_imputer = SimpleImputer(strategy='most_frequent')
df_work[categorical_cols] = categorical_imputer.fit_transform(df_work[categorical_cols])

# NOTE: both imputers are fitted on all remaining rows, i.e. before the
# train/test split done later in the notebook, so the fill values are computed
# partly from rows that end up in the test set.

print("Missing Values After Imputation:")
print(df_work.isnull().sum())

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Three views of the target (serum creatinine) side by side:
# histogram, box plot and normal Q-Q plot
from scipy import stats

sc_values = df_work['sc']
fig_sc, (ax_hist, ax_box, ax_qq) = plt.subplots(1, 3, figsize=(12, 4))

# Histogram
ax_hist.hist(sc_values, bins=30, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Serum Creatinine (mg/dL)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Serum Creatinine')

# Box plot
ax_box.boxplot(sc_values)
ax_box.set_ylabel('Serum Creatinine (mg/dL)')
ax_box.set_title('Box Plot of Serum Creatinine')

# Q-Q plot against the normal distribution; probplot writes its own default
# title, which is replaced right after
stats.probplot(sc_values, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot of Serum Creatinine')

plt.tight_layout()
plt.show()

print("Target Variable Statistics:")
print(sc_values.describe())

In [None]:
# 2x2 overview: one histogram per numeric feature, plus a grouped bar chart of
# the category counts of the nominal features in the last panel
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (panel, column, bar colour, x-axis label, title)
histogram_specs = [
    (axes[0, 0], 'age', 'skyblue', 'Age (years)', 'Distribution of Age'),
    (axes[0, 1], 'al', 'lightcoral', 'Albumin (urine)', 'Distribution of Albumin'),
    (axes[1, 0], 'ph', 'lightgreen', 'Urine pH', 'Distribution of Urine pH'),
]
for panel, column, colour, x_label, panel_title in histogram_specs:
    panel.hist(df_work[column], bins=20, edgecolor='black', alpha=0.7, color=colour)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)

# One column of counts per nominal feature, indexed by category label
categorical_counts = pd.DataFrame({
    display_name: df_work[column].value_counts()
    for display_name, column in [('RBC', 'rbc'), ('Pus Cell', 'pc'), ('Bacteria', 'ba')]
})
counts_panel = axes[1, 1]
categorical_counts.plot(kind='bar', ax=counts_panel, alpha=0.7)
counts_panel.set_ylabel('Count')
counts_panel.set_title('Distribution of Categorical Features')
counts_panel.legend(title='Features')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the numeric features and the target,
# drawn as an annotated heatmap
numeric_features = ['age', 'al', 'ph', 'sc']
correlation_matrix = df_work[numeric_features].corr()

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Numeric Features')
plt.show()

# Correlations with the target, strongest positive first
# (the list includes the target's correlation with itself)
target_correlations = correlation_matrix['sc'].sort_values(ascending=False)
print("\nCorrelation with Target (Serum Creatinine):")
print(target_correlations)

In [None]:
# One scatter panel per numeric feature, each plotted against the target
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column, marker colour, x-axis label, title)
scatter_specs = [
    ('age', 'blue', 'Age (years)', 'Age vs Serum Creatinine'),
    ('al', 'red', 'Albumin (urine)', 'Albumin vs Serum Creatinine'),
    ('ph', 'green', 'Urine pH', 'Urine pH vs Serum Creatinine'),
]
for panel, (column, colour, x_label, panel_title) in zip(axes, scatter_specs):
    panel.scatter(df_work[column], df_work['sc'], alpha=0.5, color=colour)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Serum Creatinine (mg/dL)')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Box plots: distribution of the target within each category of the nominal features
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column, display name)
boxplot_specs = [('rbc', 'RBC'), ('pc', 'Pus Cell'), ('ba', 'Bacteria')]
for panel, (column, display_name) in zip(axes, boxplot_specs):
    df_work.boxplot(column='sc', by=column, ax=panel)
    panel.set_xlabel(display_name)
    panel.set_ylabel('Serum Creatinine (mg/dL)')
    panel.set_title(f'{display_name} vs Serum Creatinine')

# DataFrame.boxplot(by=...) sets a figure-level "Boxplot grouped by <column>"
# heading on every call. On a shared three-panel figure only the last one
# survives and wrongly labels all panels, so clear it; each panel already
# carries its own title.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 4. Feature Engineering and Encoding

In [None]:
# Turn each nominal feature into integer codes. The fitted encoder for every
# column is kept in label_encoders so the code -> label mapping stays available.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df_work[f'{col}_encoded'] = encoder.fit_transform(df_work[col])
    label_encoders[col] = encoder

    # Show which integer each category was mapped to
    print(f"\n{col} encoding:")
    for code, category in enumerate(encoder.classes_):
        print(f"  {category} -> {code}")

# Final design matrix (numeric features + encoded nominal features) and target
feature_columns = ['age', 'al', 'ph', 'rbc_encoded', 'pc_encoded', 'ba_encoded']
X = df_work[feature_columns].copy()
y = df_work['sc'].copy()

print("\nFeature Matrix Shape:", X.shape)
print("Target Vector Shape:", y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for caption, subset in (("Training Set Size:", X_train), ("Test Set Size:", X_test)):
    print(caption, subset.shape)

In [None]:
# Standardise the features for the Perceptron baseline. The scaler learns its
# statistics from the training rows only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for caption, matrix in (
    ("Scaled Training Set Shape:", X_train_scaled),
    ("Scaled Test Set Shape:", X_test_scaled),
):
    print(caption, matrix.shape)

## 5. Model Training and Evaluation
### 5.1 Perceptron Model (Baseline)

In [None]:
# Baseline "Perceptron" regressor.
# The classic Perceptron targets classification, so the baseline is built from
# MLPRegressor configured as a single hidden unit with a linear (identity)
# activation, trained with SGD on the standardised features.
from sklearn.neural_network import MLPRegressor

perceptron_params = {
    'hidden_layer_sizes': (1,),   # one hidden unit
    'activation': 'identity',     # linear activation for regression
    'solver': 'sgd',              # stochastic gradient descent
    'max_iter': 1000,
    'random_state': 42,
    'early_stopping': True,       # stop once the internal validation score stalls
    'validation_fraction': 0.1,
}
perceptron_model = MLPRegressor(**perceptron_params)
perceptron_model.fit(X_train_scaled, y_train)

# Predictions on both splits
y_pred_perceptron_train = perceptron_model.predict(X_train_scaled)
y_pred_perceptron_test = perceptron_model.predict(X_test_scaled)

# Same four metrics for each split
print("Perceptron Model Performance:")
for split_name, actual, predicted in (
    ("Training Set", y_train, y_pred_perceptron_train),
    ("Test Set", y_test, y_pred_perceptron_test),
):
    mse = mean_squared_error(actual, predicted)
    print(f"\n{split_name}:")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {np.sqrt(mse):.4f}")
    print(f"  MAE: {mean_absolute_error(actual, predicted):.4f}")
    print(f"  R² Score: {r2_score(actual, predicted):.4f}")

### 5.2 XGBoost Model (Ensemble)

In [None]:
# Gradient-boosted trees (XGBoost) fitted on the unscaled features
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
    'n_jobs': -1,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predictions on both splits
y_pred_xgb_train = xgb_model.predict(X_train)
y_pred_xgb_test = xgb_model.predict(X_test)

# Same four metrics for each split
print("XGBoost Model Performance:")
for split_name, actual, predicted in (
    ("Training Set", y_train, y_pred_xgb_train),
    ("Test Set", y_test, y_pred_xgb_test),
):
    mse = mean_squared_error(actual, predicted)
    print(f"\n{split_name}:")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {np.sqrt(mse):.4f}")
    print(f"  MAE: {mean_absolute_error(actual, predicted):.4f}")
    print(f"  R² Score: {r2_score(actual, predicted):.4f}")

In [None]:
# Rank the input features by the importance scores of the fitted XGBoost model
feature_importance = pd.DataFrame(
    {'Feature': feature_columns, 'Importance': xgb_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

fig_importance, ax_importance = plt.subplots(figsize=(10, 6))
ax_importance.barh(feature_importance['Feature'], feature_importance['Importance'])
ax_importance.set_xlabel('Importance')
ax_importance.set_title('XGBoost Feature Importance')
ax_importance.invert_yaxis()  # most important feature at the top
plt.show()

print("\nFeature Importance:")
print(feature_importance)

### 5.3 CatBoost Model (Ensemble)

In [None]:
# Gradient-boosted trees (CatBoost) fitted on the unscaled features
catboost_params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 5,
    'random_state': 42,
    'verbose': False,   # no per-iteration training log
}
catboost_model = CatBoostRegressor(**catboost_params)
catboost_model.fit(X_train, y_train)

# Predictions on both splits
y_pred_cat_train = catboost_model.predict(X_train)
y_pred_cat_test = catboost_model.predict(X_test)

# Same four metrics for each split
print("CatBoost Model Performance:")
for split_name, actual, predicted in (
    ("Training Set", y_train, y_pred_cat_train),
    ("Test Set", y_test, y_pred_cat_test),
):
    mse = mean_squared_error(actual, predicted)
    print(f"\n{split_name}:")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {np.sqrt(mse):.4f}")
    print(f"  MAE: {mean_absolute_error(actual, predicted):.4f}")
    print(f"  R² Score: {r2_score(actual, predicted):.4f}")

## 6. Model Comparison and Selection

In [None]:
# Collect train/test RMSE and R² of the three models in one table and chart them
# (model name, training-set predictions, test-set predictions)
model_outputs = [
    ('Perceptron', y_pred_perceptron_train, y_pred_perceptron_test),
    ('XGBoost', y_pred_xgb_train, y_pred_xgb_test),
    ('CatBoost', y_pred_cat_train, y_pred_cat_test),
]

results = pd.DataFrame({
    'Model': [name for name, fitted, held_out in model_outputs],
    'Train_RMSE': [
        np.sqrt(mean_squared_error(y_train, fitted))
        for name, fitted, held_out in model_outputs
    ],
    'Test_RMSE': [
        np.sqrt(mean_squared_error(y_test, held_out))
        for name, fitted, held_out in model_outputs
    ],
    'Train_R2': [r2_score(y_train, fitted) for name, fitted, held_out in model_outputs],
    'Test_R2': [r2_score(y_test, held_out) for name, fitted, held_out in model_outputs],
})

print("Model Comparison:")
print(results)

# Visualize comparison: one grouped bar chart per metric
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (result columns, y-axis label, title, legend entries)
chart_specs = [
    (['Train_RMSE', 'Test_RMSE'], 'RMSE', 'Model Comparison - RMSE',
     ['Train RMSE', 'Test RMSE']),
    (['Train_R2', 'Test_R2'], 'R² Score', 'Model Comparison - R² Score',
     ['Train R²', 'Test R²']),
]
for panel, (metric_columns, y_label, panel_title, legend_labels) in zip(axes, chart_specs):
    results.plot(x='Model', y=metric_columns, kind='bar', ax=panel, alpha=0.7)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend(legend_labels)
    panel.set_xticklabels(results['Model'], rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Predicted vs actual creatinine on the test set, one panel per model.
# The dashed red diagonal marks perfect agreement.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

test_predictions = [
    ('Perceptron', y_pred_perceptron_test),
    ('XGBoost', y_pred_xgb_test),
    ('CatBoost', y_pred_cat_test),
]
for panel, (model_name, predicted) in zip(axes, test_predictions):
    panel.scatter(y_test, predicted, alpha=0.5)
    panel.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    panel.set_xlabel('Actual Creatinine')
    panel.set_ylabel('Predicted Creatinine')
    panel.set_title(f'{model_name}\n(R² = {r2_score(y_test, predicted):.4f})')

plt.tight_layout()
plt.show()

## 7. Save Best Model and Predictions for Solution 2

In [None]:
# Pick the winner: the row of the comparison table with the highest test R².
# NOTE: the choice is made on the same test split whose scores are reported.
best_model_idx = results['Test_R2'].idxmax()
best_model_name = results.loc[best_model_idx, 'Model']

print(f"Best Model: {best_model_name}")
print(f"Test R² Score: {results.loc[best_model_idx, 'Test_R2']:.4f}")
print(f"Test RMSE: {results.loc[best_model_idx, 'Test_RMSE']:.4f}")

# Look up the winner's test predictions and fitted estimator by name;
# any name other than the two listed resolves to CatBoost
named_candidates = {
    'Perceptron': (y_pred_perceptron_test, perceptron_model),
    'XGBoost': (y_pred_xgb_test, xgb_model),
}
best_predictions, best_model = named_candidates.get(
    best_model_name, (y_pred_cat_test, catboost_model)
)

In [None]:
# Bundle everything Solution 2 needs (test data, predictions, the winning model
# and the fitted preprocessing objects) and write it to one pickle file
import pickle

solution2_data = dict(
    X_test=X_test,
    y_test_actual=y_test,
    y_test_predicted=best_predictions,
    best_model_name=best_model_name,
    best_model=best_model,
    scaler=scaler,
    label_encoders=label_encoders,
    df_original=df,
    test_indices=X_test.index,   # row labels of the test rows
)

# Save to pickle file
with open('solution1_output.pkl', 'wb') as output_file:
    pickle.dump(solution2_data, output_file)

print("Data saved successfully for Solution 2!")
print("\nPredicted Creatinine values (first 10):")
print(best_predictions[:10])

## 8. Summary and Conclusion

In [None]:
# Final report: dataset facts, the model comparison table and the chosen model.
# The lines are assembled first and emitted with a single print call.
rule = "=" * 70
summary_lines = [
    rule,
    "SOLUTION 1: CREATININE PREDICTION - SUMMARY",
    rule,
    f"\nDataset Size: {len(df)} samples",
    f"Features Used: {len(feature_columns)} features",
    "  - Numeric: Age, Albumin, Urine pH",
    "  - Categorical (encoded): RBC, Pus Cell, Bacteria",
    "\nTarget Variable: Serum Creatinine (mg/dL)",
    "\nTrain/Test Split: 80/20",
    f"Training Samples: {len(X_train)}",
    f"Test Samples: {len(X_test)}",
    "\n" + rule,
    "MODEL PERFORMANCE SUMMARY",
    rule,
    results.to_string(index=False),
    "\n" + rule,
    f"BEST MODEL: {best_model_name}",
    rule,
    f"Test R² Score: {results.loc[best_model_idx, 'Test_R2']:.4f}",
    f"Test RMSE: {results.loc[best_model_idx, 'Test_RMSE']:.4f}",
    "\nPredictions saved for Solution 2 (CKD Classification)",
    rule,
]
print("\n".join(summary_lines))