# In [None]:  (Jupyter cell marker left over from the notebook export)
# Livestock Disease Prognosis - cleaned Python script with SHAP
# Consolidated from a Jupyter notebook (repeated cells removed).
# Run this in a Python environment with the required libraries:
# pandas, numpy, scikit-learn, boruta, catboost, xgboost, lime, shap
# Assumes 'Data.csv' is in the current working directory.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from boruta import BorutaPy
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import numpy as np
from lime.lime_tabular import LimeTabularExplainer
import shap  # Added for SHAP

# =============================================================================
# Data Loading & Preprocessing
# =============================================================================
# Read the raw file by hand: every double quote is removed before splitting on
# commas, so each cell ends up as a plain string in the DataFrame.
with open("Data.csv", "r") as f:
    lines = f.readlines()
lines = [line.strip().replace('"', '') for line in lines]
# Drop blank lines; otherwise they turn into rows padded with None, which
# breaks the label encoding further down.
lines = [line for line in lines if line]
data = [line.split(',') for line in lines]

# First row is the header, the remaining rows are the records.
df = pd.DataFrame(data[1:], columns=data[0])
df.columns = df.columns.str.strip()

print(df.columns.tolist())

# Separate predictors from the target and encode the disease names as integers.
X = df.drop(columns="Disease")
y = df["Disease"]
le = LabelEncoder()
y_encoded = le.fit_transform(y)

numeric_features = ['Age', 'Temperature']
categorical_features = ['Animal']
# Every remaining predictor column is treated as a binary symptom flag.
binary_features = [col for col in X.columns if col not in numeric_features + categorical_features]

# sparse_threshold=0 forces a dense array: the result is wrapped in a
# DataFrame with column labels below, which a SciPy sparse matrix cannot be.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(), categorical_features),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

baseline_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("rf", RandomForestClassifier(random_state=42))
])

# NOTE(review): the preprocessor and baseline forest are fitted on ALL rows,
# including rows that later land in the test split.
baseline_pipeline.fit(X, y_encoded)

# Rebuild the output column names. ColumnTransformer emits its transformers in
# the order listed, with the passthrough columns appended on the right.
fitted_preprocessor = baseline_pipeline.named_steps['preprocess']
onehot_cols = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numeric_features + list(onehot_cols) + binary_features

# Rank features by the baseline forest's impurity-based importances.
importances = baseline_pipeline.named_steps['rf'].feature_importances_

feat_imp = pd.Series(importances, index=all_features).sort_values(ascending=False)
top_20_features = feat_imp.head(20).index.tolist()
print("Top 20 features:", top_20_features)

# Materialise the preprocessed matrix as a labelled DataFrame for later steps.
X_preprocessed = fitted_preprocessor.transform(X)
X_pre_df = pd.DataFrame(X_preprocessed, columns=all_features)
X_top20 = X_pre_df[top_20_features]
# Anything that cannot be parsed as a number becomes NaN and is then set to 0.
X_top20_cleaned = X_top20.apply(pd.to_numeric, errors='coerce').fillna(0)
print(X_top20_cleaned.dtypes)
print("Missing values:\n", X_top20_cleaned.isnull().sum())

# =============================================================================
# Boruta Feature Selection
# =============================================================================
# Coerce the whole preprocessed matrix to numbers once, so Boruta and the
# downstream models work on exactly the same values (unparseable cells -> 0).
X_numeric = X_pre_df.apply(pd.to_numeric, errors='coerce').fillna(0)

X_np = X_numeric.values
y_np = y_encoded

# Single Train/Test Split
# Done BEFORE feature selection so the held-out labels cannot influence which
# features are kept. The split depends only on the row count and random_state,
# so the train/test rows match a split performed after selection.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_numeric, y_encoded, test_size=0.2, random_state=42
)

rf_boruta = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5, random_state=42)
boruta_selector = BorutaPy(rf_boruta, n_estimators='auto', verbose=2, random_state=42)
# Fit the selector on the training rows only.
boruta_selector.fit(X_train_full.values, y_train)

# Prefer confirmed features; fall back to tentative ones rather than handing
# an empty matrix to the models.
selected_mask = boruta_selector.support_
if not selected_mask.any():
    selected_mask = selected_mask | boruta_selector.support_weak_
if not selected_mask.any():
    raise ValueError("Boruta did not select any features; cannot train models.")

boruta_features = X_numeric.columns[selected_mask].to_list()
print("Boruta selected features:", boruta_features)

# Restrict the full matrix and both splits to the selected columns.
X_final = X_numeric[boruta_features]
X_train = X_train_full[boruta_features]
X_test = X_test_full[boruta_features]

# Unique Animals
unique_animals = df['Animal'].unique()
print(unique_animals)

# =============================================================================
# Model Tuning & Evaluation
# =============================================================================
# Tuned RF: exhaustive grid search with 3-fold cross-validation.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)
rf_tuned = grid_rf.best_estimator_
print("Best RF parameters:", grid_rf.best_params_)

# Tuned XGBoost: 20 random draws from the grid below, 3-fold CV, scored by accuracy.
params_xgb = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 150],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}
search_xgb = RandomizedSearchCV(
    XGBClassifier(objective='multi:softmax', num_class=len(set(y_encoded)), use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    param_distributions=params_xgb,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1
)
search_xgb.fit(X_train, y_train)
xgb_tuned = search_xgb.best_estimator_
print("Best XGB Accuracy:", search_xgb.best_score_)
print("Best XGB Params:", search_xgb.best_params_)

# Tuned CatBoost: fixed hyper-parameters, no search.
cat_tuned = CatBoostClassifier(
    iterations=200,
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=3,
    verbose=0,
    random_state=42
)
cat_tuned.fit(X_train, y_train)

# Evaluate
models = {
    "Tuned Random Forest": rf_tuned,
    "Tuned XGBoost": xgb_tuned,
    "Tuned CatBoost": cat_tuned
}
# Report against the full set of encoded classes. Without `labels`,
# classification_report raises when the test split lacks a class, because
# target_names would no longer match the labels it finds.
class_labels = np.arange(len(le.classes_))
for name, model in models.items():
    # Flatten in case a model returns predictions as a column vector.
    y_pred = np.ravel(model.predict(X_test))
    acc = accuracy_score(y_test, y_pred)
    print(f"\n=== {name} ===\nAccuracy: {acc:.4f}")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=le.classes_,
        zero_division=0,
    ))

# =============================================================================
# SHAP Analysis (Added: Global Feature Importance on Tuned RF)
# =============================================================================
explainer = shap.TreeExplainer(rf_tuned)
shap_values = explainer.shap_values(X_train)

# Normalise the two layouts SHAP can return for a classifier:
#   * a list with one (n_samples, n_features) array per class, or
#   * a single ndarray, 2-D for one output or 3-D with classes on the last axis.
# Stacking the list form on a new last axis gives both the same layout.
if isinstance(shap_values, list):
    shap_array = np.stack([np.asarray(sv) for sv in shap_values], axis=-1)
else:
    shap_array = np.asarray(shap_values)

# Mean absolute SHAP value per feature (averaged over samples), then summed
# over classes when a class axis is present.
mean_abs = np.abs(shap_array).mean(axis=0)
shap_vals = mean_abs.sum(axis=-1) if mean_abs.ndim > 1 else mean_abs

feature_importance_shap = pd.Series(shap_vals, index=X_train.columns).sort_values(ascending=False)
print("\nSHAP Feature Importance:\n", feature_importance_shap)

# =============================================================================
# Ensemble Voting Classifier
# =============================================================================
# Combine the three tuned models; 'soft' voting averages their predicted
# class probabilities instead of counting hard votes.
ensemble_members = [
    ('rf', rf_tuned),
    ('xgb', xgb_tuned),
    ('cat', cat_tuned),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train, y_train)

# Score the ensemble on the held-out split.
y_pred_ens = voting_clf.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ens)
print("Voting Classifier Accuracy:", ensemble_accuracy)

# =============================================================================
# Prediction on New Data
# =============================================================================
# Columns the fitted models were trained on.
model_features = X_final.columns

# Symptom flags observed for the new case (1 = present, 0 = absent).
new_data = {
    'blisters on gums': [1],
    'blisters on hooves': [1],
    'blisters on mouth': [1],
    'blisters on tongue': [0],
    'chest discomfort': [0],
    'chills': [0],
    'crackling sound': [0],
    'depression': [0]
}

# Start from a single all-zero row, then copy in only the symptoms that are
# actual model columns; every other column stays at 0.
new_df = pd.DataFrame(0, index=[0], columns=model_features)
recognised_symptoms = [name for name in new_data if name in new_df.columns]
for name in recognised_symptoms:
    new_df[name] = new_data[name][0]

# Predict the encoded class and map it back to the disease name.
predicted_disease = voting_clf.predict(new_df)
predicted_index = predicted_disease[0]
disease_label = le.classes_[predicted_index]
print(f"\nPredicted Disease: {disease_label}")

# =============================================================================
# LIME Explanation
# =============================================================================
X_train_array = X_train.values
class_names = list(le.classes_)

# NOTE: this rebinds `explainer`, which earlier in the script held the SHAP
# TreeExplainer. random_state makes the sampled neighbourhood reproducible.
explainer = LimeTabularExplainer(
    training_data=X_train_array,
    feature_names=list(model_features),
    class_names=class_names,
    mode='classification',
    random_state=42
)


def _predict_proba_with_names(samples):
    """Return ensemble class probabilities for LIME's perturbed samples.

    LIME passes a bare NumPy array; the column labels are restored so the
    ensemble is queried with the same feature names it was trained on.
    """
    return voting_clf.predict_proba(pd.DataFrame(samples, columns=model_features))


# top_labels=1 explains the class with the highest predicted probability.
# By default LIME explains label index 1, whatever the model predicted.
exp = explainer.explain_instance(
    data_row=new_df.iloc[0].values,
    predict_fn=_predict_proba_with_names,
    num_features=10,
    top_labels=1
)

# Note: exp.show_in_notebook() is Jupyter-specific; in script, use exp.as_list() or save HTML
explained_label = exp.available_labels()[0]
print(f"LIME explanation for class: {class_names[explained_label]}")
print(exp.as_list(label=explained_label))  # Simple text output for script
# For full HTML: exp.save_to_file('lime_explanation.html')