<a href="https://colab.research.google.com/github/Rakesh537-ai/WEBdev/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# -------------------------------
# 0️⃣ Imports
# -------------------------------
import io
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
from google.colab import files
import xgboost as xgb

# -------------------------------
# 1️⃣ Upload TrainA and TrainB
# -------------------------------
def _read_uploaded_psv(uploaded, label):
    """Parse every uploaded pipe-separated file and stack the rows together.

    Args:
        uploaded: Mapping of file name to raw file bytes (the object handed
            back by ``files.upload()``).
        label: Dataset name, used only to make the error message specific.

    Returns:
        One DataFrame holding the rows of all uploaded files, indexed from 0.

    Raises:
        ValueError: If ``uploaded`` is empty, e.g. the dialog was cancelled.
    """
    if not uploaded:
        raise ValueError(f"❌ No file was uploaded for {label}.")
    # Strip header whitespace per file *before* stacking so that files whose
    # headers differ only by padding still line up column-for-column.
    frames = [
        pd.read_csv(io.BytesIO(raw_bytes), sep='|').rename(columns=str.strip)
        for raw_bytes in uploaded.values()
    ]
    return pd.concat(frames, ignore_index=True)


print("📂 Upload TrainA (.psv) file")
uploadedA = files.upload()

print("📂 Upload TrainB (.psv) file")
uploadedB = files.upload()

# Read uploaded files (pipe-separated). Every selected file is used, so several
# per-patient files can be uploaded at once and are combined into one dataset.
dfA = _read_uploaded_psv(uploadedA, "TrainA")
dfB = _read_uploaded_psv(uploadedB, "TrainB")

print("✅ Files uploaded successfully!")
print("TrainA shape:", dfA.shape)
print("TrainB shape:", dfB.shape)

# -------------------------------
# 2️⃣ Preprocessing
# -------------------------------
# Normalise header whitespace so column lookups (e.g. the target) are reliable.
dfA.columns = dfA.columns.str.strip()
dfB.columns = dfB.columns.str.strip()

target_col = 'SepsisLabel'

# Every preprocessing statistic below is learned from TrainA only. TrainB is
# the held-out evaluation set, so letting it influence which columns are
# dropped, the imputation medians or the scaling would leak test information
# into the training pipeline and make the reported scores optimistic.

# Drop columns with >85% missing (measured on TrainA). This also removes any
# column that is entirely empty in TrainA, so the median imputer below always
# has at least one observed value per remaining column.
threshold = 0.85
missing_fraction = dfA.drop(columns=[target_col]).isnull().mean()
cols_to_drop = missing_fraction.index[missing_fraction > threshold]
dfA = dfA.drop(columns=cols_to_drop)
dfB = dfB.drop(columns=cols_to_drop)

# Separate features and target
X_A = dfA.drop(columns=[target_col])
y_A = dfA[target_col]
X_B = dfB.drop(columns=[target_col])
y_B = dfB[target_col]

# Impute missing values with TrainA medians; TrainB is only transformed.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_A)
X_A_imputed = pd.DataFrame(imputer.transform(X_A), columns=X_A.columns)
X_B_imputed = pd.DataFrame(imputer.transform(X_B), columns=X_B.columns)

# Scale features with TrainA mean/std; TrainB is only transformed.
scaler = StandardScaler()
scaler.fit(X_A_imputed)
X_A_scaled = pd.DataFrame(scaler.transform(X_A_imputed), columns=X_A_imputed.columns)
X_B_scaled = pd.DataFrame(scaler.transform(X_B_imputed), columns=X_B_imputed.columns)

# -------------------------------
# 3️⃣ Handle imbalance + Train-test split
# -------------------------------
print("\nClass distribution in TrainA before balancing:")
class_counts = y_A.value_counts()
print(class_counts)

# Fail early, and say which of the two distinct problems was actually hit.
if len(class_counts) < 2:
    raise ValueError(
        "❌ TrainA contains only one class; it needs both sepsis and "
        "non-sepsis rows. Please check TrainA file."
    )
if class_counts.min() < 2:
    # The stratified split below needs at least 2 rows of every class.
    raise ValueError(
        "❌ The rarest class in TrainA has fewer than 2 rows, which is too "
        "few for a stratified train/test split. Please check TrainA file."
    )

minority = dfA[dfA[target_col] == 1]
majority = dfA[dfA[target_col] == 0]

if minority.empty:
    # Only reachable when the labels are not coded as 0/1; keep all rows.
    balanced_df = dfA  # fallback
else:
    # Down-sample label 0 to at most 5 rows per label-1 row (1:5 ratio).
    max_majority_rows = len(minority) * 5
    majority_sample = majority.sample(
        n=min(len(majority), max_majority_rows),
        random_state=42
    )
    balanced_df = pd.concat([minority, majority_sample], ignore_index=True)
    print(f"\nAfter balancing: {balanced_df[target_col].value_counts()}")

# Separate balanced data
X_bal = balanced_df.drop(columns=[target_col])
y_bal = balanced_df[target_col]

# Impute + scale using the transformers fitted earlier in the script, so the
# balanced subset goes through exactly the same preprocessing as TrainB.
X_bal_imputed = pd.DataFrame(imputer.transform(X_bal), columns=X_bal.columns)
X_bal_scaled = pd.DataFrame(scaler.transform(X_bal_imputed), columns=X_bal.columns)

# Stratified split: both partitions keep the balanced class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_bal_scaled, y_bal, test_size=0.2, random_state=42, stratify=y_bal
)

print("\n✅ Training set class distribution:")
print(y_train.value_counts(normalize=True))
print("\n✅ Test set class distribution:")
print(y_test.value_counts(normalize=True))

# -------------------------------
# 4️⃣ Define Models
# -------------------------------
# Keyword arguments shared by all three scikit-learn estimators: balanced
# class weights and a fixed seed.
_sklearn_shared = dict(class_weight='balanced', random_state=42)

# Linear baseline.
log_clf = LogisticRegression(max_iter=1000, **_sklearn_shared)

# Bagged trees, depth-limited, trained on all available cores.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    n_jobs=-1,
    **_sklearn_shared,
)

# Gradient-boosted trees with row and column subsampling.
_xgb_settings = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
    'random_state': 42,
}
xgb_clf = xgb.XGBClassifier(**_xgb_settings)

# RBF-kernel SVM; probability=True is required for soft voting.
svm_clf = SVC(kernel='rbf', probability=True, **_sklearn_shared)

# -------------------------------
# 5️⃣ Voting Classifier
# -------------------------------
# (name, estimator) pairs that make up the ensemble.
_ensemble_members = [
    ('lr', log_clf),
    ('rf', rf_clf),
    ('xgb', xgb_clf),
    ('svm', svm_clf),
]

# Soft voting averages the members' predicted class probabilities; the
# members are fitted in parallel on all available cores.
voting_clf = VotingClassifier(estimators=_ensemble_members, voting='soft', n_jobs=-1)

# -------------------------------
# 6️⃣ Train
# -------------------------------
print("\n🚀 Training ensemble model...")
voting_clf.fit(X_train, y_train)
print("✅ Training complete!")

# -------------------------------
# 7️⃣ Evaluate on TrainB
# -------------------------------
print("\n📊 Evaluating on TrainB dataset...")
y_pred = voting_clf.predict(X_B_scaled)
# Column 1 holds the predicted probability of the second class.
y_prob = voting_clf.predict_proba(X_B_scaled)[:, 1]

print("\nAccuracy:", accuracy_score(y_B, y_pred))
# ROC-AUC is undefined when the true labels contain a single class (likely
# when TrainB is one patient's file), so report that explicitly instead of
# letting the call abort the rest of the evaluation.
if y_B.nunique() < 2:
    print("ROC-AUC: undefined (TrainB contains only one class)")
else:
    print("ROC-AUC:", roc_auc_score(y_B, y_prob))
print("\nClassification Report:\n", classification_report(y_B, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_B, y_pred))

# -------------------------------
# 8️⃣ Feature Importance (Random Forest)
# -------------------------------
# Reuse the random forest that was already trained inside the ensemble
# rather than training an identical one again. VotingClassifier fits clones of
# its estimators, so the fitted forest is exposed under the name it was
# registered with ('rf'). Rebinding rf_clf keeps it a fitted model from here on.
rf_clf = voting_clf.named_estimators_['rf']
feat_imp = pd.Series(rf_clf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(8, 6))
feat_imp.head(15).plot(kind='barh', color='steelblue')
plt.title("Top 15 Important Features (Random Forest)")
# barh draws the first row at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.show()

📂 Upload TrainA (.psv) file


Saving p000010.psv to p000010 (1).psv
📂 Upload TrainB (.psv) file


Saving p000009.psv to p000009 (1).psv
✅ Files uploaded successfully!
TrainA shape: (23, 41)
TrainB shape: (258, 41)

Class distribution in TrainA before balancing:
SepsisLabel
0    23
Name: count, dtype: int64


ValueError: ❌ Your dataset has only one class (no sepsis cases). Please check TrainA file.