In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [None]:
# Read the raw cirrhosis CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="/content/cirrhosis.csv")

In [None]:
# Columns whose missing values get imputed further down:
# numeric columns receive the column mean, categorical columns the mode.
num_cols = [
    'Cholesterol',
    'Copper',
    'Alk_Phos',
    'SGOT',
    'Tryglicerides',
    'Platelets',
    'Prothrombin',
    'Stage',
]
cat_cols = [
    'Drug',
    'Ascites',
    'Hepatomegaly',
    'Spiders',
]

In [None]:
# Fill missing values: column mean for numeric features, most frequent value
# for categorical features.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on data[col]: the chained in-place form operates on
# an intermediate object, triggers a FutureWarning, and no longer updates
# `data` from pandas 3.0 onward.
for col in num_cols:
    data[col] = data[col].fillna(data[col].mean())
for col in cat_cols:
    # mode() returns a Series of the most frequent values; take the first one.
    data[col] = data[col].fillna(data[col].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [None]:
# Expand each listed categorical column into one indicator column per category.
data_encoded = pd.get_dummies(
    data=data,
    columns=[
        'Drug',
        'Ascites',
        'Hepatomegaly',
        'Spiders',
        'Sex',
        'Edema',
    ],
)

In [None]:
# 'Status' is the prediction target; every remaining column is a feature.
y = data_encoded['Status']
X = data_encoded.drop(columns=['Status'])

In [None]:
# Remove the 'ID' column from the features when it exists;
# errors='ignore' makes this a no-op when the column is absent.
X = X.drop(columns='ID', errors='ignore')

In [None]:
# Standardise the feature matrix; the fitted scaler is kept so the same
# transformation can be applied to new data later.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split performed further down — confirm this is intended.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 30% of the scaled rows for evaluation; the fixed random_state makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=23,
    test_size=0.3,
)

In [None]:
# Train a random forest on the training split; only the seed is set, all other
# hyperparameters are left at the library defaults.
rf = RandomForestClassifier(random_state=42)
rf.fit(X=x_train, y=y_train)

In [None]:
# Evaluate the forest on the held-out split.
# Predict once and reuse the labels for both metrics instead of running the
# forest over x_test twice.
y_test_pred = rf.predict(x_test)
print("✅ RF Accuracy on Test:", accuracy_score(y_test, y_test_pred) * 100)
# zero_division=0 reports 0.0 for a class that is never predicted (the value
# the default setting also reports) without emitting UndefinedMetricWarning.
print("✅ RF Report:\n", classification_report(y_test, y_test_pred, zero_division=0))

✅ RF Accuracy on Test: 73.80952380952381
✅ RF Report:
               precision    recall  f1-score   support

           C       0.76      0.86      0.81        66
          CL       0.00      0.00      0.00         9
           D       0.71      0.71      0.71        51

    accuracy                           0.74       126
   macro avg       0.49      0.52      0.50       126
weighted avg       0.68      0.74      0.71       126



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist the fitted forest and the fitted scaler as pickle files in the
# current working directory.
for pickle_path, fitted_object in (("rf.pkl", rf), ("scaler.pkl", scaler)):
    with open(pickle_path, "wb") as f:
        pickle.dump(fitted_object, f)

In [None]:
# ===================== LOAD NEW DATA FOR PREDICTION =====================
# Read the rows to be classified into their own DataFrame.
pred_data = pd.read_csv(filepath_or_buffer="/content/sample_test3.csv")

In [None]:
# Fill missing values in the prediction data.
# The fill values are taken from the training frame `data` (column mean for
# numeric features, most frequent value for categorical ones) rather than from
# pred_data itself, so new rows are imputed with the same statistics as the
# data the model was fitted on, and a column that is entirely missing in
# pred_data still receives a usable value.
# The result is assigned back instead of using fillna(..., inplace=True) on
# pred_data[col]: the chained in-place form triggers a FutureWarning and no
# longer updates the frame from pandas 3.0 onward.
for col in num_cols:
    pred_data[col] = pred_data[col].fillna(data[col].mean())
for col in cat_cols:
    # mode() returns a Series of the most frequent values; take the first one.
    pred_data[col] = pred_data[col].fillna(data[col].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pred_data[col].fillna(pred_data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pred_data[col].fillna(pred_data[col].mode()[0], inplace=True)


In [None]:
# Expand the same categorical columns as for the training frame into
# indicator columns.
pred_data_encoded = pd.get_dummies(
    data=pred_data,
    columns=[
        'Drug',
        'Ascites',
        'Hepatomegaly',
        'Spiders',
        'Sex',
        'Edema',
    ],
)

In [None]:
# Remove the 'ID' column from the encoded prediction frame when it exists;
# errors='ignore' makes this a no-op when the column is absent.
pred_data_encoded = pred_data_encoded.drop(columns='ID', errors='ignore')

In [None]:
# Give the prediction frame exactly the training feature columns, in the same
# order: columns not in X are dropped, columns missing here are added as 0.
pred_data_encoded = pred_data_encoded.reindex(
    X.columns,
    axis="columns",
    fill_value=0,
)


In [None]:
# Restore the scaler pickled earlier in this notebook and apply it to the
# aligned prediction features.
with open("scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)
pred_scaled = scaler.transform(pred_data_encoded)

In [None]:
# Persist the ordered list of training feature names so the column layout can
# be reproduced without the training frame.
feature_names = X.columns.tolist()
with open("model_columns.pkl", "wb") as columns_file:
    pickle.dump(feature_names, columns_file)


In [None]:
# Verify the saved feature column names.
# model_columns.pkl is already written by the preceding cell, so writing the
# identical list a second time was redundant. Read the file back instead and
# check that it holds exactly the training feature names, in order.
with open("model_columns.pkl", "rb") as f:
    saved_columns = pickle.load(f)
assert saved_columns == X.columns.tolist(), "model_columns.pkl does not match the training features"


In [None]:
# Restore the forest pickled earlier in this notebook and label the scaled
# prediction rows with it.
with open("rf.pkl", "rb") as model_file:
    rf_model = pickle.load(model_file)
rf_preds = rf_model.predict(pred_scaled)
print("✅ Random Forest Prediction on New Data:", rf_preds)

✅ Random Forest Prediction on New Data: ['D' 'C' 'D' 'D' 'CL' 'D' 'C' 'D' 'D' 'D' 'D' 'D' 'C' 'D' 'C' 'C' 'D' 'D'
 'C' 'D' 'C' 'D' 'D' 'D' 'C' 'D' 'D' 'D' 'C' 'D' 'D' 'C' 'D' 'C' 'C' 'C'
 'D' 'D' 'C' 'C' 'D' 'C' 'C' 'D' 'C' 'D' 'C' 'C' 'D' 'D' 'C' 'D' 'D' 'D'
 'D' 'D' 'D' 'C' 'C' 'C' 'C' 'D' 'D' 'D' 'C' 'D' 'C' 'C' 'D' 'C' 'C' 'C'
 'C' 'D' 'D' 'D' 'D' 'D' 'C' 'D' 'D' 'D' 'D' 'C' 'C' 'D' 'D' 'C' 'D' 'D'
 'D' 'D' 'C' 'D' 'D' 'C' 'D' 'C' 'C' 'D' 'C' 'C' 'D' 'D' 'CL' 'D' 'C' 'C'
 'C' 'D' 'D' 'D' 'D' 'D' 'C' 'D' 'D' 'C' 'D' 'CL' 'D' 'C' 'D' 'C' 'CL' 'D'
 'C' 'D' 'C' 'D' 'D' 'C' 'D' 'C' 'C' 'C' 'C' 'D' 'C' 'C' 'C' 'D' 'D' 'D'
 'C' 'C' 'C' 'D' 'D' 'C' 'C' 'D' 'C' 'D' 'C' 'D' 'C' 'CL' 'C' 'C' 'C' 'D'
 'D' 'D' 'D' 'C' 'D' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'D'
 'C' 'C' 'C' 'D' 'C' 'D' 'D' 'C' 'C' 'C' 'D' 'C' 'D' 'C' 'C' 'C' 'C' 'C'
 'C' 'C' 'C' 'C' 'C' 'D' 'D' 'C' 'C' 'D' 'C' 'C' 'C' 'C' 'C' 'C' 'D' 'C'
 'D' 'C' 'C' 'D' 'C' 'D' 'D' 'C' 'C' 'C' 'C' 'C' 'D' 'C' 'D' 'C' 'C' 'C'
 'C' '

In [None]:
# Attach the predicted labels to the prediction frame and write the frame,
# without its index, to a CSV file in the current working directory.
output_csv = "predicted_output.csv"
pred_data['RF_Predicted_Status'] = rf_preds
pred_data.to_csv(output_csv, index=False)
print("\n📁 Predictions saved to predicted_output.csv")


📁 Predictions saved to predicted_output.csv
