In [39]:
!pip install --upgrade numpy==2.0.0 scikit-learn==1.6.0 --force-reinstall

Collecting numpy==2.0.0
  Using cached numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting scikit-learn==1.6.0
  Downloading scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn==1.6.0)
  Using cached scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
Collecting joblib>=1.2.0 (from scikit-learn==1.6.0)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn==1.6.0)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.0 MB)
Downloading scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached job

In [1]:
import numpy as np
import sklearn

# Report the versions of the libraries the kernel actually imported, so they can
# be compared against the versions pinned in the install cell.
print("NumPy:", np.__version__)
print("Sklearn:", sklearn.__version__)

NumPy: 2.0.0
Sklearn: 1.6.0


In [2]:
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, roc_auc_score, roc_curve, classification_report)
import joblib
# Shared seed: passed as random_state to the estimators and to train_test_split
# further down so repeated runs give the same split and models.
RANDOM_STATE = 42

In [3]:
from google.colab import files
# User guidance only: these two calls print instructions and do not upload or
# download anything themselves.
# NOTE(review): the first message refers to a "next line" that uploads a CSV, but
# no upload call is present in this cell - confirm whether one was removed.
print('If you have a local CSV, run the next line to upload it to Colab runtime:')
print('\nIf you want to download an example dataset (Titanic), run this cell:')
def load_example(name='titanic'):
    """Download one of the example datasets and return it as a DataFrame.

    Parameters
    ----------
    name : str, default 'titanic'
        Which example to fetch; 'titanic' and 'heart' are the recognised values.

    Returns
    -------
    pandas.DataFrame
        The CSV behind the matching URL, parsed with ``pd.read_csv``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the recognised examples.
    """
    # Resolve the download location first, then read it in a single place.
    if name == 'titanic':
        source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
    elif name == 'heart':
        source = 'https://raw.githubusercontent.com/amarbudhiraj/Heart-Disease-Prediction-using-Machine-Learning/master/heart.csv'
    else:
        raise ValueError('example not found')
    return pd.read_csv(source)

If you have a local CSV, run the next line to upload it to Colab runtime:

If you want to download an example dataset (Titanic), run this cell:


In [4]:
def quick_check(df):
    """Show a first-look summary of ``df`` in the notebook.

    Prints the shape, then displays the first rows, the transposed
    ``describe(include='all')`` table and the 20 columns with the most
    missing values (highest count first). Returns nothing.
    """
    print('Rows, cols:', df.shape)

    preview = df.head()
    display(preview)

    # Transposed so that each original column becomes one row of the summary.
    summary = df.describe(include='all').T
    display(summary)

    print('\nMissing values per column:')
    missing_per_column = df.isnull().sum()
    display(missing_per_column.sort_values(ascending=False).head(20))

In [5]:
def remove_duplicates(df):
    """Return ``df`` without fully duplicated rows and print how many were dropped.

    The input frame is not modified; ``drop_duplicates`` produces a new frame.
    """
    deduplicated = df.drop_duplicates()
    n_removed = df.shape[0] - deduplicated.shape[0]
    print(f'Removed {n_removed} duplicates')
    return deduplicated
def fix_dtypes(df, convert_dict=None):
    """Convert selected columns of ``df`` to the requested dtypes, best effort.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; columns are reassigned on this same object.
    convert_dict : dict or None
        Mapping ``column -> dtype``. The special value ``'datetime'`` parses the
        column with ``pd.to_datetime(errors='coerce')``; anything else is handed
        to ``Series.astype``. ``None`` means "nothing to do".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    A failed conversion is reported with a printed message and the column is
    left unchanged; it never aborts the remaining conversions.
    """
    if convert_dict is None:
        return df

    for column, dtype in convert_dict.items():
        try:
            converted = (
                pd.to_datetime(df[column], errors='coerce')
                if dtype == 'datetime'
                else df[column].astype(dtype)
            )
            df[column] = converted
            print(f'Converted {column} to {dtype}')
        except Exception as err:
            print(f'Could not convert {column}:', err)
    return df


def impute_missing(df, strategy_num='median', strategy_cat='most_frequent', drop_thresh=0.5):
    """Drop mostly-empty columns and fill the remaining missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified; a new frame is returned.
    strategy_num : str, default 'median'
        ``SimpleImputer`` strategy for numeric columns.
    strategy_cat : str, default 'most_frequent'
        ``SimpleImputer`` strategy for every column that is neither numeric nor
        datetime. ``fill_value='Missing'`` is passed along as well; scikit-learn
        only uses it when the strategy is ``'constant'``.
    drop_thresh : float, default 0.5
        Columns whose fraction of missing values is strictly greater than this
        are dropped before imputation.

    Returns
    -------
    pandas.DataFrame
        A new frame with high-missing columns removed and gaps imputed.
        Datetime columns are left as they are.
    """
    # Nothing to impute, and SimpleImputer rejects zero-sample input, so hand
    # back an untouched copy instead of failing inside scikit-learn.
    if df.empty:
        return df.copy()

    drop_cols = [c for c in df.columns if df[c].isnull().mean() > drop_thresh]
    if drop_cols:
        print('Dropping high-missing columns:', drop_cols)
        df = df.drop(columns=drop_cols)
    else:
        # Work on a copy so the column assignments below never leak into the
        # caller's frame (previously that happened only when nothing was dropped).
        df = df.copy()

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(exclude=[np.number, 'datetime']).columns.tolist()

    if num_cols:
        imputer_num = SimpleImputer(strategy=strategy_num)
        df[num_cols] = imputer_num.fit_transform(df[num_cols])
    if cat_cols:
        imputer_cat = SimpleImputer(strategy=strategy_cat, fill_value='Missing')
        df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

    return df


def cap_outliers_iqr(df, cols=None):
    """Clamp values lying outside the 1.5 * IQR fences, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; the selected columns are reassigned on this object.
    cols : list or None
        Columns to cap. ``None`` selects every numeric column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each selected column limited to
        ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
    """
    if cols is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()

    for name in cols:
        first_quartile = df[name].quantile(0.25)
        third_quartile = df[name].quantile(0.75)
        spread = third_quartile - first_quartile
        lower_fence = first_quartile - 1.5*spread
        upper_fence = third_quartile + 1.5*spread

        # Two passes: raise everything below the lower fence, then lower
        # everything above the upper fence.
        df[name] = np.where(df[name] < lower_fence, lower_fence, df[name])
        df[name] = np.where(df[name] > upper_fence, upper_fence, df[name])

    print('Capped outliers using IQR for', cols)
    return df

In [6]:
def eda_plots(df, target=None, figsize=(12,8)):
    """Draw a standard set of exploratory plots for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to visualise.
    target : str or None
        Optional column name. When given and present in ``df``, every numeric
        column is scattered against it and up to six non-numeric columns are
        shown as count plots split by it.
    figsize : tuple, default (12, 8)
        Accepted for backward compatibility. The figure sizes used below are
        fixed and this value is currently not applied.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``.
    """
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(exclude=[np.number, 'datetime']).columns.tolist()

    # DataFrame.hist and sns.heatmap both raise on an empty column selection,
    # so the numeric overview is skipped when there is nothing numeric to plot.
    if num_cols:
        # 1. Histograms for numeric
        df[num_cols].hist(bins=20, figsize=(14,10))
        plt.tight_layout()
        plt.show()

        # 2. Heatmap (correlation)
        plt.figure(figsize=(10,8))
        sns.heatmap(df[num_cols].corr(), annot=True, fmt='.2f', cmap='coolwarm')
        plt.title('Correlation matrix (numeric)')
        plt.show()

    # 3. Boxplots for numeric to see outliers
    for c in num_cols:
        plt.figure(figsize=(6,2))
        sns.boxplot(x=df[c])
        plt.title(f'Boxplot - {c}')
        plt.show()

    # 4. If target given, show relationship plots
    if target and target in df.columns:
        for c in num_cols:
            if c == target:
                continue
            plt.figure(figsize=(6,4))
            sns.scatterplot(data=df, x=c, y=target)
            plt.title(f'{c} vs {target}')
            plt.show()

        # Only the first six categorical columns, to keep the output manageable.
        for c in cat_cols[:6]:
            plt.figure(figsize=(8,4))
            sns.countplot(data=df, x=c, hue=target)
            plt.title(f'{c} counts by {target}')
            plt.xticks(rotation=45)
            plt.show()


def basic_insights(df, target=None, top_n=10):
    """Collect a few one-line textual observations about ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    target : str or None
        Optional target column. When it is a numeric column of ``df``, the
        five numeric columns with the highest absolute correlation to it are
        listed. A non-numeric target is skipped here, because the correlation
        matrix only covers numeric columns.
    top_n : int, default 10
        Accepted for backward compatibility; the correlation list is currently
        fixed at five entries and does not use this value.

    Returns
    -------
    list of str
        Insight lines: optional correlation summary, optional list of columns
        with missing values, and the frame's shape.
    """
    insights = []
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Checking membership in num_cols (not df.columns) avoids a KeyError when
    # the target exists but is categorical/text and so absent from corr().
    if target and target in num_cols:
        corr = df[num_cols].corr()[target].drop(target).abs().sort_values(ascending=False)
        insights.append('Top numeric correlates with target: ' + ', '.join(corr.head(5).index.tolist()))

    nulls = df.isnull().sum()
    if nulls.sum() > 0:
        insights.append('Columns with missing values: ' + ', '.join(nulls[nulls>0].index.tolist()))

    insights.append('Number of rows and columns: ' + str(df.shape))
    return insights

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
def create_feature_pipeline(df, categorical_thresh=10, target=None):
    """Build an unfitted ``ColumnTransformer`` matched to the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used only to decide which columns go to which transformer.
    categorical_thresh : int, default 10
        Non-numeric columns with at most this many distinct values are one-hot
        encoded; columns with more distinct values are ordinal encoded.
    target : str or None
        Column to leave out of every feature list.

    Returns
    -------
    tuple
        ``(preprocessor, num_cols, ohe_cols, le_cols)`` - the transformer plus
        the numeric, one-hot and ordinal-encoded column name lists.
    """
    # Imported locally: the notebook's import cell does not bring in OrdinalEncoder.
    from sklearn.preprocessing import OrdinalEncoder

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(exclude=[np.number, 'datetime']).columns.tolist()
    if target and target in num_cols:
        num_cols.remove(target)
    if target and target in cat_cols:
        cat_cols.remove(target)

    # Count distinct values once per column, then split on the threshold.
    cardinality = {c: df[c].nunique() for c in cat_cols}
    ohe_cols = [c for c in cat_cols if cardinality[c] <= categorical_thresh]
    le_cols = [c for c in cat_cols if cardinality[c] > categorical_thresh]

    num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                    ('scaler', StandardScaler())])
    ohe_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    # High-cardinality columns used to be imputed only, so raw strings reached
    # the estimators and fitting failed. Encode them as integer codes; categories
    # unseen during fit map to -1 instead of raising at transform time.
    le_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',
                                                               unknown_value=-1))])
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, num_cols),
        ('ohe', ohe_transformer, ohe_cols),
        ('le', le_transformer, le_cols)
    ], remainder='drop')

    return preprocessor, num_cols, ohe_cols, le_cols

In [8]:
def train_models(X_train, y_train):
    """Fit the notebook's two baseline classifiers on the training data.

    Returns
    -------
    dict
        ``{'LogisticRegression': fitted model, 'RandomForest': fitted model}``,
        both seeded with ``RANDOM_STATE``.
    """
    estimators = {
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    }
    # Fit in insertion order: logistic regression first, then the forest.
    for estimator in estimators.values():
        estimator.fit(X_train, y_train)
    return estimators

In [9]:
def evaluate_model(model, X_test, y_test, model_name='model'):
    """Score a fitted classifier on held-out data, print a report and return the metrics.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict`` and, optionally, ``predict_proba``.
    X_test, y_test : array-like
        Held-out features and true labels.
    model_name : str, default 'model'
        Label used in the printed report and the ROC plot title.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc`` and
        ``confusion_matrix``. ``roc_auc`` is ``None`` when no usable class
        probabilities are available.

    Notes
    -----
    With at most two observed classes the metrics are the binary ones, exactly
    as before. With more than two classes, precision/recall/F1 use a weighted
    average and ROC-AUC is computed one-vs-rest; no ROC curve is drawn then.
    """
    y_pred = model.predict(X_test)

    # Probabilities are optional. Stay best-effort, but say why the ROC metrics
    # are missing instead of swallowing the error silently.
    proba = None
    try:
        proba = np.asarray(model.predict_proba(X_test))
    except Exception as exc:
        print(f'{model_name}: no usable predict_proba ({exc}); ROC metrics skipped')

    # The default average='binary' raises for targets with more than two
    # classes, so switch to a weighted average in that case.
    observed = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))
    is_multiclass = len(observed) > 2
    average = 'weighted' if is_multiclass else 'binary'

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average=average, zero_division=0)
    rec = recall_score(y_test, y_pred, average=average, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=average, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    y_proba = None  # probability column at index 1; only set on the binary path
    roc = None
    if proba is not None and proba.ndim == 2:
        if is_multiclass:
            try:
                roc = roc_auc_score(y_test, proba, multi_class='ovr',
                                    labels=getattr(model, 'classes_', None))
            except ValueError as exc:
                # e.g. a class that never occurs in y_test
                print(f'{model_name}: multiclass ROC-AUC not computable ({exc})')
        elif proba.shape[1] >= 2:
            y_proba = proba[:, 1]
            roc = roc_auc_score(y_test, y_proba)

    print(f'--- {model_name} ---')
    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('F1:', f1)
    if roc is not None:
        print('ROC-AUC:', roc)
    print('Confusion matrix:\n', cm)
    print('\nClassification report:\n', classification_report(y_test, y_pred))

    if y_proba is not None:
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        plt.figure()
        plt.plot(fpr, tpr)
        plt.plot([0,1],[0,1],'--')  # chance-level diagonal for reference
        plt.title(f'ROC - {model_name}')
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.show()

    return {'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'roc_auc':roc, 'confusion_matrix':cm}

In [10]:
def run_full_pipeline(df, target, test_size=0.3):
    """Clean ``df``, explore it, train the baseline models and save the best one.

    Steps: de-duplicate, drop rows without a target, impute, cap feature
    outliers, plot/print EDA, build the preprocessing transformer, split,
    fit, evaluate, and dump the best model (by F1) together with the fitted
    preprocessor to ``model_pipeline.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; a copy is processed, the argument itself is not modified.
    target : str
        Name of the label column.
    test_size : float, default 0.3
        Fraction of rows held out for evaluation.

    Returns
    -------
    dict
        ``{'models': fitted models by name, 'results': metrics by name,
        'best': name of the model with the highest F1}``.

    Raises
    ------
    AssertionError
        If ``target`` is not a column of ``df``.
    ValueError
        If no row has a non-missing target value.
    """
    assert target in df.columns, 'Target column not found'

    df_clean = remove_duplicates(df.copy())

    # Rows without a label cannot be used for supervised training. Drop them
    # rather than letting the imputer invent a target value (the median of a
    # 0/1 column can be 0.5).
    n_before = df_clean.shape[0]
    df_clean = df_clean.dropna(subset=[target])
    n_unlabelled = n_before - df_clean.shape[0]
    if n_unlabelled:
        print(f'Dropped {n_unlabelled} rows with a missing target')
    if df_clean.empty:
        raise ValueError('No rows with a non-missing target value')

    # NOTE(review): imputation and capping statistics are computed on the full
    # data before the split below, so the test fold influences them.
    df_clean = impute_missing(df_clean)

    # Cap features only. Capping the label itself can collapse a minority
    # class: when under 25% of a 0/1 target is positive, both fences equal 0.
    feature_num_cols = [c for c in df_clean.select_dtypes(include=[np.number]).columns if c != target]
    df_clean = cap_outliers_iqr(df_clean, cols=feature_num_cols)

    eda_plots(df_clean, target=target)
    print('\nBasic insights:\n', '\n'.join(basic_insights(df_clean, target=target)))

    preprocessor, num_cols, ohe_cols, le_cols = create_feature_pipeline(df_clean, categorical_thresh=7, target=target)

    X = df_clean.drop(columns=[target])
    y = df_clean[target]

    # Encode any non-numeric label (object, string or categorical dtype) as
    # integer codes for the downstream metrics.
    if not pd.api.types.is_numeric_dtype(y):
        le = LabelEncoder()
        y = le.fit_transform(y)

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=RANDOM_STATE,
        stratify=y if len(np.unique(y))>1 else None,
    )

    # Fit the preprocessing on the training fold only, then apply it to both.
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    models = train_models(X_train, y_train)

    results = {}
    for name, m in models.items():
        results[name] = evaluate_model(m, X_test, y_test, model_name=name)

    best_name = max(results.keys(), key=lambda k: results[k]['f1'])
    best_model = models[best_name]
    joblib.dump({'model':best_model, 'preprocessor':preprocessor}, 'model_pipeline.pkl')
    print('Saved best model:', best_name)

    return {'models':models, 'results':results, 'best':best_name}

In [11]:
# Source of a minimal FastAPI service, kept as a string so this cell can write
# it to disk. The generated app loads 'model_pipeline.pkl', reads the 'model'
# and 'preprocessor' entries from it and exposes POST /predict, which returns
# the predicted class and the probability taken from column 1 of predict_proba.
# NOTE(review): InputSchema lists example fields only (see the comment inside
# the class); they must be updated to match the columns the preprocessor was
# fitted on before the service can work.
# NOTE(review): joblib.load unpickles arbitrary objects - only load a
# model_pipeline.pkl that you produced yourself.
fastapi_snippet = '''
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import pandas as pd


app = FastAPI()
model_bundle = joblib.load('model_pipeline.pkl')
model = model_bundle['model']
preprocessor = model_bundle['preprocessor']


class InputSchema(BaseModel):
    # Update these fields to match your features, example:
    Pclass: int
    Sex: str
    Age: float
    SibSp: int
    Parch: int
    Fare: float
    Embarked: str


@app.post('/predict')
def predict(payload: InputSchema):
    data = pd.DataFrame([payload.dict()])
    X = preprocessor.transform(data)
    y_proba = model.predict_proba(X)[:,1]
    y = model.predict(X)
    return {'prediction': int(y[0]), 'probability': float(y_proba[0])}
'''

# Write the snippet next to the notebook; an existing file is overwritten.
with open('fastapi_app_snippet.py','w') as f:
    f.write(fastapi_snippet)


print('FastAPI snippet saved as fastapi_app_snippet.py')

FastAPI snippet saved as fastapi_app_snippet.py


In [12]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load dataset
# Load dataset
df = pd.read_csv("heart.csv")  # upload your file OR load via URL

# Every column except "target" is used as a feature.
X = df.drop("target", axis=1)
y = df["target"]

pipe = Pipeline([
    ("scaler", StandardScaler()),
    # Fixed seed so re-running the cell reproduces the same forest.
    ("model", RandomForestClassifier(random_state=42))
])

# NOTE(review): the model is fitted on the full dataset; no hold-out split is
# made in this cell, so it yields no performance estimate.
pipe.fit(X, y)

# Save model
# Store the fitted steps under the 'model' / 'preprocessor' keys. That is the
# bundle layout run_full_pipeline writes to this path and the layout
# fastapi_app_snippet.py reads back; a bare Pipeline has no 'preprocessor'
# step, so the snippet's lookup would fail on it.
joblib.dump(
    {"model": pipe.named_steps["model"], "preprocessor": pipe.named_steps["scaler"]},
    "model_pipeline.pkl",
)

print("model_pipeline.pkl saved")


model_pipeline.pkl saved


In [14]:
from google.colab import files
# Send the saved model file to the browser as a download, using the
# google.colab `files` helper imported above.
files.download("model_pipeline.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>