In [None]:
import pandas as pd

# Read the CSV file from the working directory into a DataFrame
csv_path = 'data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the first rows of the frame as plain text
preview = df.head()
print(preview)
# Further inspections, currently disabled:
# print(df.info())
# print(df.describe())

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [None]:
# Dropping columns that are not needed. errors='ignore' tolerates columns that
# are already gone, so re-running this cell does not raise KeyError.
LABEL_CODES = {'M': 1, 'B': 0}
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Map the target to binary values: 'M' to 1 (malignant), 'B' to 0 (benign).
# Series.map turns every value missing from the dict into NaN, so the mapping
# is skipped when the column is already encoded (i.e. the cell was run before);
# otherwise a second run would wipe out the whole target column.
if not df['diagnosis'].isin(list(LABEL_CODES.values())).all():
    df['diagnosis'] = df['diagnosis'].map(LABEL_CODES)

# Fail loudly instead of carrying NaN targets into model training.
if df['diagnosis'].isna().any():
    raise ValueError("diagnosis contains labels other than 'M' and 'B'")

# Separate features and target datasets
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, random_state=102, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic regression classifier on the training data
model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Predict the held-out rows and print the classification report
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print("Logistic Regression:")
print(report_text)

Logistic Regression:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



## Decision Tree Classifier

In [None]:

from sklearn.tree import DecisionTreeClassifier

# Initialize and train the model.
# A fixed random_state makes the fitted tree, and therefore the report below,
# reproducible across kernel restarts (per the scikit-learn docs the tree
# permutes candidate features while searching for splits). 8888 matches the
# seed used for the decision tree in the MLflow cells of this notebook.
model = DecisionTreeClassifier(random_state=8888)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Decision Tree Classifier:")
print(classification_report(y_test, y_pred))

Decision Tree Classifier:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        71
           1       0.93      0.93      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with its default settings
model = SVC().fit(X_train, y_train)

# Predict the held-out rows and print the classification report
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print("Support Vector Machine (SVM):")
print(report_text)


Support Vector Machine (SVM):
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 17 21:30:22 2024

@author: retro
"""

import pandas as pd

# Load the dataset
df = pd.read_csv("data.csv")

# Dropping columns that are not needed
df = df.drop(columns=['id', 'Unnamed: 32'])

# Map the target to binary values: 'M' to 1 (malignant), 'B' to 0 (benign)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Series.map yields NaN for any label missing from the dict; stop here rather
# than passing NaN targets on to model training.
if df['diagnosis'].isna().any():
    raise ValueError("diagnosis contains labels other than 'M' and 'B'")

# Separate features and target datasets
X = df.drop(columns=['diagnosis'])
y = df['diagnosis']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, random_state=200, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Hyperparameters used to build the LogisticRegression model; the same dict is
# later passed to mlflow.log_params
params = dict(
    solver="lbfgs",
    max_iter=1000,
    multi_class="auto",
    random_state=8888,
)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build the classifier from the hyperparameter dict and fit it
model = LogisticRegression(**params).fit(X_train, y_train)

# Predict the held-out rows and print the text report
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print("Logistic Regression:")
print(report_text)

# Same report as a nested dict, so individual metrics can be looked up by key
class_report = classification_report(y_test, y_pred, output_dict=True)

# Log the parameters, selected metrics and the fitted model to MLflow

import mlflow
import mlflow.sklearn  # explicit import, as in the later cells that call mlflow.sklearn

# The tracking URI must be set BEFORE the experiment is selected:
# set_experiment resolves (or creates) the experiment against whatever store is
# configured at call time. In the reverse order "cancer_data" would be looked
# up in the default store, and that experiment id would then be reused against
# the server at 127.0.0.1:5000.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("cancer_data")

with mlflow.start_run():
    mlflow.log_params(params)
    # class_report is keyed by the class labels as strings ('0', '1')
    mlflow.log_metrics({
        'accuracy': class_report['accuracy'],
        'recall_class_0': class_report['0']['recall'],
        'recall_class_1': class_report['1']['recall'],
        'f1_score': class_report['macro avg']['f1-score']
        })
    mlflow.sklearn.log_model(model, "Logistic Regression")




NameError: name 's' is not defined

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Define a function to handle model training, evaluation, and logging
def train_evaluate_log_model(model, model_name, X_train, X_test, y_train, y_test, params):
    """Fit ``model``, print its classification report and record the run in MLflow.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Printed as the report heading and used as the artifact path of the
        logged model.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : data the model is evaluated on.
    params : dict
        Forwarded unchanged to ``mlflow.log_params``.

    Returns
    -------
    None

    Notes
    -----
    The metrics are read from the report under the keys ``'0'`` and ``'1'``,
    so the target is assumed to be encoded as 0/1.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Human-readable report on stdout
    print(f"{model_name}:")
    report_text = classification_report(y_test, predictions)
    print(report_text)

    # Dict form of the same report, used to pick out the logged metrics
    report = classification_report(y_test, predictions, output_dict=True)

    with mlflow.start_run():
        mlflow.log_params(params)
        metrics = {
            'accuracy': report['accuracy'],
            'recall_class_0': report['0']['recall'],
            'recall_class_1': report['1']['recall'],
            'f1_score': report['macro avg']['f1-score'],
        }
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, model_name)
        print(f"Model {model_name} logged successfully.\n")

# Split X / y into train and test parts, then standardise the features with a
# scaler fitted on the training part only
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=102, test_size=0.2
)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Set up MLflow experiment.
# The tracking URI is configured first: set_experiment resolves (or creates)
# the experiment against the store that is active when it is called, so calling
# it before set_tracking_uri would bind "cancer_data" to the default store
# instead of the server at 127.0.0.1:5000.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("cancer_data")

# Logistic Regression: one dict both configures the estimator and is handed on
# for logging
logistic_params = dict(
    solver="lbfgs",
    max_iter=10000,
    multi_class="auto",
    random_state=8888,
)
logistic_model = LogisticRegression(**logistic_params)
train_evaluate_log_model(
    logistic_model,
    "Logistic Regression",
    X_train,
    X_test,
    y_train,
    y_test,
    logistic_params,
)

# Decision Tree Classifier: seeded so repeated runs build the same tree
dt_params = dict(random_state=8888)
dt_model = DecisionTreeClassifier(**dt_params)
train_evaluate_log_model(
    dt_model,
    "Decision Tree Classifier",
    X_train,
    X_test,
    y_train,
    y_test,
    dt_params,
)

# Random Forest Classifier, as an example of adding a further model; others
# such as SVM or XGBoost can be added the same way
from sklearn.ensemble import RandomForestClassifier

rf_params = dict(n_estimators=100, random_state=8888)
rf_model = RandomForestClassifier(**rf_params)
train_evaluate_log_model(
    rf_model,
    "Random Forest Classifier",
    X_train,
    X_test,
    y_train,
    y_test,
    rf_params,
)

# from xgboost import XGBClassifier
# xgb_params = {"random_state": 8888}
# xgb_model = XGBClassifier(**xgb_params)
# train_evaluate_log_model(xgb_model, "XGBoost Classifier", X_train, X_test, y_train, y_test, xgb_params)


In [None]:
# Suppose you are an ML engineer.

In [None]:
# I am also experimenting with the preprocessing steps.

# Now, as the ML engineer: extend the workflow so that the preprocessing steps are logged as well.



In [None]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Function to log preprocessing steps
def log_preprocessing(preprocessing_steps):
    """Store ``preprocessing_steps`` as the MLflow artifact ``preprocessing_steps.json``.

    Parameters
    ----------
    preprocessing_steps : dict
        Description of the preprocessing that was applied; handed unchanged
        to ``mlflow.log_dict``.
    """
    artifact_file = "preprocessing_steps.json"
    mlflow.log_dict(preprocessing_steps, artifact_file)

# Define a function to handle model training, evaluation, and logging
def train_evaluate_log_model(model, model_name, X_train, X_test, y_train, y_test, params, preprocessing_steps):
    """Fit ``model``, print its classification report and record everything in one MLflow run.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Printed as the report heading and used as the artifact path of the
        logged model.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : data the model is evaluated on.
    params : dict
        Forwarded unchanged to ``mlflow.log_params``.
    preprocessing_steps : dict
        Forwarded to ``log_preprocessing`` so it is stored with the run.

    Returns
    -------
    None

    Notes
    -----
    The metrics are read from the report under the keys ``'0'`` and ``'1'``,
    so the target is assumed to be encoded as 0/1.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    print(f"{model_name}:")
    print(classification_report(y_test, y_pred))

    # Generate classification report as a dictionary
    class_report = classification_report(y_test, y_pred, output_dict=True)

    # Log with MLflow
    with mlflow.start_run():
        # The preprocessing steps are logged inside the run on purpose. Per the
        # MLflow fluent API, logging while no run is active implicitly starts
        # one; the start_run() above would then fail because a run is already
        # active, and the artifact would not belong to the model's run.
        log_preprocessing(preprocessing_steps)
        mlflow.log_params(params)
        mlflow.log_metrics({
            'accuracy': class_report['accuracy'],
            'recall_class_0': class_report['0']['recall'],
            'recall_class_1': class_report['1']['recall'],
            'f1_score': class_report['macro avg']['f1-score']
        })
        mlflow.sklearn.log_model(model, model_name)
        print(f"Model {model_name} logged successfully.\n")

# Split X / y into train and test parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=102, test_size=0.2
)

# Preprocessing: standardise the features with a scaler fitted on the
# training part only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Describe the preprocessing so it can be stored with each run
scaling_info = {
    "method": "StandardScaler",
    "mean": scaler.mean_.tolist(),  # mean used for scaling
    "var": scaler.var_.tolist(),    # variance used for scaling
}
# Written by hand to mirror the train_test_split call in this cell
split_info = {"test_size": 0.2, "random_state": 102}
preprocessing_steps = {
    "scaling": scaling_info,
    "train_test_split": split_info,
}

# Set up MLflow experiment.
# The tracking URI is configured first: set_experiment resolves (or creates)
# the experiment against the store that is active when it is called, so calling
# it before set_tracking_uri would bind "cancer_data" to the default store
# instead of the server at 127.0.0.1:5000.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("cancer_data")

# Logistic Regression: one dict both configures the estimator and is handed on
# for logging
logistic_params = dict(
    solver="lbfgs",
    max_iter=10000,
    multi_class="auto",
    random_state=8888,
)
logistic_model = LogisticRegression(**logistic_params)
train_evaluate_log_model(
    logistic_model,
    "Logistic Regression",
    X_train,
    X_test,
    y_train,
    y_test,
    logistic_params,
    preprocessing_steps,
)

# Decision Tree Classifier: seeded so repeated runs build the same tree
dt_params = dict(random_state=8888)
dt_model = DecisionTreeClassifier(**dt_params)
train_evaluate_log_model(
    dt_model,
    "Decision Tree Classifier",
    X_train,
    X_test,
    y_train,
    y_test,
    dt_params,
    preprocessing_steps,
)

# You can add other models like SVM, RandomForest, and XGBoost similarly:
# Example:
# from sklearn.ensemble import RandomForestClassifier
# rf_params = {"n_estimators": 100, "random_st
