# Data science in Microsoft Fabric

In [1]:
# Connection details for the "diabetes" open dataset hosted in Azure Blob Storage
blob_account_name = "azureopendatastorage"
blob_container_name = "mlsamples"
blob_relative_path = "diabetes"
blob_sas_token = r""  # Left blank because the container allows anonymous access

# Host part shared by the dataset URL and the Spark SAS configuration key
blob_endpoint = f"{blob_container_name}.{blob_account_name}.blob.core.windows.net"

# Point Spark at the blob container (the SAS token is registered under a per-container key)
wasbs_path = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}"
spark.conf.set(f"fs.azure.sas.{blob_endpoint}", blob_sas_token)
print(f"Remote blob path: {wasbs_path}")

# Define the parquet read; Spark evaluates lazily, so no data is loaded at this point
df = spark.read.parquet(wasbs_path)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 3, Finished, Available)

Remote blob path: wasbs://mlsamples@azureopendatastorage.blob.core.windows.net/diabetes


In [2]:
# Show the loaded DataFrame using the notebook's built-in display widget
display(df)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, 08b4cb9e-3f23-405c-8f87-84f6de481de3)

In [4]:
# Code generated by Data Wrangler for pandas DataFrame (cut-off later made a parameter)

def clean_data(df, threshold=211.5):
    """Return a copy of ``df`` with a binary ``Risk`` column derived from ``Y``.

    Args:
        df: pandas DataFrame that contains a numeric ``Y`` column.
        threshold: Cut-off applied to ``Y``. Rows whose ``Y`` is strictly
            greater than this value get ``Risk`` = 1, all other rows get 0.
            Defaults to 211.5, the value used by the generated formula.

    Returns:
        A new DataFrame with the ``Risk`` column added (or replaced). The
        frame passed in is not modified.

    Raises:
        KeyError: If ``df`` has no ``Y`` column.
    """
    # assign() builds a new frame, so the caller's data is never mutated in place
    return df.assign(Risk=(df['Y'] > threshold).astype(int))

# clean_data works on a pandas DataFrame. On a top-to-bottom run this cell is
# reached while `df` is still the Spark DataFrame, which has no .copy(), so
# collect it to pandas in that case; otherwise pass a copy so `df` stays unmodified.
source_df = df.toPandas() if hasattr(df, "toPandas") else df.copy()
df_clean = clean_data(source_df)
df_clean.head()

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 32, Finished, Available)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y,Risk
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151,0
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75,0
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141,0
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206,0
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135,0


In [3]:
# Collect the Spark DataFrame into a pandas DataFrame.
# The guard keeps this cell re-runnable: once `df` is already a pandas DataFrame
# it has no toPandas() method, and calling it again would raise AttributeError.
if hasattr(df, "toPandas"):
    df = df.toPandas()
df.head()

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 5, Finished, Available)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [5]:
from sklearn.model_selection import train_test_split

# Regression setup: the ten feature columns are the inputs, the `Y` column is the target
feature_columns = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
X = df_clean[feature_columns].values
y = df_clean['Y'].values

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 33, Finished, Available)

In [6]:
import mlflow

# Select the MLflow experiment that the regression run will be recorded under
# (the captured log output shows MLflow creating it when it does not exist yet)
experiment_name = "diabetes-regression"
mlflow.set_experiment(experiment_name=experiment_name)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 34, Finished, Available)

2024/01/09 18:42:49 INFO mlflow.tracking.fluent: Experiment with name 'diabetes-regression' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1704825773066, experiment_id='7e4348dd-c6ce-459d-9cdf-c95efb471112', last_update_time=None, lifecycle_stage='active', name='diabetes-regression', tags={}>

In [7]:
from sklearn.linear_model import LinearRegression

# Fit the regression model inside an MLflow run with autologging switched on
with mlflow.start_run():
    mlflow.autolog()

    # fit() returns the fitted estimator, so `model` is the trained LinearRegression
    model = LinearRegression().fit(X_train, y_train)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 35, Finished, Available)

2024/01/09 18:43:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [8]:
from sklearn.model_selection import train_test_split

# Classification setup: same ten feature columns, but the target is the binary `Risk` column
feature_columns = ['AGE', 'SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6']
X = df_clean[feature_columns].values
y = df_clean['Risk'].values

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 36, Finished, Available)

In [9]:
import mlflow

# Switch to a separate MLflow experiment for the classification run
# (the captured log output shows MLflow creating it when it does not exist yet)
experiment_name = "diabetes-classification"
mlflow.set_experiment(experiment_name=experiment_name)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 37, Finished, Available)

2024/01/09 18:44:21 INFO mlflow.tracking.fluent: Experiment with name 'diabetes-classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1704825861988, experiment_id='2d17e6d2-e194-4cd3-8acb-44c53e292a74', last_update_time=None, lifecycle_stage='active', name='diabetes-classification', tags={}>

In [10]:
from sklearn.linear_model import LogisticRegression

# Fit the classifier inside an MLflow run with scikit-learn autologging switched on
with mlflow.start_run():
    mlflow.sklearn.autolog()

    # C is scikit-learn's inverse regularization strength; 1/0.1 evaluates to 10.0
    model = LogisticRegression(C=1/0.1, solver="liblinear")
    model.fit(X_train, y_train)

StatementMeta(, a7904bab-db26-4858-a951-a055d8cb028d, 38, Finished, Available)

