# MLFLOW setup in Jupyter Notebook

This notebook contains a small Data Science Project where MLFlow is used to log all Feature Engineering and Modeling Parameters as well as Metrics.



## Loading the Lib

In [5]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")
import pandas as pd
import numpy as np
import warnings
import mlflow
import zipfile
from modeling.config import EXPERIMENT_NAME
from collections import defaultdict, Counter
from Bio import SeqIO
warnings.filterwarnings('ignore')


# Loading the data

In [None]:
# Train_texonomy Data Loading

train_taxonomy = pd.read_csv(
    "D:/data_science_dir/Capstone_project/protein_function_prediction/cafa-6-protein-function-prediction/train_taxonomy.tsv",
    sep="\t",
    header=None,
    names=["protein_id", "tax_id"]
)

train_taxonomy


Unnamed: 0,protein_id,tax_id
0,A0A0C5B5G6,9606
1,A0JNW5,9606
2,A0JP26,9606
3,A0PK11,9606
4,A1A4S6,9606
...,...,...
82399,Q9UTM1,284812
82400,Q9Y7I1,284812
82401,Q9Y7P7,284812
82402,Q9Y7Q3,284812


In [16]:
# train_terms Data Loading
train_terms = pd.read_csv(
    "D:/data_science_dir/Capstone_project/protein_function_prediction/cafa-6-protein-function-prediction/train_terms.tsv",
    sep="\t",
    header=0
)

train_terms

Unnamed: 0,EntryID,term,aspect
0,Q5W0B1,GO:0000785,C
1,Q5W0B1,GO:0004842,F
2,Q5W0B1,GO:0051865,P
3,Q5W0B1,GO:0006275,P
4,Q5W0B1,GO:0006513,P
...,...,...,...
537022,Q06667,GO:0070481,P
537023,B1NF19,GO:0033075,P
537024,B1NF19,GO:0047052,F
537025,B1NF19,GO:0047056,F


In [11]:
from Bio import SeqIO
import pandas as pd

# FASTA file path (use ONE path only)
fasta_path = "D:/data_science_dir/Capstone_project/protein_function_prediction/cafa-6-protein-function-prediction/train_sequences.fasta"

# Containers
ids = []
sequences = []
data = []

# Read FASTA once
for record in SeqIO.parse(fasta_path, "fasta"):
    protein_id = record.id
    sequence = str(record.seq)

    ids.append(protein_id)
    sequences.append(sequence)

    data.append({
        "protein_id": protein_id,
        "sequence": sequence,
        "length": len(sequence)
    })

print(f"{len(sequences)} sequences loaded")

# Convert to DataFrame
df_sequences = pd.DataFrame(data)
df_sequences.head(50)


82404 sequences loaded


Unnamed: 0,protein_id,sequence,length
0,sp|A0A0C5B5G6|MOTSC_HUMAN,MRWQEMGYIFYPRKLR,16
1,sp|A0JNW5|BLT3B_HUMAN,MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...,1464
2,sp|A0JP26|POTB3_HUMAN,MVAEVCSMPAASAVKKPFDLRSKMGKWCHHRFPCCRGSGKSNMGTS...,581
3,sp|A0PK11|CLRN2_HUMAN,MPGWFKKAWYGLASLLSFSSFILIIVALVVPHWLSGKILCQTGVDL...,232
4,sp|A1A4S6|RHG10_HUMAN,MGLQPLEFSDCYLDSPWFRERIRAHEAELERTNKFIKELIKDGKNL...,786
5,sp|A1A519|F170A_HUMAN,MKRRQKRKHLENEESQETAEKGGGMSKSQEDALQPGSTRVAKGWSQ...,330
6,sp|A1L190|SYCE3_HUMAN,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,88
7,sp|A1L3X0|ELOV7_HUMAN,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,281
8,sp|A1X283|SPD2B_HUMAN,MPPRRSIVEVKVLDVQKRRVPNKHYVYIIRVTWSSGSTEAIYRRYS...,911
9,sp|A2A2Y4|FRMD3_HUMAN,MFASCHCVPRGRRTMKMIHFRSSSVKSLSQEMRCTIRLLDDSEISC...,597
