# Training a Logistic Regression Model

## Imports

In [1]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_score,
    accuracy_score,
)
from sklearn.model_selection import train_test_split
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

from joblib import dump, load

## Find the git root Directory

In [2]:
# Get the current working directory
cwd = Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

## Import Utils

In [3]:
sys.path.append(f"{root_dir}/utils")
import preprocess_utils as ppu

# Seed and Generator for Reproducibility

In [4]:
rnd_val = 0  # Random value for all seeds
rng = np.random.default_rng(seed=rnd_val)  # random number generator

# Converting parquet to pandas dataframe

## Feature selected plate data

In [5]:
filename3 = "Plate_3_sc_norm_fs.parquet"
filename3p = "Plate_3_prime_sc_norm_fs.parquet"
plate_path = Path(
    f"{root_dir}/nf1_painting_repo/3.processing_features/data/feature_selected_data"
)

path3 = plate_path / filename3

path3p = plate_path / filename3p

# Creates an object for accessing the plate 3 normalized data using the path of the data
po3 = ppu.Preprocess_data(path=path3)

# Creates an object for accessing the plate 3 prime normalized data using the path of the data
po3p = ppu.Preprocess_data(path=path3p)

plate3df = po3.df  # Returns the dataframe generated by the csv
plate3pdf = po3p.df  # Returns the dataframe generated by the csv

common_columns = list(plate3df.columns.intersection(plate3pdf.columns))

## Annotated plate data

In [6]:
filename3 = "Plate_3_sc.parquet"
filename3p = "Plate_3_prime_sc.parquet"
plate_path = Path(
    f"{root_dir}/nf1_painting_repo/3.processing_features/data/annotated_data"
)

path3 = plate_path / filename3

path3p = plate_path / filename3p

# Creates an object for accessing the plate 3 normalized data using the path of the data
po3 = ppu.Preprocess_data(path=path3)

# Creates an object for accessing the plate 3 prime normalized data using the path of the data
po3p = ppu.Preprocess_data(path=path3p)

plate3df = po3.df  # Returns the dataframe generated by the csv
plate3pdf = po3p.df  # Returns the dataframe generated by the csv

# Preprocess Data

## Use only common columns

In [7]:
# Set plate column:
plate3df["Metadata_plate"] = "3"
plate3pdf["Metadata_plate"] = "3p"

plate3df = plate3df.loc[:, common_columns]
plate3pdf = plate3pdf.loc[:, common_columns]

# Combine the plate dataframes:
platedf = pd.concat([plate3df, plate3pdf], axis="rows")

## Normalize Data

In [8]:
# Get all columns that aren't metadata
columns_to_normalize = [col for col in platedf.columns if "Metadata" not in col]

# Normalize the columns
mms = MinMaxScaler()
normdf = pd.DataFrame(
    mms.fit_transform(platedf[columns_to_normalize]), columns=columns_to_normalize
)

# Apply the transformation to the dataframe
platedf[columns_to_normalize] = normdf

## Create Classes

In [9]:
target_column = "Metadata_genotype"
stratify_column = "Metadata_Well"

# These represent the fractions of the entire dataset
train_val_frac = 0.85
test_frac = 1 - train_val_frac
val_frac = 0.15

## Down-sample and stratify by well

In [10]:
smallest_gene = platedf[target_column].value_counts().min()
platedata = pd.DataFrame()

for gene in platedf[target_column].unique():
    df = platedf.loc[platedf["Metadata_genotype"] == gene]
    df_frac = smallest_gene / len(df)
    stratwell = df.groupby(stratify_column, group_keys=False).apply(
        lambda x: x.sample(frac=df_frac, random_state=rnd_val)
    )
    platedata = pd.concat([platedata, stratwell], axis="rows")

## Stratified Train-test split

In [11]:
traindf, testdf = train_test_split(
    platedata,
    train_size=train_val_frac,
    stratify=platedata[[target_column, stratify_column]],
    shuffle=True,
    random_state=rnd_val,
)

## Encode Labels

In [12]:
le = LabelEncoder()
testdf["label"] = le.fit_transform(testdf[target_column].values)
traindf["label"] = le.transform(traindf[target_column].values)

  testdf["label"] = le.fit_transform(testdf[target_column].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdf["label"] = le.fit_transform(testdf[target_column].values)
  traindf["label"] = le.transform(traindf[target_column].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  traindf["label"] = le.transform(traindf[target_column].values)


## Remove unecessary columns

In [13]:
traindf = po3.remove_meta(df=traindf)
testdf = po3.remove_meta(df=testdf)

# Model Training

In [14]:
lr = LogisticRegression(
    max_iter=1000, solver="sag", multi_class="ovr", random_state=rnd_val, n_jobs=-1
)
lr.fit(X=traindf.drop("label", axis="columns"), y=traindf["label"])

LogisticRegression(max_iter=1000, multi_class='ovr', n_jobs=-1, random_state=0,
                   solver='sag')

## Save Model

In [15]:
data_path = Path("data")

data_path.mkdir(
    parents=True, exist_ok=True
)  # Create the parent directories if they don't exist

dump(lr, data_path / "lr_model.joblib")

['data/lr_model.joblib']

## Save Data

In [16]:
dump(testdf, data_path / "testdf.joblib")
dump(le, data_path / "label_encoder.joblib")

['data/label_encoder.joblib']