# Train a Logistic Regression Model

## Imports

In [None]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_score,
    accuracy_score,
)
from sklearn.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import train_test_split
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from joblib import dump, load

sys.path.append("../utils")
import analysis_utils as au
import preprocess_utils as ppu
import eval_utils as eu
import train_utils as tu

# Seed and Generator for Reproducibility

In [None]:
# One seed value reused by every randomised step in this notebook.
rnd_val = 0
# NumPy random generator initialised from that same seed.
rng = np.random.default_rng(rnd_val)

# Loading parquet files into pandas dataframes

In [None]:
# Feature-selected profiles for plate 3 and plate 3 prime (parquet files).
filename3 = "Plate_3_sc_norm_fs.parquet"
filename3p = "Plate_3_prime_sc_norm_fs.parquet"

# Both files sit in the same directory, so build that directory path once.
data_dir = Path(
    "../nf1_painting_repo/3.processing_features/data/feature_selected_data"
)
path3 = data_dir / filename3
path3p = data_dir / filename3p

# Project helper objects built from each path.
# NOTE(review): presumably Preprocess_data reads the file at `path` - confirm
# in preprocess_utils.
po3, po3p = ppu.Preprocess_data(path=path3), ppu.Preprocess_data(path=path3p)

plate3df = po3.df  # dataframe held by the plate 3 helper
plate3pdf = po3p.df  # dataframe held by the plate 3 prime helper

# Preprocess Data

## Use only common columns

In [None]:
# Tag every row with the plate it came from before the frames are merged.
for frame, plate_name in ((plate3df, "3"), (plate3pdf, "3p")):
    frame["Metadata_plate"] = plate_name

# Keep only the columns present in both plates so the frames line up.
common_columns = plate3df.columns.intersection(plate3pdf.columns).tolist()
plate3df = plate3df.loc[:, common_columns]
plate3pdf = plate3pdf.loc[:, common_columns]

# Stack the two plates row-wise into a single dataframe.
platedf = pd.concat([plate3df, plate3pdf], axis="index")

## Create Classes

In [None]:
# Column with the class to predict and the column used for stratification.
target_column = "Metadata_genotype"
stratify_column = "Metadata_Well"

# Split sizes, each expressed as a fraction of the entire dataset.
train_val_frac = 0.85  # training + validation share
val_frac = 0.15  # validation share
test_frac = 1 - train_val_frac  # remainder, held out for testing

## Down-sample and stratify by well

In [None]:
# Size of the rarest genotype; every genotype is down-sampled towards this count.
smallest_gene = platedf[target_column].value_counts().min()

# Collect one down-sampled frame per genotype and concatenate once at the end,
# instead of repeatedly growing a dataframe that starts out empty.
sampled_genotypes = []

for gene in platedf[target_column].unique():
    # Filter on target_column (not a hard-coded name) so the rows always come
    # from the same column the class counts above were computed from.
    df = platedf.loc[platedf[target_column] == gene]
    # Fraction of this genotype's rows to keep to approach smallest_gene rows.
    df_frac = smallest_gene / len(df)
    # Sample the same fraction inside every well so wells stay proportionally
    # represented; the fixed seed keeps the draw reproducible.
    stratwell = df.groupby(stratify_column, group_keys=False).apply(
        lambda x: x.sample(frac=df_frac, random_state=rnd_val)
    )
    sampled_genotypes.append(stratwell)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
platedata = (
    pd.concat(sampled_genotypes, axis="rows") if sampled_genotypes else pd.DataFrame()
)

## Stratified Train-test split

In [None]:
# Stratify on genotype and well together so both splits keep the same mix.
strata = platedata[[target_column, stratify_column]]

# Keep train_val_frac of the rows for training; the rest becomes the test set.
traindf, testdf = train_test_split(
    platedata,
    train_size=train_val_frac,
    random_state=rnd_val,
    shuffle=True,
    stratify=strata,
)

## Encode Labels

In [None]:
# Fit the encoder on the training labels only, then reuse that mapping for the
# test labels, so the held-out split never determines the class encoding.
le = LabelEncoder()
traindf["label"] = le.fit_transform(traindf[target_column].values)
testdf["label"] = le.transform(testdf[target_column].values)

## Remove unnecessary columns

In [None]:
# Run both splits through the project helper's remove_meta.
# NOTE(review): presumably this drops the metadata columns - confirm in
# preprocess_utils. The "label" column is still read from traindf when the
# model is fitted, so it is expected to survive this step.
traindf, testdf = po3.remove_meta(df=traindf), po3.remove_meta(df=testdf)

# Model Training

In [1]:
# Multinomial logistic regression using the SAG solver, seeded for reproducibility.
# NOTE(review): newer scikit-learn releases deprecate `multi_class`; confirm
# against the version this project pins.
lr = LogisticRegression(
    solver="sag",
    multi_class="multinomial",
    max_iter=1000,
    n_jobs=-1,
    random_state=rnd_val,
)

# Every column except the encoded label is used as a feature.
train_features = traindf.drop(columns="label")
train_labels = traindf["label"]
lr.fit(X=train_features, y=train_labels)




LogisticRegression(max_iter=1000, multi_class='multinomial', n_jobs=-1,
                   random_state=0, solver='sag')

## Save Model

In [2]:
# Directory that receives the fitted model; created if it does not exist yet.
data_path = Path("trained_models_feature_selection/plate3_cp_fs_data")
data_path.mkdir(exist_ok=True, parents=True)

# Serialise the fitted classifier with joblib.
model_file = data_path / "lr_model.joblib"
dump(lr, model_file)




['trained_models_feature_selection/plate3_cp_fs_data/lr_model.joblib']

## Save Data

In [3]:
# Persist the held-out test split and the fitted label encoder under data_path.
testdf_file = data_path / "testdf.joblib"
encoder_file = data_path / "label_encoder.joblib"
dump(testdf, testdf_file)
dump(le, encoder_file)




['trained_models_feature_selection/plate3_cp_fs_data/label_encoder.joblib']