In [2]:
import pandas as pd
import numpy as np 
from scipy import stats
import polars as pl
import sklearn.decomposition as decomposition
import sklearn.preprocessing as preprocessing
import os

In [3]:
neuro = pd.read_excel("../Data Integration/IMMUNO_vs_NEURO_df.xlsx", sheet_name="neuro")
whole_blood = pd.read_excel("../Data Integration/IMMUNO_vs_NEURO_df.xlsx", sheet_name="whole blood")
myeloids = pd.read_excel("../Data Integration/IMMUNO_vs_NEURO_df.xlsx", sheet_name="Myeloids")
lymphocytes = pd.read_excel("../Data Integration/IMMUNO_vs_NEURO_df.xlsx", sheet_name="Lymphocytes")
lymph_selectedpop = pd.read_excel("../Data Integration/IMMUNO_vs_NEURO_df.xlsx", sheet_name="Lymph_selectedpop")
behaviour = pd.read_excel("../Data Integration/IMMUNO_vs_NEURO_df.xlsx", sheet_name="partial_behaviour")

sheets = [neuro, whole_blood, myeloids, lymph_selectedpop, lymphocytes, behaviour]

for sheet in sheets:
    sheet["mouse_barcode"] = sheet["Batch"].astype(str) + "_" + sheet["Ms#"].astype(str)

metadata_cols = ["Batch", "Ms#", "Lab", "Strain", "Sex", "Condition", "Date"]

for sheet in sheets:
    existing_metadata_cols = [col for col in metadata_cols if col in sheet.columns]
    sheet[existing_metadata_cols] = sheet[existing_metadata_cols].astype(str)  # Convert existing metadata columns to string type
    sheet.drop_duplicates(subset=["mouse_barcode"] + existing_metadata_cols, inplace=True)  # Remove duplicate metadata rows

dated_sheets = [whole_blood, lymphocytes, myeloids]
dated_sheets = [whole_blood, lymphocytes, myeloids]

for i in range(len(dated_sheets)):
    dated_sheets[i] = dated_sheets[i].pivot_table(index="mouse_barcode", columns="Date").reset_index()
    dated_sheets[i].columns = [str(col[0]) + "_wk" + str(col[1]) if col[0] != "mouse_barcode" else col[0] for col in dated_sheets[i].columns]

whole_blood = dated_sheets[0].reset_index(drop=True)
lymphocytes = dated_sheets[1].reset_index(drop=True)
myeloids = dated_sheets[2].reset_index(drop=True)

metadata_dfs = []
for sheet in sheets:
    existing_metadata_cols = [col for col in metadata_cols if col in sheet.columns]
    metadata_dfs.append(sheet[["mouse_barcode"] + existing_metadata_cols])

metadata_df = pd.concat(metadata_dfs)

# Handle missing values in all metadata columns
for col in metadata_cols:
    if col in metadata_df.columns:
        metadata_df[col] = metadata_df[col].fillna(method="ffill")

metadata_df = metadata_df.drop_duplicates(subset="mouse_barcode").reset_index(drop=True)

merged_df = pd.concat([neuro.set_index("mouse_barcode"),
                       whole_blood.set_index("mouse_barcode"),
                       lymph_selectedpop.set_index("mouse_barcode"),
                       lymphocytes.set_index("mouse_barcode"),
                       behaviour.set_index("mouse_barcode"),
                       myeloids.set_index("mouse_barcode")
                       ],axis=1).reset_index()

merged_df = pd.merge(merged_df[merged_df.columns.difference(metadata_cols)], metadata_df, on="mouse_barcode", how="left")

  dated_sheets[i] = dated_sheets[i].pivot_table(index="mouse_barcode", columns="Date").reset_index()
  dated_sheets[i] = dated_sheets[i].pivot_table(index="mouse_barcode", columns="Date").reset_index()
  dated_sheets[i] = dated_sheets[i].pivot_table(index="mouse_barcode", columns="Date").reset_index()


In [599]:
merged_df.to_csv("../Data Integration/mergeddf_neurowholebloodlymphselectedlymphbehaviourmyeloids.csv")

In [4]:
b_8_10 = merged_df.loc[(merged_df["Batch"] == str(8)) | (merged_df["Batch"] == str(10)), :]

condition_codes = {
    "SS" : 0, 
    "PP" : 1, 
    "PS" : 1, 
    "S" : 0, 
    "P" : 1
}

sex_codes = {
    "F" : 0, 
    "M" : 1
}

b_8_10["Condition"] = [condition_codes[condition] for condition in b_8_10["Condition"]]
b_8_10["Sex"] = [sex_codes[sex] for sex in b_8_10["Sex"]]
b_8_10 = b_8_10.loc[~b_8_10["Average"].isna()].dropna(axis = 1)
b_8_10 = b_8_10.loc[(b_8_10["Condition"] == 0) | (b_8_10["Condition"] == 1)]

KeyError: nan

In [614]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import statsmodels.api as sm

class BorutaFeatureSelection:
    def __init__(self, n_splits=5, n_repeats=10, random_state=42):
        self.n_splits = n_splits
        self.n_repeats = n_repeats
        self.random_state = random_state
        self.feature_ranks = None
        self.top_features = None
        self.scaler_X = StandardScaler()
        self.scaler_y = StandardScaler()

    def fit(self, X, y, k=10):
        # Scale the features and target variable
        X_scaled = self.scaler_X.fit_transform(X)
        y_scaled = self.scaler_y.fit_transform(y.values.reshape(-1, 1)).ravel()
        
        self.feature_ranks = np.zeros(X.shape[1])
        
        for _ in range(self.n_repeats):
            kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
            
            for train_index, _ in kf.split(X_scaled):
                X_train, y_train = X_scaled[train_index], y_scaled[train_index]
                
                rf = RandomForestRegressor(n_jobs=-1)
                selector = BorutaPy(rf)
                selector.fit(X_train, y_train)
                
                self.feature_ranks += selector.ranking_
        
        self.feature_ranks /= (self.n_splits * self.n_repeats)
        top_k_indices = np.argsort(self.feature_ranks)[:k]
        self.top_features = [endo.columns[i] for i in top_k_indices]
        
    def get_top_features(self):
        return self.top_features

# Creating basic model - Boruta feature selection with scaling
endo = b_8_10.loc[:, b_8_10.columns.difference(metadata_cols)].drop(["mouse_barcode", "Average"], axis=1)
endo = endo / endo.max()
exo = b_8_10["Average"]

selector = BorutaFeatureSelection(n_splits=5, n_repeats=1)
selector.fit(endo.values, exo, k=10)
top_features = selector.get_top_features()

# Create a new dataframe with selected features
selected_features = endo[top_features]

In [622]:
import statsmodels.api as sm

# Create a new dataframe with selected features
# selected_features = pd.concat([endo[top_features[:5]],b_8_10["Sex"]], axis = 1)

selected_features = pd.concat([
    endo[["CD4+ CD44+_Freq of Live cells | Week 4", "OF_MaxAcceleration"]], 
    b_8_10["Sex"]
], axis = 1)

# Add a constant term to the selected features
X = sm.add_constant(selected_features)

# Fit the linear regression model using RLM
model = sm.OLS(exo/exo.max(), X)
results = model.fit()

# Print the model summary
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                Average   R-squared:                       0.268
Model:                            OLS   Adj. R-squared:                  0.187
Method:                 Least Squares   F-statistic:                     3.298
Date:                Thu, 02 May 2024   Prob (F-statistic):             0.0354
Time:                        17:24:14   Log-Likelihood:                 10.083
No. Observations:                  31   AIC:                            -12.17
Df Residuals:                      27   BIC:                            -6.430
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

In [7]:
import plotly.express as px
batch_column_map = {
    "6": "CD4+ CD44+_Freq of Live cells | wk8",
    "7": "CD4+ CD44+_Freq of Live cells | wk8",
    "8": "CD4+ CD44+_Freq of Live cells | wk8",
    "10": "CD4+ CD44+_Freq of Live cells | wk8",
    "16": "CD4+ CD44+_Freq of Live cells | wk8"
}

x_axis_map = {
    "6": "Average",
    "7": "Lateral",
    "8": "Average",
    "10": "Average",
    "16": "Average"
}

y_axis = "Rotarod_Time_Becky"

# Extract the required columns from the merged dataframe
plot_data = merged_df.loc[
    (merged_df["Batch"] == str(10)) |
    (merged_df["Batch"] == str(16)) |
    (merged_df["Batch"] == str(6)),
    ["mouse_barcode", "Batch", y_axis]
]

# Map the appropriate column based on the "Batch" value
plot_data["colour"] = plot_data["Batch"].map(batch_column_map)
plot_data["x_axis"] = plot_data["Batch"].map(x_axis_map)
plot_data["colour_value"] = plot_data.apply(lambda row: merged_df.loc[row.name, row["colour"]], axis=1)
plot_data["x_axis_value"] = plot_data.apply(lambda row: merged_df.loc[row.name, row["x_axis"]], axis=1)
plot_data.dropna(subset=["colour_value", "x_axis_value"], inplace=True)
# plot_data = plot_data.drop(index = 69)

# Define the color scale
color_scale = [(0, '#2c5784'), (1, '#cf2086')]

fig = px.scatter(
    data_frame=plot_data,
    x="x_axis_value",
    y=y_axis,
    color="colour_value",
    color_continuous_scale=color_scale,
    range_color=[plot_data["colour_value"].min(), plot_data["colour_value"].max()],
    title=plot_data["x_axis"].iloc[0] + " vs " + y_axis + " by " + plot_data["colour"].iloc[0],
    labels={"y_axis": y_axis,
            "x_axis_value": plot_data["x_axis"].iloc[0],
            "colour_value": plot_data["colour"].iloc[0]},
    trendline="ols"  # Add the regression line
)

# Update the marker size
fig.update_traces(marker=dict(size=20))

# Update the layout
fig.update_layout(
    plot_bgcolor='white',
    width=800,
    height=600,
    title=dict(x=0.5, font=dict(size=20)),
    xaxis=dict(
        title=dict(font=dict(size=16)),
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        title=dict(font=dict(size=16)),
        tickfont=dict(size=14)
    ),
    coloraxis_colorbar=dict(title="CD4+ CD44+_Freq of Live cells | wk8")
)

# Display the plot
fig.show()

fig.write_image(f"../Data Integration/{plot_data['x_axis'].unique()[0]}vs{y_axis}by_cd4cd44_b61016.svg")

In [9]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

reg_data = plot_data[['colour_value', 'x_axis_value', 'Rotarod_Time_Becky']].dropna()

# Add a constant term to the independent variables
# X = sm.add_constant(reg_data[['colour_value', 'Rotarod_Time_Becky']])
scaler = StandardScaler()
reg_data[['colour_value', 'Rotarod_Time_Becky', 'x_axis_value']] = scaler.fit_transform(reg_data[['colour_value', 'Rotarod_Time_Becky', 'x_axis_value']])

X = reg_data[["colour_value"]]
y = reg_data['x_axis_value']

# Perform OLS regression
model = sm.RLM(y, X)
results = model.fit()

# Print the regression summary
# print("OLS Regression Results:")
print(results.summary())

                    Robust linear Model Regression Results                    
Dep. Variable:           x_axis_value   No. Observations:                   30
Model:                            RLM   Df Residuals:                       29
Method:                          IRLS   Df Model:                            0
Norm:                          HuberT                                         
Scale Est.:                       mad                                         
Cov Type:                          H1                                         
Date:                Fri, 31 May 2024                                         
Time:                        08:54:16                                         
No. Iterations:                    15                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
colour_value    -0.3538      0.157     -2.249   