## Imports

In [2]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, load_metric
import datetime
import os
import time
import numpy as np
import evaluate
import accelerate
import torch
import re
import plotly.graph_objects as go

## Load Data

In [2]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Logisitic Regression


In [79]:
"""
Load Narrative Data
"""
os.chdir('../../../../savsnet_resources/pickles')

In [80]:
df_narratives = pd.read_pickle('narrative_pickle.pkl.gz', compression='gzip').drop_duplicates(subset='savsnet_consult_id', keep='first')
df_extras = pd.read_pickle('extras_df.pkl.gz', compression='gzip').drop_duplicates(subset='savsnet_consult_id')[['savsnet_consult_id', 'savsnet_animal_id','species','breed', 'age_at_consult', 'gender', 'neutered', 'mpc', 'savsnet_consult_id_count']]
#Filter for just dog records
df_extras = df_extras[df_extras.species == 'dog']

In [81]:
#Join data
df_narratives_and_extras = df_narratives.set_index('savsnet_consult_id').join(df_extras.set_index('savsnet_consult_id'), how='inner')

df_narratives_and_extras.reset_index()

Unnamed: 0,savsnet_consult_id,item_text,consult_record,pk,consult_record_date,savsnet_animal_id,species,breed,age_at_consult,gender,neutered,mpc,savsnet_consult_id_count
0,71631,"""O worried that she has been limping on RH for...",230515,2040019,2014-06-10 14:10:05+00:00,49526,dog,Crossbreed,10.35,female,no,trauma,1
1,71644,"""booster and ears. v reactive for exam, snarli...",230516,2040021,2014-06-10 14:14:03+00:00,49533,dog,Heeler (generic),6.31,male,no,vaccination,1
2,71660,"""o was stroking chest last night and felt bump...",230517,2040023,2014-06-10 14:18:51+00:00,49541,dog,Jack Russell Terrier,3.45,male,no,other_healthy,1
3,71675,"""1st vacc. bar, biop two weeks, was puppy from...",230518,2040024,2014-06-10 14:25:12+00:00,49550,dog,Crossbreed,0.17,male,no,vaccination,2
4,88118,"""2nd vacc. bar, doing well at home, nothing ab...",230519,2040025,2014-06-24 14:05:25+00:00,49550,dog,Crossbreed,0.21,male,no,vaccination,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6081341,11517687,CONSULT: HPC 6 month check up. Any concerns fr...,10142293,16553510,2024-01-30 12:01:22+00:00,3064938,dog,Crossbreed,1.80,male,yes,other_healthy,2
6081342,11516812,Euthanasia. Booked in for PTS. As soon as went...,10142295,16553511,2024-01-30 10:00:29+00:00,3170851,dog,Crossbreed,10.33,male,no,tumour,3
6081343,11516407,NO MURMUR HEARD TODAY.,10142292,16553513,2024-01-30 09:16:27+00:00,2997060,dog,Dachshund,0.97,female,no,other_healthy,2
6081344,11512021,Following todays examination: Milbemycin Oxime...,10135039,16553515,2024-01-27 15:25:42+00:00,1590416,dog,Spaniel (Cocker),8.71,male,no,post_op,2


In [82]:
# Navigate back to home directory
os.chdir('../../DogBERT/Classifiers/Pseudomonas_Otitis/Logisitic Regression')

In [8]:
# Create data frame of just narrative free text
df_text = df_narratives_and_extras[['item_text']]

In [9]:
print(len(df_text))

6081346


## Search Dataset using Regex

In [10]:
# Load regex to filter free text
pseudomonas_regex = '\b[ps]{1,2}[eu]{1,2}domon.+\b(?:ear|ot[io]t)|\b(?:ear|ot[io]t).+\b[ps]{1,2}[eu]{1,2}domon'
pseudomonas_regex_2 = 'p?s[eu]{1,2}domon.*\W(ear|otitis)|(ear|otitis).*\Wp?s[eu]{1,2}domon'

In [11]:
# Apply regex searches
# df_narratives_and_extras['psoe_regex_found'] = df_narratives_and_extras['item_text'].apply(lambda x: bool(re.search(pseudomonas_regex, x)))
df_narratives_and_extras['psoe_regex2_found'] = df_narratives_and_extras['item_text'].apply(lambda x: bool(re.search(pseudomonas_regex_2, x)))

In [14]:
# Filter to find records containing pseudomonas signals. Use pseudomonas_regex_2
df_text_filt = df_narratives_and_extras[df_narratives_and_extras['psoe_regex2_found']==True]

In [15]:
print(f'Number of Pseudomonas Records: {len(df_text_filt)}')

Number of Pseudomonas Records: 4568


In [16]:
print(max(df_narratives_and_extras['consult_record_date']))

2024-01-30 15:25:54+00:00


## Join in DataLab Labels

In [17]:
# Get just the text of the regex filtered records
df_text_re2 = df_text_filt[['item_text']]

In [18]:
# Navigate to labeled records, load and return to home
os.chdir('..')
df_psoe = pd.read_excel('adamwilliams-OtitisStudyPseudomonas (1).xls', sheet_name='Case Data', index_col=False)
os.chdir('Logisitic Regression')

In [19]:
print(df_psoe['PseudomonasOtitis'].value_counts())

PseudomonasOtitis
✓    699
?    156
!     78
⍉     52
Name: count, dtype: int64


In [20]:
# Join in tick labels
df_dataset = pd.merge(df_text_re2, df_psoe, on='savsnet_consult_id', how='left')[['savsnet_consult_id', 'item_text', 'PseudomonasOtitis']]

In [21]:
print(df_dataset['PseudomonasOtitis'].value_counts())

PseudomonasOtitis
✓    566
?    127
!     71
⍉     30
Name: count, dtype: int64


## Load DogBERT Multi-Class Unweighted Classifier

In [24]:
# Get non-binary classifier directory
model_dir = "/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Multi_Class Classifier/Weighted Loss/DogBERT_PSOE_Multi_Class_Classifier_Weighted"

In [25]:
# Create label2id, id2label, tokenizer and model objects
label2id = {'?': 0, '✓': 1, '!': 2, '⍉': 3}
id2label = {0: '?', 1: '✓', 2: '!', 3:'⍉'}
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=4, id2label=id2label, label2id=label2id)

In [26]:
def predict_sentiment(text):
  """
  Function to predidict class of a given piece of text
  """
  inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
  outputs = model(**inputs)
  predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
  predicted_class = torch.argmax(predictions).item()
  confidence_score = predictions.squeeze()[predicted_class].item()
  return predicted_class, confidence_score

In [27]:
# Classify records using multi-class classifier
df_dataset["DogBERT_unweighted_predicted_pseudomonas_otitis_mc"], df_dataset["DogBERT_unweighted_mc_confidence_score"] = zip(*df_dataset["item_text"].apply(predict_sentiment))

In [28]:
df_dataset = df_dataset.rename(columns={'DogBERT_unweighted_predicted_pseudomonas_otitis_mc':'DogBERT_weighted_predicted_pseudomonas_otitis_mc', 'DogBERT_unweighted_mc_confidence_score':'DogBERT_weighted_mc_confidence_score'})

In [30]:
from numba import cuda
import gc
torch.cuda.empty_cache()
gc.collect()

683

In [31]:
print(df_dataset.head())

   savsnet_consult_id                                          item_text  \
0               95721  "<<identifier>> otitis externa been really goo...   
1              129032  "HL issues and ears. been slowing up last few ...   
2              169684  "bilateral oe thickened ear canals cleaning is...   
3              117477  "left eear purulent otitis suspecrt pseudomona...   
4              115552  "Bilat OE again. Not purulent or ulcerated, so...   

  PseudomonasOtitis  DogBERT_weighted_predicted_pseudomonas_otitis_mc  \
0                 ?                                                 1   
1                 ✓                                                 1   
2                 ✓                                                 1   
3                 ✓                                                 1   
4                 !                                                 2   

   DogBERT_weighted_mc_confidence_score  
0                              0.960905  
1                   

In [32]:
df_dataset.to_csv('multi_class_classified.csv')

In [3]:
df_dataset = pd.read_csv('multi_class_classified.csv', index_col=False)

## Examine Repeat Consults

In [41]:
# Get savsnet consult ids for pseudomonas positive records classified using the binary classifier
pseudomaonas_otitis_positive_scids = df_dataset['savsnet_consult_id'][df_dataset['DogBERT_weighted_predicted_pseudomonas_otitis_mc'] == 1].to_list()

In [42]:
# get info about records of dogs with pseudomonas otitis
psuedomonas_dogs = df_extras[df_extras['savsnet_consult_id'].isin(pseudomaonas_otitis_positive_scids)]

In [43]:
# examine how many unique dogs are treated in all pseudomonas records 
print(len(psuedomonas_dogs))
print(len(psuedomonas_dogs['savsnet_animal_id'].unique()))

3070
2584


In [44]:
# Remove the option if a missing breed
psuedomonas_dogs = psuedomonas_dogs[psuedomonas_dogs.breed != 'missing']
# psuedomonas_dogs_unique = psuedomonas_dogs_unique[psuedomonas_dogs_unique.breed != 'missing']

In [45]:
# examine how many unique dogs are treated in all pseudomonas records 
print(len(psuedomonas_dogs))
print(len(psuedomonas_dogs['savsnet_animal_id'].unique()))

3028
2549


In [50]:
# Find duplicated animal_ids
duplicate_ids = psuedomonas_dogs[psuedomonas_dogs.duplicated(subset=['savsnet_animal_id'], keep=False)]

In [51]:
print(len(psuedomonas_dogs)-len(psuedomonas_dogs['savsnet_animal_id'].unique()))
print(len(duplicate_ids))

479
818


In [53]:
print(len(duplicate_ids['savsnet_animal_id'].unique()))

339


In [58]:
print(duplicate_ids['savsnet_animal_id'].value_counts())

savsnet_animal_id
1582304    8
959727     7
2501102    7
196342     7
1575303    6
          ..
949109     2
400474     2
596237     2
1103626    2
2927348    2
Name: count, Length: 339, dtype: int64


In [62]:
print(len(psuedomonas_dogs[psuedomonas_dogs['savsnet_animal_id']==1575303]))

6


In [65]:
# Randomly select one row per animal_id
sampled_df = duplicate_ids.groupby('savsnet_animal_id').sample(n=1, random_state=42)

In [66]:
print(len(sampled_df))

339


In [67]:
print(sampled_df.head())

         savsnet_consult_id  savsnet_animal_id species  \
106718               116647               2012     dog   
353562               382442               6814     dog   
2444326             8025639               9265     dog   
4489367             4805200              10096     dog   
3746016             4024413              11275     dog   

                               breed  age_at_consult  gender neutered  \
106718   West Highland White Terrier           11.73    male      yes   
353562                 Gordon Setter            6.57  female      yes   
2444326                     Cavachon            9.56    male       no   
4489367                      Bulldog            5.07    male       no   
3746016             Spaniel (Cocker)           12.65  female       no   

                   mpc  savsnet_consult_id_count  
106718    other_unwell                        11  
353562   other_healthy                        31  
2444326   other_unwell                         8  
4489367 

In [68]:
# Remove all duplicate animal_ids from psuedomonas_dogs then combine with sampled_df
psuedomonas_dogs_deduped = psuedomonas_dogs.drop_duplicates(subset='savsnet_animal_id', keep=False)

In [69]:
psuedomonas_dogs_unique = pd.concat([psuedomonas_dogs_deduped, sampled_df])

In [71]:
print(psuedomonas_dogs_unique.head())

       savsnet_consult_id  savsnet_animal_id species                 breed  \
3208                10766               8479     dog                   Pug   
15334               23145              17954     dog  Retriever (Labrador)   
17370               25253              19487     dog                 Cross   
20070               28015              21352     dog      Spaniel (Cocker)   
20954               28919              21993     dog  Retriever (Labrador)   

       age_at_consult  gender neutered           mpc  savsnet_consult_id_count  
3208            11.84  female      yes  other_unwell                         1  
15334           10.21  female       no      pruritus                        12  
17370           11.00  female       no      pruritus                         1  
20070           14.31    male      yes  other_unwell                         1  
20954           10.90  female      yes  other_unwell                         4  


In [72]:
# Get list of number of dogs with pseudomonas otitis recorded by breed
psuedomonas_breeds_unique = psuedomonas_dogs_unique['breed'].value_counts()
# Get list of number of records with pseudomonas otitis recorded by breed
pseudomonas_breeds = psuedomonas_dogs['breed'].value_counts()

In [73]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(psuedomonas_breeds_unique)

breed
Spaniel (Cocker)                       495
Crossbreed                             291
Retriever (Labrador)                   279
West Highland White Terrier            140
Spaniel (Springer)                     110
Cockapoo                                84
German Shepherd Dog (Alsatian)          69
Labradoodle                             55
Cavalier King Charles Spaniel           55
Basset Hound                            55
Retriever (Golden)                      54
Shih Tzu                                54
Newfoundland                            37
Staffordshire Bull Terrier              37
Border Terrier                          36
Bulldog                                 34
Pug                                     33
French Bulldog                          30
Jack Russell Terrier                    27
Bichon Frise                            27
Tibetan Terrier                         26
Italian Spinone                         25
Cross                                   24
Beagl

In [74]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(pseudomonas_breeds)

breed
Spaniel (Cocker)                       650
Retriever (Labrador)                   332
Crossbreed                             331
West Highland White Terrier            164
Spaniel (Springer)                     140
Cockapoo                               109
German Shepherd Dog (Alsatian)          75
Labradoodle                             65
Basset Hound                            65
Retriever (Golden)                      63
Cavalier King Charles Spaniel           60
Shih Tzu                                59
Newfoundland                            45
Border Terrier                          42
Staffordshire Bull Terrier              38
Bulldog                                 35
Pug                                     34
Bichon Frise                            33
Poodle (generic)                        31
French Bulldog                          30
Tibetan Terrier                         30
Italian Spinone                         29
Spaniel (American Cocker)               28
Jack 

In [75]:
# Compare number of breeds in each list (should match)
print(len(pseudomonas_breeds))
print(len(psuedomonas_breeds_unique))

111
111


In [76]:
# Create a dataframe of all pseudomonas positive naratives
psuedomonas_narratives = df_narratives[df_narratives['savsnet_consult_id'].isin(pseudomaonas_otitis_positive_scids)]

## Create Control Set From All Records Not In Pseudomonas Set

In [83]:
# Remove consults with missing breed from full dataset
df_narratives_and_extras_true_breed = df_narratives_and_extras[df_narratives_and_extras['breed'] != 'missing']
print(len(df_narratives_and_extras))
print(len(df_narratives_and_extras_true_breed))

6081346
5993268


In [84]:
# get list of all consult ids and all the first consult ids for each dog 
scids = df_narratives_and_extras_true_breed.index.to_list()
psuedomonas_dogs_unique_scids = psuedomonas_dogs_unique['savsnet_consult_id'].to_list()

In [85]:
# Remove any ids that are pseudomonas positive
def remove_numbers(list1, list2):
    return [num for num in list1 if num not in list2]

scids_control = remove_numbers(scids, psuedomonas_dogs_unique_scids)

In [86]:
# This is all control savsnet_consult_ids
control_df_all = df_narratives_and_extras_true_breed[df_narratives_and_extras_true_breed.index.isin(scids_control)]

In [87]:
print(len(control_df_all))

5990719


In [88]:
# Inspect breeds in one list but not the other
control_breeds = set(control_df_all['breed'].to_list())
test_breeds = set(psuedomonas_dogs_unique['breed'].to_list())

In [89]:
print(f'Breeds in control set but not test: {control_breeds - test_breeds}')
print(f'Breeds in test but not control: {test_breeds - control_breeds}')

Breeds in control set but not test: {'Norwich Terrier', 'Japanese Chin', 'Bloodhound', 'English Toy Terrier (Black & Tan)', 'Neapolitan Mastiff', 'Anatolian Shepherd Dog', 'Dachshund (Miniature Wire-Haired)', 'Catalan Sheepdog (Imp)', 'Dachshund (Long-Haired)', 'Transylvanian Hound', 'Portuguese Podengo', 'Pyrenean Sheepdog (Long Haired)', 'Fox Terrier (Smooth)', 'Jackapoo', 'Bull Terrier (Miniature)', 'Havanese', 'Kerry Blue Terrier', 'Spaniel (Sussex)', 'Keeshond', 'Samoyed', 'Basset Fauve De Bretagne', 'Afghan Hound', 'Basset Griffon Vendeen (Petit)', 'Portuguese Pointer', 'Pugalier', 'New Zealand Huntaway', 'Spanish Water Dog', 'Italian Greyhound', 'Berger Picard', 'Japanese Spitz', 'Russian Black Terrier', 'Briard', 'Bolognese', 'Pit Bull Terrier', 'Welsh Corgi (Cardigan)', 'Australian Cattle Dog', 'Cane Corso', 'Hound (Generic)', 'Sealyham Terrier', 'Whippet', 'Bergamasco (Imp)', 'Belgian Shepherd Dog (Tervueren)', 'Irish Wolfhound', 'Maremma Sheepdog', 'Trailhound', 'Lowchen (Li

In [90]:
def match_breeds(test_df, control_df):
    """
    Function to take the breed column of two dataframes and change the entry of any non-matching breeds to 'other'
    """
    unique_breeds_df1 = set(test_df['breed'])
    unique_breeds_df2 = set(control_df['breed'])
    
    # Identify breeds exclusive to one DataFrame
    breeds_only_in_df1 = unique_breeds_df1 - unique_breeds_df2
    breeds_only_in_df2 = unique_breeds_df2 - unique_breeds_df1
    
    # Replace exclusive breeds with 'other'
    # Replace exclusive breeds with 'other' using loc
    test_df.loc[test_df['breed'].isin(breeds_only_in_df1), 'breed'] = 'other'
    control_df.loc[control_df['breed'].isin(breeds_only_in_df2), 'breed'] = 'other'
    return test_df, control_df

In [91]:
# match breeds
psuedomonas_dogs_unique, control_df_all = match_breeds(psuedomonas_dogs_unique, control_df_all)

In [92]:
# Apply label to show pseudomonas records and non psudomonas records
control_df_all['psuedomonas_id'] = 0
psuedomonas_dogs_unique['psuedomonas_id'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_df_all['psuedomonas_id'] = 0


In [93]:
# Create dataset for logisitc regression
control_all_and_psoe_df = pd.concat([control_df_all, psuedomonas_dogs_unique], axis=0)

In [94]:
# Create combined sex/neuter status column
control_all_and_psoe_df['Sex_Neuter_Status'] = control_all_and_psoe_df['gender'] + '_' + control_all_and_psoe_df['neutered'].replace({'yes': 'neutered', 'no': 'entire'})

In [95]:
"""
Split age at consult into a categorical variable
‘Puppies’ (XP) aged 0 to < 6 months, ‘Juveniles’ (XJ) aged 6 to < 12 months,
‘Young Adults’ (XY) aged 12 to < 24 months, ‘Mature Adults’ (XM) aged 2 to < 7 years,
‘Senior’ (XS) aged 7 to < 12 years and ‘Geriatric’ (XG) aged ≥ 12.
"""
# Define age bins and labels
bins = [0, 0.5, 1, 2, 7, 12, float('inf')]
labels = ['Puppy', 'Juvenile', 'Young Adult', 'Mature Adult', 'Senior', 'Geriatric']

# Create a new column 'Age_Category' using pd.cut
control_all_and_psoe_df['age_category_at_consult'] = pd.cut(control_all_and_psoe_df['age_at_consult'], bins=bins, labels=labels, right=False)


In [96]:
import statsmodels.formula.api as smf
import numpy as np

model = smf.logit("psuedomonas_id ~ C(age_category_at_consult, Treatment('Puppy')) + C(Sex_Neuter_Status, Treatment('female_neutered')) + C(breed, Treatment('Crossbreed'))", data=control_all_and_psoe_df)
results = model.fit()
results.summary()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


         Current function value: 0.003469
         Iterations: 35




0,1,2,3
Dep. Variable:,psuedomonas_id,No. Observations:,5993254.0
Model:,Logit,Df Residuals:,5993134.0
Method:,MLE,Df Model:,119.0
Date:,"Wed, 29 Jan 2025",Pseudo R-squ.:,0.0692
Time:,16:50:14,Log-Likelihood:,-20790.0
converged:,False,LL-Null:,-22336.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-11.6051,0.286,-40.552,0.000,-12.166,-11.044
"C(age_category_at_consult, Treatment('Puppy'))[T.Juvenile]",0.5938,0.392,1.513,0.130,-0.175,1.363
"C(age_category_at_consult, Treatment('Puppy'))[T.Young Adult]",1.6413,0.304,5.391,0.000,1.045,2.238
"C(age_category_at_consult, Treatment('Puppy'))[T.Mature Adult]",2.9858,0.280,10.660,0.000,2.437,3.535
"C(age_category_at_consult, Treatment('Puppy'))[T.Senior]",3.4771,0.280,12.422,0.000,2.929,4.026
"C(age_category_at_consult, Treatment('Puppy'))[T.Geriatric]",3.5140,0.283,12.418,0.000,2.959,4.069
"C(Sex_Neuter_Status, Treatment('female_neutered'))[T.female_entire]",0.2870,0.069,4.140,0.000,0.151,0.423
"C(Sex_Neuter_Status, Treatment('female_neutered'))[T.male_entire]",0.5044,0.056,8.983,0.000,0.394,0.614
"C(Sex_Neuter_Status, Treatment('female_neutered'))[T.male_neutered]",0.2719,0.049,5.565,0.000,0.176,0.368


In [97]:
# Get the summary table
summary_table = results.summary2().tables[1]

In [98]:
# Create a DataFrame from the summary table
results_df = pd.DataFrame(summary_table)

print(results_df.head())

                                                        Coef.  Std.Err.  \
Intercept                                          -11.605114  0.286175   
C(age_category_at_consult, Treatment('Puppy'))[...   0.593793  0.392366   
C(age_category_at_consult, Treatment('Puppy'))[...   1.641342  0.304479   
C(age_category_at_consult, Treatment('Puppy'))[...   2.985837  0.280104   
C(age_category_at_consult, Treatment('Puppy'))[...   3.477138  0.279910   

                                                            z         P>|z|  \
Intercept                                          -40.552445  0.000000e+00   
C(age_category_at_consult, Treatment('Puppy'))[...   1.513363  1.301874e-01   
C(age_category_at_consult, Treatment('Puppy'))[...   5.390666  7.019698e-08   
C(age_category_at_consult, Treatment('Puppy'))[...  10.659752  1.570157e-26   
C(age_category_at_consult, Treatment('Puppy'))[...  12.422347  1.976762e-35   

                                                       [0.025     0.975]  

In [99]:
print(results_df.columns.to_list())

['Coef.', 'Std.Err.', 'z', 'P>|z|', '[0.025', '0.975]']


In [100]:
# Extract specific columns
results_df = results_df[['Coef.', 'Std.Err.', 'z', 'P>|z|', '[0.025', '0.975]']]

# Rename columns
results_df.columns = ['Coefficient', 'Std. Error', 'z-score', 'p-value', 'Lower CI', 'Upper CI']

# Calculate odds ratios and confidence intervals
results_df['Odds Ratio'] = np.exp(results_df['Coefficient'])
results_df['Lower CI'] = np.exp(results_df['Lower CI'])
results_df['Upper CI'] = np.exp(results_df['Upper CI'])

print(results_df)

                                                    Coefficient  Std. Error  \
Intercept                                            -11.605114    0.286175   
C(age_category_at_consult, Treatment('Puppy'))[...     0.593793    0.392366   
C(age_category_at_consult, Treatment('Puppy'))[...     1.641342    0.304479   
C(age_category_at_consult, Treatment('Puppy'))[...     2.985837    0.280104   
C(age_category_at_consult, Treatment('Puppy'))[...     3.477138    0.279910   
...                                                         ...         ...   
C(breed, Treatment('Crossbreed'))[T.Weimaraner]       -0.826137    0.709604   
C(breed, Treatment('Crossbreed'))[T.Welsh Terrier]     1.379301    0.451342   
C(breed, Treatment('Crossbreed'))[T.West Highla...     1.142720    0.103340   
C(breed, Treatment('Crossbreed'))[T.Yorkshire T...    -0.746895    0.256876   
C(breed, Treatment('Crossbreed'))[T.other]           -10.035165   23.338082   

                                                   

In [102]:
results_df = results_df.sort_values(by='Odds Ratio', ascending=False)

## Format Odds Ratios DataFrame

In [103]:
# Set the index as a column
results_df['Parameter'] = results_df.index
results_df['Parameter'] = results_df['Parameter'].astype(str)
# Reset the index
results_df = results_df.reset_index(drop=True)

In [104]:
results_df['Parameter'] = results_df['Parameter'].replace("C(breed, Treatment('Crossbreed'))", "", regex=True)

In [105]:
parameter_list = results_df['Parameter'].to_list()

In [106]:
def extract_text_between_brackets(text):
  match = re.search(r'\[(.*?)\]', text)
  if match:
    return match.group(1)
  else:
    return None

def extract_parameter(parameter_list):
    output_list = []
    for parameter in parameter_list:
        if extract_text_between_brackets(parameter) != None:
            output_list.append(extract_text_between_brackets(parameter))
        else:
            output_list.append(parameter)
    return output_list

parameter_reformat = extract_parameter(parameter_list)

In [107]:
results_df["Parameter_refrom"] = parameter_reformat

In [108]:
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("T.", "", regex=True)

In [109]:
results_df = results_df.drop('Parameter', axis=1)

In [110]:
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("rrier", "Terrier", regex=True)
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("betan", "Tibetan", regex=True)
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("Shih u", "Shih Tzu", regex=True)
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("other", "Other", regex=True)
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("male_neutered", "Male Neutered")
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("male_entire", "Male Entire")
results_df["Parameter_refrom"] = results_df["Parameter_refrom"].replace("female_entire", "Female Entire")

In [111]:
print(results_df.head())

   Coefficient  Std. Error    z-score       p-value   Lower CI     Upper CI  \
0     5.546736    1.050871   5.278225  1.304411e-07  32.689883  2011.036716   
1     4.456449    0.719333   6.195248  5.819319e-10  21.043280   352.946385   
2     4.273418    1.012520   4.220577  2.436777e-05   9.864233   522.131759   
3     3.805372    0.715123   5.321284  1.030376e-07  11.064667   182.543307   
4     3.514043    0.282986  12.417745  2.093790e-35  19.286427    58.479996   

   Odds Ratio       Parameter_refrom  
0  256.399208  Glen Of Imaal Terrier  
1   86.180912             Otterhound  
2   71.766493          Cesky Terrier  
3   44.941973         Hungarian Puli  
4   33.583778              Geriatric  


In [112]:
results_df = results_df.iloc[:,[7, 6, 4, 5, 0, 1, 2, 3]]

In [113]:
print(results_df.head())

        Parameter_refrom  Odds Ratio   Lower CI     Upper CI  Coefficient  \
0  Glen Of Imaal Terrier  256.399208  32.689883  2011.036716     5.546736   
1             Otterhound   86.180912  21.043280   352.946385     4.456449   
2          Cesky Terrier   71.766493   9.864233   522.131759     4.273418   
3         Hungarian Puli   44.941973  11.064667   182.543307     3.805372   
4              Geriatric   33.583778  19.286427    58.479996     3.514043   

   Std. Error    z-score       p-value  
0    1.050871   5.278225  1.304411e-07  
1    0.719333   6.195248  5.819319e-10  
2    1.012520   4.220577  2.436777e-05  
3    0.715123   5.321284  1.030376e-07  
4    0.282986  12.417745  2.093790e-35  


In [114]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(results_df)

                        Parameter_refrom  Odds Ratio      Lower CI  \
0                  Glen Of Imaal Terrier  256.399208  3.268988e+01   
1                             Otterhound   86.180912  2.104328e+01   
2                          Cesky Terrier   71.766493  9.864233e+00   
3                         Hungarian Puli   44.941973  1.106467e+01   
4                              Geriatric   33.583778  1.928643e+01   
5                                 Senior   32.366944  1.870002e+01   
6                   Bouvier Des Flandres   24.812591  3.455204e+00   
7                        Bracco Italiano   22.116471  3.083248e+00   
8                           Mature Adult   19.803071  1.143689e+01   
9                        Italian Spinone   19.321131  1.282331e+01   
10                          Newfoundland   19.246766  1.364819e+01   
11             Spaniel (American Cocker)   15.085005  9.676370e+00   
12                          Basset Hound   14.365713  1.076060e+01   
13          Hungaria

In [115]:
 results_df = results_df.rename(columns={"Parameter_refrom": "Parameter"})

In [116]:
results_df = results_df.set_index('Parameter')

In [117]:
# Reorder the DataFrame
desired_order = ['Juvenile', 'Young Adult', 'Mature Adult', 'Senior', 'Geriatric', 'Female Entire', 'Male Entire', 'Male Neutered']

In [118]:
non_breeds_df = results_df.loc[desired_order] 
sorted_breeds_df = results_df[~results_df.index.isin(desired_order)] # .sort_values('Odds Ratio', ascending=False).index

In [119]:
full_sorted_df = pd.concat([non_breeds_df, sorted_breeds_df], axis=0)

In [120]:
full_sorted_df = full_sorted_df.drop(columns=['Coefficient', 'Std. Error', 'z-score'], axis=1)

In [175]:
df_age_at_consult = full_sorted_df.loc[['Juvenile', 'Young Adult', 'Mature Adult', 'Senior', 'Geriatric']]
df_sex_neuter = full_sorted_df.loc[["Female Entire", "Male Entire", "Male Neutered"]]
df_breed = full_sorted_df.iloc[8:]

In [122]:
print(df_breed)

                       Odds Ratio      Lower CI      Upper CI       p-value
Parameter                                                                  
Glen Of Imaal Terrier  256.399208  3.268988e+01  2.011037e+03  1.304411e-07
Otterhound              86.180912  2.104328e+01  3.529464e+02  5.819319e-10
Cesky Terrier           71.766493  9.864233e+00  5.221318e+02  2.436777e-05
Hungarian Puli          44.941973  1.106467e+01  1.825433e+02  1.030376e-07
Bouvier Des Flandres    24.812591  3.455204e+00  1.781847e+02  1.409935e-03
...                           ...           ...           ...           ...
Chihuahua                0.272643  1.214599e-01  6.120053e-01  1.631788e-03
Border Collie            0.200434  1.067015e-01  3.765079e-01  5.831587e-07
Greyhound                0.075220  1.055949e-02  5.358308e-01  9.799927e-03
Other                    0.000044  5.975465e-25  3.215100e+15  6.672022e-01
Intercept                0.000009  5.204392e-06  1.597923e-05  0.000000e+00

[112 rows x

## Join Case and Control Sizes Into Age

In [219]:
# Define age bins and labels
bins = [0, 0.5, 1, 2, 7, 12, float('inf')]
labels = ['Puppy', 'Juvenile', 'Young Adult', 'Mature Adult', 'Senior', 'Geriatric']

# Create a new column 'Age_Category' using pd.cut
psuedomonas_dogs_unique['age_category_at_consult'] = pd.cut(psuedomonas_dogs_unique['age_at_consult'], bins=bins, labels=labels, right=False)
control_df_all['age_category_at_consult'] = pd.cut(control_df_all['age_at_consult'], bins=bins, labels=labels, right=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [220]:
case_counts_age = psuedomonas_dogs_unique['age_category_at_consult'].value_counts().reset_index()
control_counts_age = control_df_all['age_category_at_consult'].value_counts().reset_index()

In [221]:
print(control_counts_age)

  age_category_at_consult    count
0            Mature Adult  2016484
1                  Senior  1865177
2               Geriatric   712807
3             Young Adult   545583
4                   Puppy   542248
5                Juvenile   308406


In [239]:
print(case_counts_age)

  age_category_at_consult  count
0                  Senior   1201
1            Mature Adult    839
2               Geriatric    419
3             Young Adult     64
4                   Puppy     13
5                Juvenile     13


In [222]:
df_age_at_consult = df_age_at_consult.reset_index()
print(df_age_at_consult.head())

      Parameter  Odds Ratio   Lower CI   Upper CI       p-value
0      Juvenile    1.810844   0.839263   3.907185  1.301874e-01
1   Young Adult    5.162092   2.842191   9.375584  7.019698e-08
2  Mature Adult   19.803071  11.436885  34.289198  1.570157e-26
3        Senior   32.366944  18.700016  56.022363  1.976762e-35
4     Geriatric   33.583778  19.286427  58.479996  2.093790e-35


In [223]:
df_age_at_consult = df_age_at_consult.rename(columns={"Parameter": "age_category_at_consult"})

In [224]:
df_age_at_consult = df_age_at_consult.merge(case_counts_age, on='age_category_at_consult')
df_age_at_consult = df_age_at_consult.merge(control_counts_age, on='age_category_at_consult')

In [225]:
print(df_age_at_consult.head())

  age_category_at_consult  Odds Ratio   Lower CI   Upper CI       p-value  \
0                Juvenile    1.810844   0.839263   3.907185  1.301874e-01   
1             Young Adult    5.162092   2.842191   9.375584  7.019698e-08   
2            Mature Adult   19.803071  11.436885  34.289198  1.570157e-26   
3                  Senior   32.366944  18.700016  56.022363  1.976762e-35   
4               Geriatric   33.583778  19.286427  58.479996  2.093790e-35   

   count_x  count_y  
0       13   308406  
1       64   545583  
2      839  2016484  
3     1201  1865177  
4      419   712807  


In [226]:
df_age_at_consult = df_age_at_consult.rename(columns={"count_x": "case", "count_y": "control"})

In [227]:
print(df_age_at_consult)

  age_category_at_consult  Odds Ratio   Lower CI   Upper CI       p-value  \
0                Juvenile    1.810844   0.839263   3.907185  1.301874e-01   
1             Young Adult    5.162092   2.842191   9.375584  7.019698e-08   
2            Mature Adult   19.803071  11.436885  34.289198  1.570157e-26   
3                  Senior   32.366944  18.700016  56.022363  1.976762e-35   
4               Geriatric   33.583778  19.286427  58.479996  2.093790e-35   

   case  control  
0    13   308406  
1    64   545583  
2   839  2016484  
3  1201  1865177  
4   419   712807  


In [237]:
df_age_at_consult = df_age_at_consult.iloc[:,[0, 5, 6, 1, 2, 3, 4]]

In [133]:
def format_power(x):
    return f'{x:.2e}'

In [238]:
# Create age at consult latex table
df_age_at_consult.to_latex(index=True,
                  float_format="{:.2f}".format,
                  formatters={'p-value':format_power})

'\\begin{tabular}{llrrrrrr}\n\\toprule\n & age_category_at_consult & case & control & Odds Ratio & Lower CI & Upper CI & p-value \\\\\n\\midrule\n0 & Juvenile & 13 & 308406 & 1.81 & 0.84 & 3.91 & 1.30e-01 \\\\\n1 & Young Adult & 64 & 545583 & 5.16 & 2.84 & 9.38 & 7.02e-08 \\\\\n2 & Mature Adult & 839 & 2016484 & 19.80 & 11.44 & 34.29 & 1.57e-26 \\\\\n3 & Senior & 1201 & 1865177 & 32.37 & 18.70 & 56.02 & 1.98e-35 \\\\\n4 & Geriatric & 419 & 712807 & 33.58 & 19.29 & 58.48 & 2.09e-35 \\\\\n\\bottomrule\n\\end{tabular}\n'

## Join Case and Control Sizes Into Sex/Neuter Status

In [203]:
# Create combined sex/neuter status column
control_all_and_psoe_df['Sex_Neuter_Status'] = control_all_and_psoe_df['gender'] + '_' + control_all_and_psoe_df['neutered'].replace({'yes': 'neutered', 'no': 'entire'})

In [204]:
psuedomonas_dogs_unique['Sex_Neuter_Status'] = psuedomonas_dogs_unique['gender'].str.title() + ' ' + psuedomonas_dogs_unique['neutered'].replace({'yes': 'Neutered', 'no': 'Entire'})
control_df_all['Sex_Neuter_Status'] = control_df_all['gender'].str.title() + ' ' + control_df_all['neutered'].replace({'yes': 'Neutered', 'no': 'Entire'})



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [205]:
df_sex_neuter = df_sex_neuter.reset_index()

In [206]:
print(df_sex_neuter.head())

       Parameter  Odds Ratio  Lower CI  Upper CI       p-value
0  Female Entire    1.332487  1.163170  1.526450  3.474800e-05
1    Male Entire    1.655971  1.483404  1.848612  2.630494e-19
2  Male Neutered    1.312406  1.192566  1.444289  2.627361e-08


In [207]:
case_counts_sexn = psuedomonas_dogs_unique['Sex_Neuter_Status'].value_counts().reset_index()
control_counts_sexn = control_df_all['Sex_Neuter_Status'].value_counts().reset_index()

In [243]:
print(case_counts_sexn)
print(control_counts_sexn)

  Sex_Neuter_Status  count
0     Male Neutered    935
1   Female Neutered    765
2       Male Entire    556
3     Female Entire    293
  Sex_Neuter_Status    count
0   Female Neutered  2036750
1     Male Neutered  1967101
2       Male Entire  1113444
3     Female Entire   873424


In [208]:
df_sex_neuter = df_sex_neuter.rename(columns={"Parameter": "Sex_Neuter_Status"})

In [209]:
print(df_sex_neuter)

  Sex_Neuter_Status  Odds Ratio  Lower CI  Upper CI       p-value
0     Female Entire    1.332487  1.163170  1.526450  3.474800e-05
1       Male Entire    1.655971  1.483404  1.848612  2.630494e-19
2     Male Neutered    1.312406  1.192566  1.444289  2.627361e-08


In [210]:
df_sex_neuter = df_sex_neuter.merge(case_counts_sexn, on='Sex_Neuter_Status')
df_sex_neuter = df_sex_neuter.merge(control_counts_sexn, on='Sex_Neuter_Status')

In [211]:
df_sex_neuter = df_sex_neuter.rename(columns={"count_x": "case", "count_y": "control"})

In [240]:
df_sex_neuter = df_sex_neuter.iloc[:,[0, 5, 6, 1, 2, 3, 4]]

In [241]:
print(df_sex_neuter)

  Sex_Neuter_Status  case  control  Odds Ratio  Lower CI  Upper CI  \
0     Female Entire   293   873424    1.332487  1.163170  1.526450   
1       Male Entire   556  1113444    1.655971  1.483404  1.848612   
2     Male Neutered   935  1967101    1.312406  1.192566  1.444289   

        p-value  
0  3.474800e-05  
1  2.630494e-19  
2  2.627361e-08  


In [242]:
# Create sex/neuter latex table
df_sex_neuter.to_latex(index=True,
                  float_format="{:.2f}".format,
                  formatters={'p-value':format_power})

'\\begin{tabular}{llrrrrrr}\n\\toprule\n & Sex_Neuter_Status & case & control & Odds Ratio & Lower CI & Upper CI & p-value \\\\\n\\midrule\n0 & Female Entire & 293 & 873424 & 1.33 & 1.16 & 1.53 & 3.47e-05 \\\\\n1 & Male Entire & 556 & 1113444 & 1.66 & 1.48 & 1.85 & 2.63e-19 \\\\\n2 & Male Neutered & 935 & 1967101 & 1.31 & 1.19 & 1.44 & 2.63e-08 \\\\\n\\bottomrule\n\\end{tabular}\n'

## Join Case and Control Sizes Into Breed

In [176]:
case_counts = psuedomonas_dogs_unique['breed'].value_counts().reset_index()

In [177]:
control_counts = control_df_all['breed'].value_counts().reset_index()
print(control_counts[control_counts['breed'] == 'other'])

   breed   count
7  other  178926


In [178]:
df_breed = df_breed.reset_index()

In [179]:
df_breed = df_breed.rename(columns={"Parameter": "breed"})

In [180]:
df_breed = df_breed.merge(case_counts, on='breed')

In [181]:
df_breed = df_breed.merge(control_counts, on='breed')

In [182]:
df_breed = df_breed.rename(columns={"count_x": "case", "count_y": "control"})

In [161]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_breed)

                                   breed  Odds Ratio   Lower CI     Upper CI  \
0                  Glen Of Imaal Terrier  256.399208  32.689883  2011.036716   
1                             Otterhound   86.180912  21.043280   352.946385   
2                          Cesky Terrier   71.766493   9.864233   522.131759   
3                         Hungarian Puli   44.941973  11.064667   182.543307   
4                   Bouvier Des Flandres   24.812591   3.455204   178.184750   
5                        Bracco Italiano   22.116471   3.083248   158.643827   
6                        Italian Spinone   19.321131  12.823305    29.111536   
7                           Newfoundland   19.246766  13.648189    27.141917   
8              Spaniel (American Cocker)   15.085005   9.676370    23.516812   
9                           Basset Hound   14.365713  10.760604    19.178635   
10          Hungarian Wire Haired Vizsla   14.168230   7.528646    26.663328   
11                 Spaniel (Irish Water)

In [190]:
def remove_inisgnigicant_results(df):
    # Remove values where p-value >= 0.05
    df = df[df['p-value'] < 0.05]
    #Remove values where Lower CI <= 1 & Upper CI >= 1
    mask = ~( (df['Lower CI'] <= 1) & (df['Upper CI'] >= 1) ) 
    df = df[mask]
    return df

In [246]:
df_breed_signifiant = remove_inisgnigicant_results(df_breed)

In [247]:
df_breed_signifiant = df_breed_signifiant.iloc[:,[0, 5, 6, 1, 2, 3, 4]]

In [248]:
print(df_breed_signifiant)

                              breed  case  control  Odds Ratio   Lower CI  \
0             Glen Of Imaal Terrier     1       10  256.399208  32.689883   
1                        Otterhound     2       89   86.180912  21.043280   
2                     Cesky Terrier     1       63   71.766493   9.864233   
3                    Hungarian Puli     2      155   44.941973  11.064667   
4              Bouvier Des Flandres     1      152   24.812591   3.455204   
5                   Bracco Italiano     1      228   22.116471   3.083248   
6                   Italian Spinone    25     5486   19.321131  12.823305   
7                      Newfoundland    37     8307   19.246766  13.648189   
8         Spaniel (American Cocker)    21     4848   15.085005   9.676370   
9                      Basset Hound    55    14740   14.365713  10.760604   
10     Hungarian Wire Haired Vizsla    10     4435   14.168230   7.528646   
11            Spaniel (Irish Water)     1      250   14.052046   1.963705   

In [250]:
print(len(df_breed_signifiant))

57


In [251]:
# Create breed latex table
df_breed_signifiant.to_latex(index=True,
                  float_format="{:.2f}".format,
                  formatters={'p-value':format_power})

'\\begin{tabular}{llrrrrrr}\n\\toprule\n & breed & case & control & Odds Ratio & Lower CI & Upper CI & p-value \\\\\n\\midrule\n0 & Glen Of Imaal Terrier & 1 & 10 & 256.40 & 32.69 & 2011.04 & 1.30e-07 \\\\\n1 & Otterhound & 2 & 89 & 86.18 & 21.04 & 352.95 & 5.82e-10 \\\\\n2 & Cesky Terrier & 1 & 63 & 71.77 & 9.86 & 522.13 & 2.44e-05 \\\\\n3 & Hungarian Puli & 2 & 155 & 44.94 & 11.06 & 182.54 & 1.03e-07 \\\\\n4 & Bouvier Des Flandres & 1 & 152 & 24.81 & 3.46 & 178.18 & 1.41e-03 \\\\\n5 & Bracco Italiano & 1 & 228 & 22.12 & 3.08 & 158.64 & 2.07e-03 \\\\\n6 & Italian Spinone & 25 & 5486 & 19.32 & 12.82 & 29.11 & 1.67e-45 \\\\\n7 & Newfoundland & 37 & 8307 & 19.25 & 13.65 & 27.14 & 8.48e-64 \\\\\n8 & Spaniel (American Cocker) & 21 & 4848 & 15.09 & 9.68 & 23.52 & 4.59e-33 \\\\\n9 & Basset Hound & 55 & 14740 & 14.37 & 10.76 & 19.18 & 4.96e-73 \\\\\n10 & Hungarian Wire Haired Vizsla & 10 & 4435 & 14.17 & 7.53 & 26.66 & 2.08e-16 \\\\\n11 & Spaniel (Irish Water) & 1 & 250 & 14.05 & 1.96 & 100.5

In [194]:
df_breed.to_csv('mc_classifier_otitis_breeds.csv', index=False)

In [3]:
df_breed = pd.read_csv('mc_classifier_otitis_breeds.csv', index_col=False)

## Create Forest Plot

## Try Using Plotly

In [4]:
def create_forestplot(df, variable, y_label, x_label="Odds Ratio", title="Forest Plot", pixels=20):
    """
    Function to create a forest plot from the output of a statsmodels linear regression
    Args:
        df: dataframe containing output of statsmodels linear regression
        variable: variable you wish to plot
        y_label: y axis label
        x_label: x axis label (default: "Odds Ratio")
        title: Title of plot (default: "Forest Plot")
        pixels: Set height assigned to each varaiable (default: 20)
    """
    # Create the figure
    fig = go.Figure()

    # Add markers for point estimates
    fig.add_trace(go.Scatter(
        x=df["Odds Ratio"],
        y=df[variable],
        mode="markers",
        marker=dict(color="blue", size=10),
        name="Odds Ratio"
    ))
    
    # Add upper error bars for confidence intervals
    fig.add_trace(go.Scatter(
        x=df["Odds Ratio"],
        y=df[variable],
        mode="markers",
        line=dict(color="blue", width=0),
        error_x=dict(
        type="data",
        symmetric=False,
        array=df["Upper CI"] - df["Odds Ratio"],
        arrayminus=df["Odds Ratio"] - df["Lower CI"]
        ),
        showlegend=False
    ))

    # Add a vertical line at x=1
    fig.add_shape(
        type="line",
        x0=1, x1=1,  # Start and end points on the x-axis
        y0=0, y1=1,  # Covers the full y-axis range (use relative coordinates if y-axis is categorical)
        line=dict(color="red", width=2, dash="solid"),  # Customize color, width, and style
        xref="x",  # Reference the x-axis
        yref="paper"  # Reference the paper coordinates (0 to 1 for full height)
    )
    
    # Update layout for aesthetics
    fig.update_layout(
        height=pixels * len(df_breed),  # 30 pixels per variable (adjust as needed)
        title=str(title),
        xaxis_title=str(x_label),
        yaxis_title=str(y_label),
        # xaxis_type="log",
        yaxis=dict(autorange="reversed"),  # Reverse order of variables for readability
        template="plotly_white"
    )
    
    return fig

In [5]:
import plotly.io as pio

In [197]:
fig1 = create_forestplot(df_breed, "breed", "Breed", "Odds Ratio", "Pseudomonas Otitis Risk by Breed (Reference=Crossbreed)")
pio.write_image(fig1, "all_breed_forest_plot_log_scale.png")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [198]:
print(os.getcwd())

/opt/jupyterlab/notebooks/DogBERT/Classifiers/Pseudomonas_Otitis/Logisitic Regression


In [9]:
df_breed_no_outliers = df_breed.iloc[6:]

In [200]:
fig2 = create_forestplot(df_breed_no_outliers, "breed", "Breed", "Odds Ratio", "Pseudomonas Otitis Risk by Breed (Reference=Crossbreed)", 20)
pio.write_image(fig2, "breed_outliers_removed_forest_plot_log_scale.png")

In [202]:
print(df_sex_neuter)

               Odds Ratio  Lower CI  Upper CI       p-value
Parameter                                                  
Female Entire    1.332487  1.163170  1.526450  3.474800e-05
Male Entire      1.655971  1.483404  1.848612  2.630494e-19
Male Neutered    1.312406  1.192566  1.444289  2.627361e-08


In [217]:
fig3 = create_forestplot(df_sex_neuter, "Sex_Neuter_Status", "Sex/Neuter Status", "Odds Ratio", "Pseudomonas Otitis Risk by Sex/Neuter Status (Reference=Female Neutered)", 2)
pio.write_image(fig3, "sex_neuter_status_forest_plot_log_scale.png")

In [229]:
fig4 = create_forestplot(df_age_at_consult, "age_category_at_consult", "Age at First Consult", "Odds Ratio", "Pseudomonas Otitis Risk by Age at First Consult (Reference=Puppy)", 3)
pio.write_image(fig4, "age_at_first_consult_forest_plot_log_scale.png")

# Create Plots Excluding p >= 0.05, Lower CI <= 1, Upper CI >= 1

In [6]:
def remove_inisgnigicant_results(df):
    # Remove values where p-value >= 0.05
    df = df[df['p-value'] < 0.05]
    #Remove values where Lower CI <= 1 & Upper CI >= 1
    mask = ~( (df['Lower CI'] <= 1) & (df['Upper CI'] >= 1) ) 
    df = df[mask]
    return df

## Breed

In [230]:
print(df_breed.head())

                   breed  Odds Ratio   Lower CI     Upper CI       p-value  \
0  Glen Of Imaal Terrier  256.399208  32.689883  2011.036716  1.304411e-07   
1             Otterhound   86.180912  21.043280   352.946385  5.819319e-10   
2          Cesky Terrier   71.766493   9.864233   522.131759  2.436777e-05   
3         Hungarian Puli   44.941973  11.064667   182.543307  1.030376e-07   
4   Bouvier Des Flandres   24.812591   3.455204   178.184750  1.409935e-03   

   case  control  
0     1       10  
1     2       89  
2     1       63  
3     2      155  
4     1      152  


In [231]:
print(len(df_breed))

109


In [7]:
df_breed_sig = remove_inisgnigicant_results(df_breed)

In [233]:
print(len(df_breed_sig))

57


In [234]:
fig5 = create_forestplot(df_breed_sig, "breed", "Breed", "Odds Ratio", "Pseudomonas Otitis Risk by Breed (reference=Crossbreed)")
pio.write_image(fig5, "all_breed_sig_results_forest_linear_scale.png")

In [10]:
df_breed_no_outliers_sig = remove_inisgnigicant_results(df_breed_no_outliers)

In [13]:
fig6 = create_forestplot(df_breed_no_outliers_sig, "breed", "Breed", "Odds Ratio", "Pseudomonas Otitis Risk by Breed (reference=Crossbreed)", pixels=12)
pio.write_image(fig6, "breed_outliers_removed_sig_results_forest_linear_scale.png")