In [44]:
#Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st


#data files
mouse_metadata_path = "Pymaceuticals/data/Mouse_metadata.csv"
study_results_path = "Pymaceuticals/data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouse_study_complete_results = pd.merge(mouse_metadata, study_results, on=["Mouse ID"])
# Display the data table for preview
mouse_study_complete_results


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumour Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [45]:
# Checking the number of mice.
mouse_ID_count = mouse_study_complete_results["Mouse ID"].count()
print(mouse_ID_count)

unique_mouse_count = mouse_metadata["Mouse ID"].count()
unique_mouse_count

1893


249

In [46]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
#find duplicates
#mouse_duplicates = mouse_study_complete_results.duplicated(subset=['Mouse ID', 'Timepoint'])
mouse_duplicates = mouse_study_complete_results[mouse_study_complete_results[['Mouse ID', 'Timepoint']].duplicated() == True]
mouse_duplicates


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumour Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [59]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_clean_data = mouse_study_complete_results.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep='last')

# Checking the number of mice in the clean DataFrame.
unique_mouse_clean_data = mouse_clean_data['Mouse ID'].count()
unique_mouse_clean_data

1888

## Summary Statistics

In [86]:
# Generate a summary statistics table of mean of the tumour volume for each regimen
drug_regimen_mean = mouse_clean_data.groupby('Drug Regimen')['Tumour Volume (mm3)'].mean()
drug_regimen_mean

Drug Regimen
Capomulin    40.675741
Ceftamin     52.591172
Infubinol    52.884795
Ketapril     55.235638
Naftisol     54.331565
Placebo      54.033581
Propriva     52.382993
Ramicane     40.216745
Stelasyn     54.233149
Zoniferol    53.236507
Name: Tumour Volume (mm3), dtype: float64

In [87]:
# Generate a summary statistics table of variance of the tumour volume for each regimen
drug_regimen_variance = mouse_clean_data.groupby('Drug Regimen')['Tumour Volume (mm3)'].var()
drug_regimen_variance

Drug Regimen
Capomulin    24.947764
Ceftamin     39.290177
Infubinol    43.128684
Ketapril     68.553577
Naftisol     66.173479
Placebo      61.168083
Propriva     43.220205
Ramicane     23.486704
Stelasyn     59.450562
Zoniferol    48.533355
Name: Tumour Volume (mm3), dtype: float64

In [88]:
# Generate a summary statistics table of median of the tumour volume for each regimen
drug_regimen_median = mouse_clean_data.groupby('Drug Regimen')['Tumour Volume (mm3)'].median()
drug_regimen_median

Drug Regimen
Capomulin    41.557809
Ceftamin     51.776157
Infubinol    51.820584
Ketapril     53.698743
Naftisol     52.509285
Placebo      52.288934
Propriva     50.783528
Ramicane     40.673236
Stelasyn     52.431737
Zoniferol    51.818479
Name: Tumour Volume (mm3), dtype: float64

In [89]:
# Generate a summary statistics table of standard deviation of the tumour volume for each regimen
drug_regimen_std_dev = mouse_clean_data.groupby('Drug Regimen')['Tumour Volume (mm3)'].std()
drug_regimen_std_dev

Drug Regimen
Capomulin    4.994774
Ceftamin     6.268188
Infubinol    6.567243
Ketapril     8.279709
Naftisol     8.134708
Placebo      7.821003
Propriva     6.574208
Ramicane     4.846308
Stelasyn     7.710419
Zoniferol    6.966589
Name: Tumour Volume (mm3), dtype: float64

In [85]:
# Generate a summary statistics table for SEM of the tumour volume for each regimen
drug_regimen_sem = mouse_clean_data.groupby('Drug Regimen')['Tumour Volume (mm3)'].sem()
drug_regimen_sem

Drug Regimen
Capomulin    0.329346
Ceftamin     0.469821
Infubinol    0.492236
Ketapril     0.603860
Naftisol     0.596466
Placebo      0.581331
Propriva     0.526358
Ramicane     0.320955
Stelasyn     0.573111
Zoniferol    0.516398
Name: Tumour Volume (mm3), dtype: float64

In [105]:
# Assemble the resulting series into a single summary dataframe.
# Combine the mean and median data into a single dataset
tumour_volume_combined = pd.merge(drug_regimen_mean, drug_regimen_median, how='outer', on=["Drug Regimen"])

#Rename columns x to mean and y to median
tumour_volume_combined = tumour_volume_combined.rename(columns={"Tumour Volume (mm3)_x":"Tumour Volume (mm3) mean",
                                                            "Tumour Volume (mm3)_y":"Tumour Volume (mm3) median"})

# Combine the variance into the combined dataset
tumour_volume_combined = pd.merge(tumour_volume_combined, drug_regimen_variance, how='outer', on=["Drug Regimen"])

# Combine the std dev into the combined dataset
tumour_volume_combined = pd.merge(tumour_volume_combined, drug_regimen_std_dev, how='outer', on=["Drug Regimen"])

#Rename columns x to variance and y to std dev
tumour_volume_combined = tumour_volume_combined.rename(columns={"Tumour Volume (mm3)_x":"Tumour Volume (mm3) variance",
                                                            "Tumour Volume (mm3)_y":"Tumour Volume (mm3) std dev"})

# Combine the sem into the combined dataset
tumour_volume_combined = pd.merge(tumour_volume_combined, drug_regimen_sem, how='outer', on=["Drug Regimen"])

#Rename columns x to SEM
tumour_volume_combined = tumour_volume_combined.rename(columns={"Tumour Volume (mm3)":"Tumour Volume (mm3) SEM"})
tumour_volume_combined


Unnamed: 0_level_0,Tumour Volume (mm3) mean,Tumour Volume (mm3) median,Tumour Volume (mm3) variance,Tumour Volume (mm3) std dev,Tumour Volume (mm3) SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.382993,50.783528,43.220205,6.574208,0.526358
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [131]:
# Using the aggregation method, produce the same summary statistics in a single line
tvc_agg = tumour_volume_combined[['Tumour Volume (mm3) mean','Tumour Volume (mm3) median','Tumour Volume (mm3) variance',
                                   'Tumour Volume (mm3) std dev','Tumour Volume (mm3) SEM']].agg('sum')
tvc_agg




Tumour Volume (mm3) mean        509.821885
Tumour Volume (mm3) median      499.358491
Tumour Volume (mm3) variance    477.952590
Tumour Volume (mm3) std dev      68.163148
Tumour Volume (mm3) SEM           5.009879
dtype: float64

In [175]:
# tvc_agg = tumour_volume_combined.groupby('Drug Regimen')[['Tumour Volume (mm3) mean','Tumour Volume (mm3) median',
#                                     'Tumour Volume (mm3) median','Tumour Volume (mm3) variance',
#                                     'Tumour Volume (mm3) std dev','Tumour Volume (mm3) SEM']].agg('sum')

# tvc_agg = tumour_volume_combined['Tumour Volume (mm3) mean'].agg()
#tvc_agg

tvc_agg_mean = tumour_volume_combined['Tumour Volume (mm3) mean'].agg('mean')
tvc_agg_median = tumour_volume_combined['Tumour Volume (mm3) median'].agg('median')
tvc_agg_var = tumour_volume_combined['Tumour Volume (mm3) variance'].agg('var')
tvc_agg_stddev = tumour_volume_combined['Tumour Volume (mm3) std dev'].agg('std')
tvc_agg_sem = tumour_volume_combined['Tumour Volume (mm3) SEM'].agg('sem')




0.03251623403082617