In [1]:
# Dependencies and setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [2]:
# Combine the data into a single dataset
single_dataset = pd.merge (mouse_metadata, study_results, how = "outer", on="Mouse ID")

# Display the data table for preview
display(single_dataset)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [3]:
 # Checking the number of mice.
unique_id = len(single_dataset["Mouse ID"].unique())
print("Number of mice: " + str(unique_id))


Number of mice: 249


In [4]:
 # Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
 # Comment: Eliminating duplicate Mice ID and Timepoint combo does not alter the number of mice.
non_duplicates = single_dataset.drop_duplicates(subset = ['Mouse ID', 'Timepoint'], keep = False).reset_index(drop = True)
non_duplicates

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1878,z969,Naftisol,Male,9,30,25,63.145652,2
1879,z969,Naftisol,Male,9,30,30,65.841013,3
1880,z969,Naftisol,Male,9,30,35,69.176246,4
1881,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Optional: Get all the data for the duplicate mouse ID.
the_duplicates = single_dataset[single_dataset[['Mouse ID', 'Timepoint']].duplicated(keep = False) == True]
the_duplicates

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [23]:
# Generate a summary statistics table consisting of the mean, median, variance, standard deviation, 
# and SEM of the tumor volume for *each drug regimen*.
# Data frame is non_duplicates
tv_mean = non_duplicates.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"]
# NOT NECESSARY ANYMORE Drop extra columns df.drop(['B', 'C'], axis=1)
# Rename Column
tv_mean = mean_dr.rename(columns={"Tumor Volume (mm3)":"Mean"})
# Same for the rest of statistics
# Median
tv_median = non_duplicates.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"]
tv_median = tv_median.reset_index()
tv_median = tv_median.rename(columns={"Tumor Volume (mm3)":"Median"})
# Variance
tv_var = non_duplicates.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"]
tv_var = tv_var.reset_index()
tv_var = tv_var.rename(columns={"Tumor Volume (mm3)":"Variance"})
# Standard Deviation
tv_std = non_duplicates.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"]
tv_std = tv_std.reset_index()
tv_std = tv_std.rename(columns={"Tumor Volume (mm3)":"St Dev"})
# SEM
tv_sem = non_duplicates.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"]
tv_sem = tv_sem.reset_index()
tv_sem = tv_sem.rename(columns={"Tumor Volume (mm3)":"SEM"})
# Integrate the table
sum_stats = [tv_mean,tv_median,tv_var,tv_std,tv_sem]

Unnamed: 0,Drug Regimen,SEM
0,Capomulin,0.329346
1,Ceftamin,0.469821
2,Infubinol,0.492236
3,Ketapril,0.60386
4,Naftisol,0.596466
5,Placebo,0.581331
6,Propriva,0.540135
7,Ramicane,0.320955
8,Stelasyn,0.573111
9,Zoniferol,0.516398
