## Observations and Insights 

In [468]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata first, then the study results
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Print both source tables for a quick look before they are joined
print(mouse_metadata, study_results, sep="\n")

# Combine the data into a single dataset: join the study results to the
# mouse metadata on the shared "Mouse ID" column
mouse_study_df = pd.merge(left=study_results, right=mouse_metadata, on="Mouse ID")
mouse_study_df

Mouse ID Drug Regimen     Sex  Age_months  Weight (g)
0       k403     Ramicane    Male          21          16
1       s185    Capomulin  Female           3          17
2       x401    Capomulin  Female          16          15
3       m601    Capomulin    Male          22          17
4       g791     Ramicane    Male          11          16
..       ...          ...     ...         ...         ...
244     z314     Stelasyn  Female          21          28
245     z435     Propriva  Female          12          26
246     z581    Infubinol  Female          24          25
247     z795     Naftisol  Female          13          29
248     z969     Naftisol    Male           9          30

[249 rows x 5 columns]
     Mouse ID  Timepoint  Tumor Volume (mm3)  Metastatic Sites
0        b128          0           45.000000                 0
1        f932          0           45.000000                 0
2        g107          0           45.000000                 0
3        a457          0        

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [469]:
#### Count the distinct Mouse IDs present in the combined DataFrame.
num_mice_ID = mouse_study_df.loc[:, 'Mouse ID'].nunique()
num_mice_ID

249

In [470]:
#### Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.

# The same mouse can only be observed once at a particular timepoint, so a
# repeated (Mouse ID, Timepoint) pair means something is wrong with that
# mouse's data and all of its records should be removed.

# Combined Mouse ID + Timepoint key. The duplicate check below no longer uses
# it (concatenating without a separator can be ambiguous, and the Mouse ID had
# to be sliced back out with a hard-coded length), but the column is still
# created because the clean-up cell further down drops 'mouse_tp' by name.
mouse_study_df['mouse_tp'] = mouse_study_df['Mouse ID'] + mouse_study_df['Timepoint'].astype(str)

# True for every row whose (Mouse ID, Timepoint) pair occurs more than once.
# Comparing the two columns directly does not depend on how value_counts()
# names or orders its result (the result name changed in pandas 2.0, which
# made the previous rename a no-op and the later column lookup fail).
is_duplicate_row = mouse_study_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)

# One row per unique Mouse ID (as the index): True if any of the mouse's rows
# is duplicated and the mouse needs to be removed, False if it should be kept.
dup_mouse_truth = (
    is_duplicate_row
    .groupby(mouse_study_df['Mouse ID'])
    .any()
    .to_frame(name='duplicate_mouse')
)

# Show duplicate Mouse IDs which should be removed
dup_IDs = dup_mouse_truth.loc[dup_mouse_truth['duplicate_mouse']]

dup_IDs

Unnamed: 0_level_0,duplicate_mouse
Mouse ID,Unnamed: 1_level_1
g989,True


In [471]:
#### Optional: Get all the data for the duplicate mouse ID.

# Attach each mouse's duplicate flag (True/False) to every one of its study records
mouse_study_dup_flag_df = mouse_study_df.merge(dup_mouse_truth, on='Mouse ID')

# Keep only the records whose mouse is flagged as a duplicate
dup_mice_df = mouse_study_dup_flag_df[mouse_study_dup_flag_df['duplicate_mouse']]
dup_mice_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g),mouse_tp,duplicate_mouse
860,g989,0,45.0,0,Propriva,Female,21,26,g9890,True
861,g989,0,45.0,0,Propriva,Female,21,26,g9890,True
862,g989,5,48.786801,0,Propriva,Female,21,26,g9895,True
863,g989,5,47.570392,0,Propriva,Female,21,26,g9895,True
864,g989,10,51.745156,0,Propriva,Female,21,26,g98910,True
865,g989,10,49.880528,0,Propriva,Female,21,26,g98910,True
866,g989,15,51.325852,1,Propriva,Female,21,26,g98915,True
867,g989,15,53.44202,0,Propriva,Female,21,26,g98915,True
868,g989,20,55.326122,1,Propriva,Female,21,26,g98920,True
869,g989,20,54.65765,1,Propriva,Female,21,26,g98920,True


In [472]:
#### Create a clean DataFrame by dropping the duplicate mouse by its ID.

# Keep only the rows of mice that were not flagged as duplicates, then remove
# the working columns added during the earlier steps. drop() returns a new
# DataFrame here rather than being called with inplace=True on the filtered
# selection, which can trigger pandas' SettingWithCopyWarning.
no_dups_mouse_study_df = (
    mouse_study_dup_flag_df
    .loc[~mouse_study_dup_flag_df['duplicate_mouse']]
    .drop(columns=['mouse_tp', 'duplicate_mouse'])
)

no_dups_mouse_study_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22
...,...,...,...,...,...,...,...,...
1888,m601,25,33.118756,1,Capomulin,Male,22,17
1889,m601,30,31.758275,1,Capomulin,Male,22,17
1890,m601,35,30.834357,1,Capomulin,Male,22,17
1891,m601,40,31.378045,1,Capomulin,Male,22,17


In [485]:
#### Count the distinct Mouse IDs remaining in the clean DataFrame.
no_dup_num_mice_ID = no_dups_mouse_study_df.loc[:, 'Mouse ID'].nunique()
no_dup_num_mice_ID

248

## Summary Statistics

In [474]:
#### Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

#### This method is the most straightforward: create multiple series and put them all together at the end.

In [475]:
#### Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

## Bar Plots

In [476]:
#### Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas. 

In [477]:
#### Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [478]:
#### Generate a pie plot showing the distribution of female versus male mice using pandas

In [479]:
#### Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [480]:
#### Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [481]:
#### Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [482]:
#### Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [483]:
#### Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [484]:
#### Calculate the correlation coefficient and linear regression model 
#### for mouse weight and average tumor volume for the Capomulin regimen
