## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
metadata_results = pd.merge(mouse_metadata,study_results)
metadata_results

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [2]:
# Checking the number of mice in the DataFrame.
total_mice = metadata_results['Mouse ID'].nunique()
total_mice

249

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicates_data = metadata_results.duplicated(subset=['Mouse ID','Timepoint'])
duplicates_data

0       False
1       False
2       False
3       False
4       False
        ...  
1888    False
1889    False
1890    False
1891    False
1892    False
Length: 1893, dtype: bool

In [4]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_duplicate = mouse_metadata['Mouse ID'].duplicated()
mouse_duplicate

0      False
1      False
2      False
3      False
4      False
       ...  
244    False
245    False
246    False
247    False
248    False
Name: Mouse ID, Length: 249, dtype: bool

In [9]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_data = duplicates_data.duplicated(keep=False)
clean_data

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [7]:
# Checking the number of mice in the clean DataFrame.
clean_data.nunique()

1

## Summary Statistics

In [26]:
metadata_results['Drug Regimen'].unique()

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [27]:
regimen = metadata_results.groupby('Drug Regimen')

In [36]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
summary_stats = regimen['Tumor Volume (mm3)'].describe().round(decimals=1)
summary_stats
# This method is the most straightforward, creating multiple series and putting them all together at the end.

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Capomulin,230.0,40.7,5.0,23.3,37.7,41.6,45.0,48.2
Ceftamin,178.0,52.6,6.3,45.0,47.2,51.8,56.8,68.9
Infubinol,178.0,52.9,6.6,36.3,47.3,51.8,57.3,72.2
Ketapril,188.0,55.2,8.3,45.0,48.2,53.7,60.9,78.6
Naftisol,186.0,54.3,8.1,45.0,47.3,52.5,60.0,76.7
Placebo,181.0,54.0,7.8,45.0,47.5,52.3,59.9,73.2
Propriva,161.0,52.3,6.5,45.0,47.1,50.9,56.2,72.5
Ramicane,228.0,40.2,4.8,22.1,36.7,40.7,45.0,47.6
Stelasyn,181.0,54.2,7.7,45.0,48.0,52.4,58.7,75.1
Zoniferol,182.0,53.2,7.0,45.0,47.3,51.8,58.0,73.3


## Bar Plots

In [9]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [10]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [14]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [15]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
