## Observations and Insights

## Dependencies and starter code

In [35]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np


In [36]:
# Paths to the two input CSV files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Load the mouse data and the study results into DataFrames
mouse_df = pd.read_csv(mouse_metadata)
study_df = pd.read_csv(study_results)

# Join the two tables on their shared "Mouse ID" column
# (inner join, the pandas default) to get a single dataset
merged_df = mouse_df.merge(study_df, on="Mouse ID")
merged_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [37]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of the merged dataset
merged_df.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,1893.0,1893.0,1893.0,1893.0,1893.0
mean,12.81458,25.662441,19.572108,50.448381,1.021659
std,7.189592,3.921622,14.07946,8.894722,1.137974
min,1.0,15.0,0.0,22.050126,0.0
25%,7.0,25.0,5.0,45.0,0.0
50%,13.0,27.0,20.0,48.951474,1.0
75%,20.0,29.0,30.0,56.2922,2.0
max,24.0,30.0,45.0,78.567014,4.0


## Summary statistics

In [38]:
# Mean tumor volume for every (drug regimen, timepoint) combination.
# NOTE(review): only the per-timepoint mean is computed here; a single
# per-regimen summary table (mean, median, variance, standard deviation, SEM)
# is not built by this cell.
group_by_drug_tp = merged_df.groupby(["Drug Regimen", "Timepoint"])
mean_tumor_volume = (
    group_by_drug_tp["Tumor Volume (mm3)"]
    .mean()
    .reset_index()  # turn the group keys back into regular columns
)
mean_tumor_volume_df = pd.DataFrame(mean_tumor_volume)
mean_tumor_volume_df.head()


Unnamed: 0,Drug Regimen,Timepoint,Tumor Volume (mm3)
0,Capomulin,0,45.0
1,Capomulin,5,44.266086
2,Capomulin,10,43.084291
3,Capomulin,15,42.064317
4,Capomulin,20,40.716325


In [39]:
# Standard error of the mean (SEM) of tumor volume for every
# (drug regimen, timepoint) combination.
# NOTE(review): the "stand_deviation_*" names suggest standard deviation, but
# .sem() computes the standard error of the mean — confirm which was intended.
stand_deviation_drug_timepoint = merged_df.groupby(["Drug Regimen", "Timepoint"])
stand_deviation_tumor = (
    stand_deviation_drug_timepoint["Tumor Volume (mm3)"]
    .sem()
    .reset_index()  # turn the group keys back into regular columns
)

# Wrap the result in a DataFrame
stand_deviation_tumor_df = pd.DataFrame(stand_deviation_tumor)

# Show the first five rows
stand_deviation_tumor_df.head()

Unnamed: 0,Drug Regimen,Timepoint,Tumor Volume (mm3)
0,Capomulin,0,0.0
1,Capomulin,5,0.448593
2,Capomulin,10,0.702684
3,Capomulin,15,0.838617
4,Capomulin,20,0.909731


In [40]:
# Median tumor volume for every (drug regimen, timepoint) combination
group_by_drug_tp = merged_df.groupby(["Drug Regimen", "Timepoint"])
median_tumor_volume = (
    group_by_drug_tp["Tumor Volume (mm3)"]
    .median()
    .reset_index()  # turn the group keys back into regular columns
)
median_tumor_volume_df = pd.DataFrame(median_tumor_volume)
median_tumor_volume_df.head()

Unnamed: 0,Drug Regimen,Timepoint,Tumor Volume (mm3)
0,Capomulin,0,45.0
1,Capomulin,5,45.597064
2,Capomulin,10,43.421014
3,Capomulin,15,42.79816
4,Capomulin,20,40.716428


## Bar plots

In [41]:
# Generate a bar plot showing the number of data points for each treatment regimen using pandas



In [47]:
# Reshape the per-timepoint means to wide form: one row per timepoint and one
# column per drug regimen. reset_index() makes "Timepoint" a regular column.
pivot_table = mean_tumor_volume_df.pivot(
    index="Timepoint",
    columns="Drug Regimen",
    values="Tumor Volume (mm3)",
).reset_index()

# Preview the formatted table
# NOTE(review): as a non-final expression this is presumably not rendered by
# the notebook — confirm whether it is still wanted.
pivot_table.head()

# Keep the timepoint plus the four regimens of interest
table_fourdrugs = pivot_table[
    ["Timepoint", "Capomulin", "Infubinol", "Ketapril", "Placebo"]
]
table_fourdrugs.head()

Drug Regimen,Timepoint,Capomulin,Infubinol,Ketapril,Placebo
0,0,45.0,45.0,45.0,45.0
1,5,44.266086,46.541247,47.389175,47.125589
2,10,43.084291,49.403909,49.582269,49.423329
3,15,42.064317,51.296397,52.399974,51.359742
4,20,40.716325,53.197691,54.920935,54.364417


## Pie plots

In [5]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [6]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, outliers and boxplots

In [7]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [8]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [9]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [10]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [11]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen