## Observations and Insights 

In [30]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

#sanity check
#print(mouse_metadata)
#print(study_results)

# Combine the data into a single dataset
merged_data = pd.merge(mouse_metadata, study_results, how="outer", on="Mouse ID")

#sanity check
merged_data.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [31]:
# Checking the number of mice in the DataFrame.
len(merged_data)

1893

In [32]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

In [33]:
# Optional: Get all the data for the duplicate mouse ID.

In [126]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = merged_data.drop_duplicates("Mouse ID", keep = "last")

#sorting by Drug Regimen
sorted_clean_df = clean_df.sort_values(by = "Drug Regimen")

#sanity check
#sorted_clean_df

In [35]:
# Checking the number of mice in the clean DataFrame.
len(clean_df)

249

In [130]:
#visualize a df grouped by regimen
organized_no_count_df = clean_df.groupby("Drug Regimen")

organized_w_count_df = organized_no_count_df.count()

#sanity check
#organized_w_count_df

#checking the types
#organized_w_count_df.dtypes

In [128]:
#pulling the drug regimen names
drug_regimen = list(organized_w_count_df.index.values)

#sanity check
#drug_regimen

In [194]:
#df's with the seperate necessary calculations
mean_df = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].mean()
median_df = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].median()
var_df = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].var()

#sanity check
print(mean_df)

#sanity check
print(median_df)

#sanity check
print(var_df)

Drug Regimen
Capomulin    36.667568
Ceftamin     57.753977
Infubinol    58.178246
Ketapril     62.806191
Naftisol     61.205757
Placebo      60.508414
Propriva     56.736964
Ramicane     36.191390
Stelasyn     61.001707
Zoniferol    59.181258
Name: Tumor Volume (mm3), dtype: float64
Drug Regimen
Capomulin    38.125164
Ceftamin     59.851956
Infubinol    60.165180
Ketapril     64.487812
Naftisol     63.283288
Placebo      62.030594
Propriva     55.841410
Ramicane     36.561652
Stelasyn     62.192350
Zoniferol    61.840058
Name: Tumor Volume (mm3), dtype: float64
Drug Regimen
Capomulin     32.663378
Ceftamin      69.982735
Infubinol     74.010875
Ketapril      98.921330
Naftisol     106.029927
Placebo       78.759797
Propriva      69.349002
Ramicane      32.166354
Stelasyn      90.331586
Zoniferol     76.862027
Name: Tumor Volume (mm3), dtype: float64


## Summary Statistics

In [198]:
#try to merge the calculations you have so far
first_merge_df = pd.merge(mean_df, median_df, on="Drug Regimen")

#adding variance to the df
second_merge_df = pd.merge(first_merge_df, var_df, on='Drug Regimen')

#renaming the columns to reflect the claculations we have so far
second_merge_df = second_merge_df.rename(columns={"Tumor Volume (mm3)_x":"Avg. Tumor Vol.","Tumor Volume (mm3)_y":"Median Tumor Vol.","Tumor Volume (mm3)":"Variance"})

#sanity check
second_merge_df

Unnamed: 0_level_0,Avg. Tumor Vol.,Median Tumor Vol.,Variance
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Capomulin,36.667568,38.125164,32.663378
Ceftamin,57.753977,59.851956,69.982735
Infubinol,58.178246,60.16518,74.010875
Ketapril,62.806191,64.487812,98.92133
Naftisol,61.205757,63.283288,106.029927
Placebo,60.508414,62.030594,78.759797
Propriva,56.736964,55.84141,69.349002
Ramicane,36.19139,36.561652,32.166354
Stelasyn,61.001707,62.19235,90.331586
Zoniferol,59.181258,61.840058,76.862027


In [175]:
#calculate the standard deviation and sem






In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straightforward, creating multiple series and putting them all together at the end.





In [10]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function.

## Bar Plots

In [11]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [12]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [13]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [14]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [15]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [16]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [17]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [18]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [19]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
