## Observations and Insights 

In [50]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata_df = pd.read_csv(mouse_metadata_path)
study_results_df = pd.read_csv(study_results_path)

# Combine the data into a single dataset
pymaceutical_df = pd.merge(mouse_metadata_df, study_results_df, how="left", on=["Mouse ID", "Mouse ID"])
# Display the data table for preview


In [51]:
pymaceutical_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1893 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Mouse ID             1893 non-null   object 
 1   Drug Regimen         1893 non-null   object 
 2   Sex                  1893 non-null   object 
 3   Age_months           1893 non-null   int64  
 4   Weight (g)           1893 non-null   int64  
 5   Timepoint            1893 non-null   int64  
 6   Tumour Volume (mm3)  1893 non-null   float64
 7   Metastatic Sites     1893 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 133.1+ KB


In [52]:
pymaceutical_df.columns

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumour Volume (mm3)', 'Metastatic Sites'],
      dtype='object')

In [53]:
mice_count = len(pymaceutical_df["Mouse ID"].unique())
mice_count

249

In [54]:
pymaceutical_df["Mouse ID"].nunique()

249

In [55]:
# Checking the number of mice.
pymaceutical_df["Mouse ID"]

0       k403
1       k403
2       k403
3       k403
4       k403
        ... 
1888    z969
1889    z969
1890    z969
1891    z969
1892    z969
Name: Mouse ID, Length: 1893, dtype: object

In [56]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
solo_duplicated_df=pymaceutical_df.loc[pymaceutical_df.duplicated(subset=["Mouse ID", "Timepoint"])]
solo_duplicated_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumour Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [57]:
pymaceutical_df.loc[pymaceutical_df.duplicated(subset=["Mouse ID", "Timepoint"]),"Mouse ID" ].unique()

array(['g989'], dtype=object)

In [58]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicated_data_df=pymaceutical_df.loc[pymaceutical_df["Mouse ID"] == "g989" ]
duplicated_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumour Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [59]:
new_duplicated_data_df=pymaceutical_df.drop_duplicates()
new_duplicated_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumour Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [60]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
new_duplicated_data_subset_df=pymaceutical_df.drop_duplicates(subset=["Mouse ID", "Timepoint"])
new_duplicated_data_subset_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumour Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [61]:
# Checking the number of mice in the clean DataFrame.
new_duplicated_data_subset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1888 entries, 0 to 1892
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Mouse ID             1888 non-null   object 
 1   Drug Regimen         1888 non-null   object 
 2   Sex                  1888 non-null   object 
 3   Age_months           1888 non-null   int64  
 4   Weight (g)           1888 non-null   int64  
 5   Timepoint            1888 non-null   int64  
 6   Tumour Volume (mm3)  1888 non-null   float64
 7   Metastatic Sites     1888 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 132.8+ KB


In [62]:
new_duplicated_data_subset_df["Mouse ID"].nunique()

249

## Summary Statistics

In [63]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumour volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumour volume. 
# Assemble the resulting series into a single summary dataframe.

In [64]:
pymaceutical_df.columns

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumour Volume (mm3)', 'Metastatic Sites'],
      dtype='object')

In [65]:
pymaceutical_df["Drug Regimen"].unique()

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [66]:
summary_statistic_start_df = pymaceutical_df[["Drug Regimen","Tumour Volume (mm3)"]]
summary_statistic_start_df.head()

Unnamed: 0,Drug Regimen,Tumour Volume (mm3)
0,Ramicane,45.0
1,Ramicane,38.825898
2,Ramicane,35.014271
3,Ramicane,34.223992
4,Ramicane,32.997729


In [67]:
summary_statistics_group_df = summary_statistic_start_df.groupby(["Drug Regimen"])
summary_statistics_group_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fa990fa2210>

In [78]:
mean_df = summary_statistics_group_df.mean()
mean_df = mean_df.rename(columns={"Tumour Volume (mm3)":"Mean - Tumour Volume (mm3)"})
mean_df

Unnamed: 0_level_0,Mean - Tumour Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,40.675741
Ceftamin,52.591172
Infubinol,52.884795
Ketapril,55.235638
Naftisol,54.331565
Placebo,54.033581
Propriva,52.322552
Ramicane,40.216745
Stelasyn,54.233149
Zoniferol,53.236507


In [79]:
median_df =summary_statistics_group_df.median()
median_df = median_df.rename(columns={"Tumour Volume (mm3)":"Median - Tumour Volume (mm3)"})
median_df

Unnamed: 0_level_0,Median - Tumour Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,41.557809
Ceftamin,51.776157
Infubinol,51.820584
Ketapril,53.698743
Naftisol,52.509285
Placebo,52.288934
Propriva,50.854632
Ramicane,40.673236
Stelasyn,52.431737
Zoniferol,51.818479


In [70]:
variance_df = summary_statistics_group_df.var()
variance_df = variance_df.rename(columns={"Tumour Volume (mm3)":"Variance - Tumour Volume (mm3)"})
variance_df

Unnamed: 0_level_0,Tumour Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,24.947764
Ceftamin,39.290177
Infubinol,43.128684
Ketapril,68.553577
Naftisol,66.173479
Placebo,61.168083
Propriva,42.35107
Ramicane,23.486704
Stelasyn,59.450562
Zoniferol,48.533355


In [80]:
std_df = summary_statistics_group_df.std()
std_df = std_df.rename(columns={"Tumour Volume (mm3)":"Standard Deviation - Tumour Volume (mm3)"})
std_df

Unnamed: 0_level_0,Standard Deviation - Tumour Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,4.994774
Ceftamin,6.268188
Infubinol,6.567243
Ketapril,8.279709
Naftisol,8.134708
Placebo,7.821003
Propriva,6.50777
Ramicane,4.846308
Stelasyn,7.710419
Zoniferol,6.966589


In [81]:
SEM_df = summary_statistics_group_df.sem()
SEM_df = SEM_df.rename(columns={"Tumour Volume (mm3)":"SEM - Tumour Volume (mm3)"})
SEM_df

Unnamed: 0_level_0,SEM - Tumour Volume (mm3)
Drug Regimen,Unnamed: 1_level_1
Capomulin,0.329346
Ceftamin,0.469821
Infubinol,0.492236
Ketapril,0.60386
Naftisol,0.596466
Placebo,0.581331
Propriva,0.512884
Ramicane,0.320955
Stelasyn,0.573111
Zoniferol,0.516398


In [None]:
summary_statistics_df = pd.DataFrame ({"Mean - Tumour Volume (mm3)":mean_df("Mean - Tumour Volume (mm3)"), "Total Students":[total_number_of_students], "Total Budget":[total_budget], "Average Maths Score":[average_maths_score], "Average Reading Score":[average_reading_score], "% Passing Maths":[Percent_Passing_maths], "% Passing Reading":[Percent_Passing_reading], "% Overall Passing":[total_passing_maths_reading]})
summary_statistics_df

In [85]:
summary_statistics_df = pd.merge(mean_df, median_df,variance_df,std_df,SEM_df, how="outer", on=["Drug Regimen"])
summary_statistics_df

TypeError: merge() got multiple values for argument 'how'

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumour volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.



In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumour volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumour vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumour volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumour volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumour volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumour volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumour volume for the Capomulin regimen
