## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
Combined_data= pd.merge(study_results, mouse_metadata, how="left", on="Mouse ID")

# Display the data table for preview
Combined_data.head()





Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [2]:
# Checking the number of mice.
mice=Combined_data["Mouse ID"].value_counts()
number_of_mice=len(mice)
number_of_mice

249

In [3]:
mice=Combined_data.value_counts(["Mouse ID",'Timepoint'])
print(mice)

Mouse ID  Timepoint
g989      15           2
          0            2
          10           2
          5            2
          20           2
                      ..
q610      30           1
          35           1
q633      0            1
          5            1
a203      0            1
Length: 1888, dtype: int64


In [53]:
mice=Combined_data.index.value_counts()
print(mice)



g989    13
s337    10
x401    10
m550    10
x773    10
        ..
l872     1
n482     1
d133     1
v199     1
x336     1
Name: Mouse ID, Length: 249, dtype: int64


In [58]:
mice=Combined_data.groupby(["Mouse ID", "Timepoint"]).count()
#print(duplicated_mice.duplicated(["Timepoint"]).value_counts())
#print(mice.value_counts(ascending=False))
print(mice)

                    Tumor Volume (mm3)  Metastatic Sites  Drug Regimen  Sex  \
Mouse ID Timepoint                                                            
a203     0                           1                 1             1    1   
         5                           1                 1             1    1   
         10                          1                 1             1    1   
         15                          1                 1             1    1   
         20                          1                 1             1    1   
...                                ...               ...           ...  ...   
z969     25                          1                 1             1    1   
         30                          1                 1             1    1   
         35                          1                 1             1    1   
         40                          1                 1             1    1   
         45                          1              

In [59]:
#duplicate mice
duplicate_mice=Combined_data[Combined_data.duplicated(['Timepoint'])]
print(duplicate_mice.value_counts())

Timepoint  Tumor Volume (mm3)  Metastatic Sites  Drug Regimen  Sex     Age_months  Weight (g)
0          45.000000           0                 Infubinol     Male    23          26            3
                                                 Placebo       Male    5           30            2
                                                 Zoniferol     Male    5           30            2
                                                 Ramicane      Male    11          16            2
                                                 Placebo       Female  16          25            2
                                                                                                ..
25         58.098489           0                 Stelasyn      Male    3           30            1
           58.214624           3                 Placebo       Female  13          26            1
           58.269889           2                 Stelasyn      Female  1           27            1
           58.3

In [None]:
duplicated=Combined_data[Combined_data.duplicated(['Mouse ID', 'Timepoint'],keep=False)]
print(duplicated["Mouse ID"].value_counts())
print(Combined_data["Mouse ID"].value_counts())

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_rows = Combined_data[Combined_data.duplicated(['Mouse ID', 'Timepoint'])]
print(duplicate_rows.value_counts())



In [18]:
#print(Combined_data.head())
print(Combined_data.loc["g989"])
#Combined_data=Combined_data.set_index("Mouse ID")
#duplicate_mice=pd.DataFrame(Combined_data.groupby(["Mouse ID"]))
#print(duplicate_mice.head())

          Timepoint  Tumor Volume (mm3)  Metastatic Sites Drug Regimen  \
Mouse ID                                                                 
g989              0           45.000000                 0     Propriva   
g989              0           45.000000                 0     Propriva   
g989              5           48.786801                 0     Propriva   
g989              5           47.570392                 0     Propriva   
g989             10           51.745156                 0     Propriva   
g989             10           49.880528                 0     Propriva   
g989             15           51.325852                 1     Propriva   
g989             15           53.442020                 0     Propriva   
g989             20           55.326122                 1     Propriva   
g989             20           54.657650                 1     Propriva   
g989             25           56.045564                 1     Propriva   
g989             30           59.08229

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 



In [48]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df_indices=Combined_data.index.duplicated(keep="first")
print (Combined_data.index)
print (clean_df_indices)
print(~ clean_df_indices)

clean_df=Combined_data[~clean_df_indices]
print (clean_df)
print (len(clean_df))

Index(['b128', 'f932', 'g107', 'a457', 'c819', 'h246', 'p189', 'n923', 'q119',
       'f993',
       ...
       't718', 'o562', 'i177', 'v991', 'z795', 'r944', 'u364', 'p438', 'x773',
       'b879'],
      dtype='object', name='Mouse ID', length=1893)
[False False False ...  True  True  True]
[ True  True  True ... False False False]
          Timepoint  Tumor Volume (mm3)  Metastatic Sites Drug Regimen  \
Mouse ID                                                                 
b128              0                45.0                 0    Capomulin   
f932              0                45.0                 0     Ketapril   
g107              0                45.0                 0     Ketapril   
a457              0                45.0                 0     Ketapril   
c819              0                45.0                 0     Ketapril   
...             ...                 ...               ...          ...   
t565              0                45.0                 0    Capomulin  

In [64]:
print (clean_df_indices)
clean_df=Combined_data[clean_df_indices]
print(len(Combined_data.loc["g989"]))

[False False False ...  True  True  True]
13


In [50]:
# Checking the number of mice in the clean DataFrame.
#Combined_data.count_values()
#print(Combined_data.count_values())
print (clean_df.nunique())
print(clean_df.index)
print(clean_df.loc["g989"])

Timepoint              1
Tumor Volume (mm3)     1
Metastatic Sites       1
Drug Regimen          10
Sex                    2
Age_months            24
Weight (g)            16
dtype: int64
Index(['b128', 'f932', 'g107', 'a457', 'c819', 'h246', 'p189', 'n923', 'q119',
       'f993',
       ...
       'y793', 'g316', 'g288', 'l897', 'w150', 't565', 'i557', 'm957', 'f966',
       'm601'],
      dtype='object', name='Mouse ID', length=249)
Timepoint                    0
Tumor Volume (mm3)          45
Metastatic Sites             0
Drug Regimen          Propriva
Sex                     Female
Age_months                  21
Weight (g)                  26
Name: g989, dtype: object


## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
