## Observations and Insights 

In [92]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merged_data = pd.merge(study_results, mouse_metadata, on='Mouse ID', how='left')

# Display the data table for preview
#1893 rows
merged_data.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [93]:
# Checking the number of mice.

len(merged_data['Mouse ID'].value_counts())

249

In [94]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
ID_groups = merged_data.loc[merged_data.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"].unique()
#find duplicates in mouse id & timpoint, print that mouse ID

ID_groups

array(['g989'], dtype=object)

In [95]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicated_data = merged_data.loc[merged_data["Mouse ID"] == 'g989']

duplicated_data

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.786801,0,Propriva,Female,21,26
360,g989,5,47.570392,0,Propriva,Female,21,26
620,g989,10,51.745156,0,Propriva,Female,21,26
681,g989,10,49.880528,0,Propriva,Female,21,26
815,g989,15,51.325852,1,Propriva,Female,21,26
869,g989,15,53.44202,0,Propriva,Female,21,26
950,g989,20,55.326122,1,Propriva,Female,21,26
1111,g989,20,54.65765,1,Propriva,Female,21,26


In [96]:
Count mice X
Group by mouse ID X
Use grouped data to search for duplicate timepoints for a given mouse ID
If duplicates are found, delete all rows with that mouse ID


SyntaxError: invalid syntax (<ipython-input-96-c7541bb84eaa>, line 1)

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

cleaned_df = merged_data[merged_data['Mouse ID'].isin(ID_groups) == False]
cleaned_df

In [None]:
# Checking the number of mice in the clean DataFrame.
len(cleaned_df['Mouse ID'].unique())

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_group = cleaned_df.groupby(['Drug Regimen'])
# This method is the most straighforward, creating multiple series and putting them all together at the end.

tumor_mean = drug_group.mean()['Tumor Volume (mm3)']
tumor_median = drug_group.median()['Tumor Volume (mm3)']
tumor_variance = drug_group.var()['Tumor Volume (mm3)']
tumor_SD = drug_group.std()['Tumor Volume (mm3)']
tumor_SEM =drug_group.sem()['Tumor Volume (mm3)']

tumor_summary = pd.DataFrame({
    'Tumor Mean' : tumor_mean,
    'Tumor Median' : tumor_median,
    'Tumor Variance' : tumor_variance,
    'Tumor Standard Deviation' : tumor_SD, 
    'Tumor SEM' : tumor_SEM
})

tumor_summary

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
summary_table = drug_group.agg({'Tumor Volume (mm3)': ["mean", "median", "var", "std", "sem"]})
# This method produces everything in a single groupby function
summary_table

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
drug_counts = cleaned_df['Drug Regimen'].value_counts()

drug_counts.plot(kind='bar')

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
plt.bar(drug_counts.index.values, drug_counts.values)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
sex_counts = cleaned_df['Sex'].value_counts()

sex_counts.plot(kind='pie')


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
#plt.pie(sex_counts.index.values)
# len(sex_counts['Male'].value_counts())
males = sex_counts['Male']
females = sex_counts['Female']
sexes = ['Males', 'Females']
sex_nums = [males, females]

plt.pie(sex_nums, labels=sexes)

## Quartiles, Outliers and Boxplots

In [97]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
big_four = merged_data.loc[(merged_data['Drug Regimen'] == "Capomulin") | 
                          (merged_data['Drug Regimen'] == "Ramicane") |
                          (merged_data['Drug Regimen'] == "Infubinol") |
                          (merged_data['Drug Regimen'] == "Ceftamin"), :]

big_four.head()

# Start by getting the last (greatest) timepoint for each mouse

# Start by getting the last (greatest) timepoint for each mouse
max_tumor = cleaned_df.groupby(["Mouse ID"])['Timepoint'].max()
max_tumor = max_tumor.reset_index()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
latest_data = max_tumor.merge(cleaned_df,on=['Mouse ID','Timepoint'],how="left")
merged_tumor_data.head()
# grouped_four = big_four.groupby('Mouse ID')
# big_four['Latest Timepoint'] = grouped_four['Timepoint'].max()

# big_four.head()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,a203,45,67.973419,2,Infubinol,Female,20,23
1,a251,45,65.525743,1,Infubinol,Female,21,25
2,a262,45,70.717621,4,Placebo,Female,17,29
3,a275,45,62.999356,3,Ceftamin,Female,20,28
4,a366,30,63.440686,1,Stelasyn,Female,16,29


In [None]:
# Put treatments into a list for for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol = []


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
