## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Locations of the two CSV inputs for the study
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load both CSV files into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Attach the mouse metadata to the study results; the left join on
# "Mouse ID" keeps every row of study_results
complete_data = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the first rows of the combined table
complete_data.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [None]:
# Count the distinct values in the "Mouse ID" column of the combined data
# (dropna=False so a missing ID would still be counted once, as unique() does)
total_mice = complete_data["Mouse ID"].nunique(dropna=False)

print(total_mice)

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair occurs more than once,
# then list the distinct mouse IDs those rows belong to
repeated_rows = complete_data.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
complete_data.loc[repeated_rows, "Mouse ID"].unique()

In [None]:
# Optional: show every row recorded for mouse ID "g989"
complete_data[complete_data["Mouse ID"].eq("g989")]

In [None]:
# Build the cleaned DataFrame by keeping only rows whose Mouse ID is not "g989"
rows_to_keep = complete_data["Mouse ID"].ne("g989")
cleaned_complete_data = complete_data.loc[rows_to_keep]

# Preview the first rows of the cleaned table
cleaned_complete_data.head()

In [None]:
# Count the distinct mouse IDs that remain in the cleaned DataFrame
check = cleaned_complete_data["Mouse ID"].nunique(dropna=False)
print(check)

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.

# Group once and reuse the grouped "Tumor Volume (mm3)" column for every statistic
tumor_by_regimen = cleaned_complete_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]

Mean = tumor_by_regimen.mean()
Median = tumor_by_regimen.median()
Variance = tumor_by_regimen.var()
StdDev = tumor_by_regimen.std()
StdErr = tumor_by_regimen.sem()

In [None]:
# Assemble the per-regimen statistic series into a single summary table:
# one column per statistic, one row per drug regimen
summary_columns = {
    "Mean Tumor Volume": Mean,
    "Median Tumor Volume": Median,
    "Tumor Volume Variance": Variance,
    "Tumor Volume Std. Dev.": StdDev,
    "Tumor Volume Std. Err.": StdErr,
}
Summary_data = pd.DataFrame(summary_columns)
Summary_data


In [None]:
# Produce the same summary statistics with a single aggregation call
tumor_statistics = ["mean", "median", "var", "std", "sem"]
by_regimen = cleaned_complete_data.groupby("Drug Regimen")
new_summary_statistic = by_regimen.agg({"Tumor Volume (mm3)": tumor_statistics})

new_summary_statistic

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
# IPython magic (not plain Python): selects matplotlib's inline backend so
# figures are rendered in the notebook output.
%matplotlib inline 


In [None]:
# Count how many rows of the cleaned data belong to each drug regimen
regimen_per_row = cleaned_complete_data["Drug Regimen"]
mice_data = regimen_per_row.value_counts()

mice_data

In [None]:
# Bar chart (pandas plotting) of the number of rows recorded for each drug regimen
plot_pandas = mice_data.plot(kind="bar", color="purple")
plot_pandas.set_xlabel("Drug Regimen")
# mice_data holds row counts from value_counts(), i.e. one count per recorded
# timepoint of each mouse, not a count of distinct mice, so label it as such
plot_pandas.set_ylabel("Number of Observed Timepoints")
plot_pandas.set_title("Total Timepoints per Drug Regimen")

In [None]:
# Inputs for the pyplot version of the bar chart: the regimen names (index of
# mice_data) and the matching row counts (values of mice_data)
drugs, micefigures = mice_data.index.values, mice_data.values

In [None]:
# Bar chart (pyplot) of the number of rows recorded for each drug regimen
x_axis = np.arange(len(drugs))
# One tick per bar, at the bar's own x position
tick_locations = list(x_axis)

plt.bar(x_axis, micefigures, facecolor="yellow", alpha=1, align="center")
plt.xticks(tick_locations, drugs, rotation="vertical")

# micefigures are row counts (one per recorded timepoint of each mouse),
# not counts of distinct mice, so the labels say timepoints
plt.title("Total Timepoints per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Observed Timepoints")
plt.show()

In [None]:
# Count the rows of the cleaned data for each value of "Sex"
# (input for the pandas pie plot of female versus male mice)
sex_per_row = cleaned_complete_data["Sex"]
gender = sex_per_row.value_counts()
gender

In [None]:
# Pie chart (pandas plotting) of the counts in gender, with percentage labels
pie_axes = gender.plot(kind="pie", y="Sex", startangle=360, autopct="%1.1f%%")
pie_axes.set_title("Female vs. Male Mice")
plt.show()

In [None]:
# Inputs for the pyplot pie chart of female versus male mice.
# Derive labels and sizes from the data instead of hard-coding them, so they
# stay correct (and stay paired in the right order) if the data changes.
sex_counts = cleaned_complete_data["Sex"].value_counts()
labels = tuple(sex_counts.index)
sizes = sex_counts.tolist()


In [None]:
# Pie chart (pyplot) of sizes, one wedge per entry in labels, with percentage labels
fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    autopct="%1.1f%%",
    shadow=True,
    startangle=360,
)
ax1.set_title("Females vs. Male Mice")
ax1.set_ylabel('Sex')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# First step towards the final tumor volume of each mouse on four regimens
# (Capomulin, Ramicane, Infubinol, Ceftamin): split the cleaned data into
# one table per regimen
regimen_of_row = cleaned_complete_data["Drug Regimen"]

Capomulin_table = cleaned_complete_data[regimen_of_row == "Capomulin"]
Ramicane_table = cleaned_complete_data[regimen_of_row == "Ramicane"]
Infubinol_table = cleaned_complete_data[regimen_of_row == "Infubinol"]
Ceftamin_table = cleaned_complete_data[regimen_of_row == "Ceftamin"]

In [None]:
def _last_timepoint_per_mouse(regimen_table):
    """Return the greatest "Timepoint" recorded for each "Mouse ID" in regimen_table."""
    return regimen_table.groupby("Mouse ID")["Timepoint"].max()


def _rows_at_last_timepoint(last_timepoints):
    """Left-merge last_timepoints with cleaned_complete_data on Mouse ID and Timepoint."""
    return pd.merge(
        last_timepoints,
        cleaned_complete_data,
        on=("Mouse ID", "Timepoint"),
        how="left",
    )


# Last (greatest) timepoint of each mouse, per regimen
LastTimepoint_Campomulin = _last_timepoint_per_mouse(Capomulin_table)
Lasttimepoint_Ramicane = _last_timepoint_per_mouse(Ramicane_table)
Lasttimepoint_Infubinol = _last_timepoint_per_mouse(Infubinol_table)
Lasttimepoint_Ceftamin = _last_timepoint_per_mouse(Ceftamin_table)

# Merge back with the cleaned data to pick up the tumor volume at that timepoint
merged_campomulin = _rows_at_last_timepoint(LastTimepoint_Campomulin)
merged_Ramicane = _rows_at_last_timepoint(Lasttimepoint_Ramicane)
merged_Infubinol = _rows_at_last_timepoint(Lasttimepoint_Infubinol)
merged_Ceftamin = _rows_at_last_timepoint(Lasttimepoint_Ceftamin)

In [None]:
# Stack the four per-regimen final-timepoint tables into one table with a
# fresh 0..n-1 index
final_timepoint_tables = [
    merged_campomulin,
    merged_Ramicane,
    merged_Infubinol,
    merged_Ceftamin,
]
total_drugs = pd.concat(final_timepoint_tables, ignore_index=True)
total_drugs

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volumes per treatment, in treatment_list order (for plotting)
vol_list = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in treatment_list:

    print(f"{drug}'s Quartiles, Median and Outliers")
    print("******************************************")

    # Tumor volumes of the rows in total_drugs that belong to this drug;
    # rows and column are selected in a single .loc call (no chained indexing)
    finalvolumeforeachdrug = total_drugs.loc[
        total_drugs["Drug Regimen"] == drug, "Tumor Volume (mm3)"
    ]
    vol_list.append(finalvolumeforeachdrug)

    # Quartiles and interquartile range of this drug's volumes
    quartiles = finalvolumeforeachdrug.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Reported for every drug
    print(f"The lower quartile of {drug} is: {round(lowerq, 2)}")
    print(f"The upper quartile of {drug} is: {round(upperq, 2)}")
    print(f"The interquartile range of {drug} is: {round(iqr, 2)}")
    print(f"The median of {drug} drug is: {round(quartiles[0.5], 2)} ")

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    is_outlier = (finalvolumeforeachdrug < lower_bound) | (finalvolumeforeachdrug > upper_bound)
    outliersifany = finalvolumeforeachdrug.loc[is_outlier]

    print(f"Values below {round(lower_bound, 2)} could be outliers.")
    print(f"Values above {round(upper_bound, 2)} could be outliers.")
    print(f"{drug}'s potential ouliers: {outliersifany}")
    print("----------------------------------------------------------------------------------------")

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens of interest
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
# The first four series collected in vol_list, one per box
data = [vol_list[position] for position in range(4)]

# Marker style for points drawn as fliers (outliers)
outlier_marker_style = dict(
    marker='o',
    markerfacecolor='b',
    markersize=15,
    linestyle='none',
    markeredgecolor='r',
)

fig1, ax1 = plt.subplots()
ax1.set_title('Final Drug Regimen')
ax1.set_ylabel('Final Tumor Volume(mm3)')
ax1.boxplot(data, flierprops=outlier_marker_style, labels=treatment_list)
plt.show()

## Line and Scatter Plots

In [None]:
# Data for the line plot of tumor volume vs. time point: the Capomulin rows
# recorded for mouse "j246"
is_mouse_j246 = Capomulin_table["Mouse ID"] == "j246"
Campomulinonj246 = Capomulin_table[is_mouse_j246]
Campomulinonj246

In [None]:
# Line plot of tumor volume vs. time point for mouse j246 (rows taken from
# the Capomulin table)
x_axis = Campomulinonj246["Timepoint"]
y_axis = Campomulinonj246["Tumor Volume (mm3)"]
plt.plot(x_axis, y_axis)
# Regimen name spelled as in the data ("Capomulin")
plt.title("Capomulin treatment of mouse j246")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume(mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Per-mouse averages for the Capomulin regimen; these feed the correlation
# coefficient and linear regression of mouse weight vs. average tumor volume.
# numeric_only=True skips the text columns (e.g. "Drug Regimen", "Sex"),
# which newer pandas versions refuse to average instead of silently dropping.
avg_campomulin_vol = Capomulin_table.groupby("Mouse ID").mean(numeric_only=True)
avg_campomulin_vol

In [None]:
# Pearson correlation between per-mouse weight and per-mouse average tumor volume
mouse_weights = avg_campomulin_vol["Weight (g)"]
mean_tumor_volumes = avg_campomulin_vol["Tumor Volume (mm3)"]
correlation = st.pearsonr(mouse_weights, mean_tumor_volumes)

# The first element of the result is the correlation coefficient
print(f"The correlation between both factors is {round(correlation[0],2)}")


In [None]:
# Scatter plot of average tumor volume vs. mouse weight, with the fitted
# least-squares regression line drawn on top
x_values = avg_campomulin_vol["Weight (g)"]
y_values = avg_campomulin_vol["Tumor Volume (mm3)"]

# Fit y = slope * x + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (top-left corner) so it is
# always inside the visible plot area regardless of the data range, and draw
# it in the same red as the regression line so it is legible
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()