In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [None]:
# Relative paths to the two CSV inputs used by the study
study_results_path = "Resources/Study_results.csv"
mouse_metadata_path = "Resources/Mouse_metadata.csv"

In [None]:
# Load the mouse metadata CSV into a DataFrame and display it
mouse_metadata = pd.read_csv(filepath_or_buffer=mouse_metadata_path)
mouse_metadata

In [None]:
# Load the study results CSV into a DataFrame and display it
study_results = pd.read_csv(filepath_or_buffer=study_results_path)
study_results

In [None]:
# Outer-join the metadata with the study results on the shared "Mouse ID" key
combined_mousestudy_df = mouse_metadata.merge(study_results, on="Mouse ID", how="outer")

In [None]:
# Preview the first five rows of the merged table
combined_mousestudy_df.head(5)

In [None]:
# Number of distinct Mouse ID values in the merged data
mice_count = combined_mousestudy_df.loc[:, "Mouse ID"].nunique(dropna=True)
mice_count

In [None]:
# Each row should be uniquely identified by the (Mouse ID, Timepoint) pair.
# Keep the rows that repeat an earlier pair, then collect the Mouse IDs involved.
repeated_rows = combined_mousestudy_df[
    combined_mousestudy_df.duplicated(subset=["Mouse ID", "Timepoint"])
]
duplicate_mouse = repeated_rows["Mouse ID"].unique()
duplicate_mouse

In [None]:
# Optional: show every row belonging to the duplicated mouse ID(s).
# The IDs come from `duplicate_mouse` (computed in the duplicate check) rather
# than a hard-coded ID, so this stays correct if the input data changes.
duplicate_mouse_rows = combined_mousestudy_df.loc[combined_mousestudy_df["Mouse ID"].isin(duplicate_mouse), :]
duplicate_mouse_rows

In [None]:
# Clean DataFrame: drop every row whose Mouse ID is one of the duplicated IDs
combined_duplicate_removal = combined_mousestudy_df[
    ~combined_mousestudy_df["Mouse ID"].isin(duplicate_mouse)
]
combined_duplicate_removal.head(5)

In [None]:
# Number of distinct Mouse ID values left in the clean DataFrame
duplicate_removal_clean = combined_duplicate_removal.loc[:, "Mouse ID"].nunique(dropna=True)
duplicate_removal_clean

# Summary Statistics 

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation and SEM.

# Group the tumor volume column by regimen once and reuse the grouping
tumor_volume_by_regimen = combined_duplicate_removal.groupby("Drug Regimen")["Tumor Volume (mm3)"]

data_mean = tumor_volume_by_regimen.mean()
data_median = tumor_volume_by_regimen.median()
data_variance = tumor_volume_by_regimen.var()
data_std = tumor_volume_by_regimen.std()
data_sem = tumor_volume_by_regimen.sem()

# Assemble the resulting series into a single summary DataFrame
summary_stat_table = pd.DataFrame(
    {
        "Mean Tumor Volume": data_mean,
        "Median Tumor Volume": data_median,
        "Tumor Volume Variance": data_variance,
        "Tumor Volume Standard Deviation": data_std,
        "Tumor Volume SEM": data_sem,
    }
)
summary_stat_table

In [None]:
# Alternative: the same per-regimen statistics (mean, median, variance,
# standard deviation, SEM of tumor volume) produced by a single .agg() call
summary_aggregate = (
    combined_duplicate_removal
    .groupby(["Drug Regimen"])[["Tumor Volume (mm3)"]]
    .agg(["mean", "median", "var", "std", "sem"])
)

summary_aggregate

# Bar and Pie Charts

In [None]:
# Number of rows in the clean data recorded under each drug regimen
drug_per_mouse = combined_duplicate_removal.loc[:, "Drug Regimen"].value_counts()
drug_per_mouse

In [None]:
# Pandas bar chart: total number of rows (Mouse ID/Timepoints) per drug regimen
mouse_bar_plot = drug_per_mouse.plot(kind="bar", color="blue", alpha=0.5)

# Label the axes and title through the Axes object returned by pandas
mouse_bar_plot.set_xlabel("Drug Regimen")
mouse_bar_plot.set_ylabel("# of Observed Mouse Timepoints")
mouse_bar_plot.set_title("# of Mouse per Treatment")

In [None]:
# Pyplot bar chart: total number of rows (Mouse ID/Timepoints) per drug regimen,
# built from the same counts as the pandas chart
x_axis = drug_per_mouse.index.values
y_axis = drug_per_mouse.values

plt.bar(x_axis, y_axis, align='center', alpha=0.5, color="blue")
plt.xticks(rotation="vertical")

# Axis labels and title
plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")
plt.title("# of Mouse per Treatment")

plt.show()

In [None]:
# Pandas pie chart of how the clean data's rows split between the sexes
gender_data = combined_duplicate_removal.loc[:, "Sex"].value_counts()
plt.title("Female vs Male mice")
gender_data.plot(kind="pie", autopct="%1.1f%%")
plt.show()

In [None]:
# Pyplot pie chart of the female versus male distribution.
# Labels and wedge sizes are read from `gender_data` (the value counts of the
# "Sex" column) instead of hard-coded numbers, and the chart is drawn with
# plt.pie so this cell really is the pyplot counterpart of the pandas chart.
labels = gender_data.index
sizes = gender_data.values
plt.pie(sizes, labels=labels, autopct="%1.1f%%")
plt.title("Female vs Male mice")
plt.ylabel('Sex')
plt.show()

# Quartiles, Outliers and Boxplots

In [None]:
# Split the clean data into one DataFrame per treatment regimen of interest:
# Capomulin, Ramicane, Infubinol, and Ceftamin
Capomulin_df = combined_duplicate_removal[combined_duplicate_removal["Drug Regimen"].eq("Capomulin")]
Ramicane_df = combined_duplicate_removal[combined_duplicate_removal["Drug Regimen"].eq("Ramicane")]
Infubinol_df = combined_duplicate_removal[combined_duplicate_removal["Drug Regimen"].eq("Infubinol")]
Ceftamin_df = combined_duplicate_removal[combined_duplicate_removal["Drug Regimen"].eq("Ceftamin")]
Capomulin_df.head(5)

In [None]:
# Capomulin (1 of 4): take the last (greatest) timepoint of each mouse, then
# left-merge onto the clean data to pick up the tumor volume at that timepoint
Capomulin_last = Capomulin_df.groupby("Mouse ID")["Timepoint"].max()
Capomulin_volume = Capomulin_last.to_frame()
Capomulin_merge = Capomulin_volume.merge(
    combined_duplicate_removal, how="left", on=["Mouse ID", "Timepoint"]
)
Capomulin_merge.head(5)

In [None]:
# Tumor volumes at each Capomulin mouse's last timepoint
Capomulin_tumors = Capomulin_merge.loc[:, "Tumor Volume (mm3)"]

In [None]:
# Capomulin quartiles (25%, 50%, 75%) and the interquartile range
quartile = Capomulin_tumors.quantile(q=[0.25, 0.5, 0.75])
lowerq, upperq = quartile.loc[0.25], quartile.loc[0.75]
iqr = upperq - lowerq

In [None]:
# Report the Capomulin quartiles, IQR and median (one line each)
print(
    f"The lower quartile of Capomulin tumors: {lowerq}",
    f"The upper quartile of Capomulin tumors: {upperq}",
    f"The interquartile range of Capomulin tumors: {iqr}",
    f"The median of Capomulin tumors: {quartile[0.5]} ",
    sep="\n",
)

In [None]:
# Capomulin outlier fences: 1.5 x IQR below the lower / above the upper quartile
lower_bound, upper_bound = lowerq - (1.5 * iqr), upperq + (1.5 * iqr)

In [None]:
# Print the outlier bounds for Capomulin
print(
    f"Capomulin values below {lower_bound} "
    f"and above {upper_bound} could be potential outliers."
)

In [None]:
# Ramicane (2 of 4): last timepoint per mouse, then left-merge onto the clean
# data to pick up the tumor volume at that timepoint
Ramicane_last = Ramicane_df.groupby("Mouse ID")["Timepoint"].max()
Ramicane_volume = Ramicane_last.to_frame()
Ramicane_merge = Ramicane_volume.merge(
    combined_duplicate_removal, how="left", on=["Mouse ID", "Timepoint"]
)
Ramicane_merge.head(5)

In [None]:
# Tumor volumes at each Ramicane mouse's last timepoint
Ramicane_tumors = Ramicane_merge.loc[:, "Tumor Volume (mm3)"]

In [None]:
# Ramicane quartiles (25%, 50%, 75%) and the interquartile range
quartile2 = Ramicane_tumors.quantile(q=[0.25, 0.5, 0.75])
lowerq2, upperq2 = quartile2.loc[0.25], quartile2.loc[0.75]
iqr2 = upperq2 - lowerq2

In [None]:
# Report the Ramicane quartiles, IQR and median (one line each)
print(
    f"The lower quartile of Ramicane tumors: {lowerq2}",
    f"The upper quartile of Ramicane tumors: {upperq2}",
    f"The interquartile range of Ramicane tumors: {iqr2}",
    f"The median of Ramicane tumors: {quartile2[0.5]} ",
    sep="\n",
)

In [None]:
# Ramicane outlier fences: 1.5 x IQR below the lower / above the upper quartile.
# The fences must be built from Ramicane's own IQR (iqr2), not Capomulin's `iqr`.
lower_bound2 = lowerq2 - (1.5*iqr2)
upper_bound2 = upperq2 + (1.5*iqr2)

In [None]:
# Print the outlier bounds for Ramicane
print(
    f"Ramicane values below {lower_bound2} "
    f"and above {upper_bound2} could be potential outliers."
)

In [None]:
# Infubinol (3 of 4): last timepoint per mouse, then left-merge onto the clean
# data to pick up the tumor volume at that timepoint
Infubinol_last = Infubinol_df.groupby("Mouse ID")["Timepoint"].max()
Infubinol_volume = Infubinol_last.to_frame()
Infubinol_merge = Infubinol_volume.merge(
    combined_duplicate_removal, how="left", on=["Mouse ID", "Timepoint"]
)
Infubinol_merge.head(5)

In [None]:
# Tumor volumes at each Infubinol mouse's last timepoint
Infubinol_tumors = Infubinol_merge.loc[:, "Tumor Volume (mm3)"]

In [None]:
# Infubinol quartiles (25%, 50%, 75%) and the interquartile range
quartile3 = Infubinol_tumors.quantile(q=[0.25, 0.5, 0.75])
lowerq3, upperq3 = quartile3.loc[0.25], quartile3.loc[0.75]
iqr3 = upperq3 - lowerq3

In [None]:
# Report the Infubinol quartiles, IQR and median (one line each)
print(
    f"The lower quartile of Infubinol tumors: {lowerq3}",
    f"The upper quartile of Infubinol tumors: {upperq3}",
    f"The interquartile range of Infubinol tumors: {iqr3}",
    f"The median of Infubinol tumors: {quartile3[0.5]} ",
    sep="\n",
)

In [None]:
# Infubinol outlier fences: 1.5 x IQR below the lower / above the upper quartile.
# The fences must be built from Infubinol's own IQR (iqr3), not Capomulin's `iqr`.
lower_bound3 = lowerq3 - (1.5*iqr3)
upper_bound3 = upperq3 + (1.5*iqr3)

In [None]:
# Print the outlier bounds for Infubinol
print(
    f"Infubinol values below {lower_bound3} "
    f"and above {upper_bound3} could be potential outliers."
)

In [None]:
# Ceftamin (4 of 4): last timepoint per mouse, then left-merge onto the clean
# data to pick up the tumor volume at that timepoint
Ceftamin_last = Ceftamin_df.groupby("Mouse ID")["Timepoint"].max()
Ceftamin_volume = Ceftamin_last.to_frame()
Ceftamin_merge = Ceftamin_volume.merge(
    combined_duplicate_removal, how="left", on=["Mouse ID", "Timepoint"]
)
Ceftamin_merge.head(5)

In [None]:
# Tumor volumes at each Ceftamin mouse's last timepoint
Ceftamin_tumors = Ceftamin_merge.loc[:, "Tumor Volume (mm3)"]

In [None]:
# Ceftamin quartiles (25%, 50%, 75%) and the interquartile range
quartile4 = Ceftamin_tumors.quantile(q=[0.25, 0.5, 0.75])
lowerq4, upperq4 = quartile4.loc[0.25], quartile4.loc[0.75]
iqr4 = upperq4 - lowerq4

In [None]:
# Report the Ceftamin quartiles, IQR and median (one line each)
print(
    f"The lower quartile of Ceftamin tumors: {lowerq4}",
    f"The upper quartile of Ceftamin tumors: {upperq4}",
    f"The interquartile range of Ceftamin tumors: {iqr4}",
    f"The median of Ceftamin tumors: {quartile4[0.5]} ",
    sep="\n",
)

In [None]:
# Ceftamin outlier fences: 1.5 x IQR below the lower / above the upper quartile.
# The fences must be built from Ceftamin's own IQR (iqr4), not Capomulin's `iqr`.
lower_bound4 = lowerq4 - (1.5*iqr4)
upper_bound4 = upperq4 + (1.5*iqr4)

In [None]:
# Print the outlier bounds for Ceftamin (the message names the regimen whose
# bounds are reported: lower_bound4 / upper_bound4 belong to Ceftamin)
print(f"Ceftamin values below {lower_bound4} and above {upper_bound4} could be potential outliers.")

In [None]:
# Box plot of the final tumor volume distribution for each treatment group
plot_data = [Capomulin_tumors, Ramicane_tumors, Infubinol_tumors, Ceftamin_tumors]

fig1, ax1 = plt.subplots()

# NOTE(review): newer matplotlib releases rename `labels` to `tick_labels` —
# confirm against the installed version before switching.
ax1.boxplot(plot_data, labels=["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])

ax1.set_xlabel('Drug Regimen')
ax1.set_ylabel('Final Tumor Volume (mm3)')
ax1.set_title('Tumors')

plt.show()

# Line and Scatter Plots

In [None]:
# Rows of the Capomulin data for the single mouse "l509" used in the line plot
plot_line = Capomulin_df[Capomulin_df["Mouse ID"].eq("l509")]
plot_line.head(5)

In [None]:
# Line plot of tumor volume against timepoint for the selected Capomulin mouse
timepoint_x_axis = plot_line.loc[:, 'Timepoint']
Tumorvol_y_axis = plot_line.loc[:, 'Tumor Volume (mm3)']

plt.title('Capomulin treatment of mouse l509')
plt.plot(timepoint_x_axis, Tumorvol_y_axis, markersize=10, linewidth=2)

# Axis labels
plt.ylabel('Tumor Volume (mm3)')
plt.xlabel('Timepoint (days)')

plt.show()

In [None]:
# Scatter plot of mouse weight vs. the average observed tumor volume for the
# entire Capomulin regimen (one point per mouse).
# numeric_only=True restricts the per-mouse mean to numeric columns; without it
# pandas >= 2.0 raises a TypeError on the string columns (e.g. "Drug Regimen", "Sex").
Capomulin_scatter_plot = Capomulin_df.groupby(["Mouse ID"]).mean(numeric_only=True)
plt.scatter(Capomulin_scatter_plot['Weight (g)'],Capomulin_scatter_plot['Tumor Volume (mm3)'])
plt.title('Mouse Weight vs. Average Tumor Volume (Capomulin)')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

plt.show()

# Correlation and Regression

In [None]:
# Pearson correlation between mouse weight and average tumor volume (Capomulin),
# rounded to two decimals
pearson_result = st.pearsonr(
    Capomulin_scatter_plot['Weight (g)'],
    Capomulin_scatter_plot['Tumor Volume (mm3)'],
)
correlation = round(pearson_result[0], 2)
print(f"The correlation between mouse weight and average tumor volume is {correlation}")

In [None]:
# Linear regression of average observed tumor volume on mouse weight
# for the entire Capomulin regimen; the result object is displayed below
linear_reg = st.linregress(
    x=Capomulin_scatter_plot['Weight (g)'],
    y=Capomulin_scatter_plot['Tumor Volume (mm3)'],
)
linear_reg

In [None]:
# Read slope and intercept straight from the linregress result instead of
# pasting the printed numbers, so they always match the data being analysed
m_slope = linear_reg.slope
m_intercept = linear_reg.intercept

In [None]:
# Overlay the fitted regression line (red) on the weight vs. average tumor
# volume scatter for the Capomulin regimen
y_values = Capomulin_scatter_plot['Weight (g)']*m_slope+m_intercept
plt.scatter(Capomulin_scatter_plot['Weight (g)'],Capomulin_scatter_plot['Tumor Volume (mm3)'])
plt.plot(Capomulin_scatter_plot['Weight (g)'],y_values,color="r")
# Label both axes so the figure stands on its own (matches the scatter plot cell)
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")

plt.show()