In [None]:
##Import libraries and such

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

In [None]:
## Relative paths of the two input CSV files

mouse_metadata, study_results = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

In [None]:
## Parse each CSV file into a DataFrame.
## Note: both names are rebound here from path strings to DataFrames, so this
## cell cannot be re-run on its own without first re-running the path cell.

mouse_metadata, study_results = (
    pd.read_csv(csv_path) for csv_path in (mouse_metadata, study_results)
)

In [None]:
## Outer-join the mouse metadata with the study results on "Mouse ID"

combined_data = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")
combined_data.head()

In [None]:
##Give the newly merged dataset a once over for blanks
## count() reports the number of non-null entries per column, so unequal
## counts across columns would point to missing values.

combined_data.count()

In [None]:
## Insert random fact about mice to entertain the grader
        ## A group of mice is called a "mischief"##

Remove Duplicates

In [None]:
# Non-null entry counts for every (Mouse ID, Timepoint) pair
search = combined_data.groupby(by=["Mouse ID", "Timepoint"]).count()
# Keep only the pairs that carry more than one "Drug Regimen" entry
is_repeated = search["Drug Regimen"] > 1
find = search.loc[is_repeated]

In [None]:
# Mouse IDs that have repeated (Mouse ID, Timepoint) rows are collected here by the following cell
duplicate_mice = []

In [None]:
# Record each flagged mouse once, in order of first appearance.
# Every index entry of `find` is a (Mouse ID, Timepoint) pair; iterating an
# empty index simply does nothing.
for mouse_id, _timepoint in find.index:
    if mouse_id in duplicate_mice:
        continue
    duplicate_mice.append(mouse_id)

In [None]:
# Display the mouse IDs that were flagged as duplicates
duplicate_mice

In [None]:
# Create a clean DataFrame by dropping every row that belongs to a duplicate mouse.
# A single vectorized membership test replaces re-filtering the whole frame once
# per mouse; with an empty list the mask is all True and no rows are removed.
combined_data = combined_data.loc[~combined_data["Mouse ID"].isin(duplicate_mice)]

In [None]:
# Non-null counts per column after the duplicate mice have been removed
combined_data.count()

## SUMMARY STATISTICS

In [None]:
# Method 1:  Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward: build each statistic as its own series and put them all together at the end.

# Start from the per-regimen non-null counts of every column
summarystats = combined_data.groupby("Drug Regimen").count()

In [None]:
# Group the tumor volumes by regimen once and reuse the grouping for every statistic.
# Each statistic is a Series indexed by "Drug Regimen", so it aligns with the
# index of summarystats when assigned as a column.
tumor_by_regimen = combined_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]

summarystats["Mean"] = tumor_by_regimen.mean()
summarystats["Median"] = tumor_by_regimen.median()
summarystats["Standard Deviation"] = tumor_by_regimen.std()
summarystats["Variance"] = tumor_by_regimen.var()
summarystats["SEM"] = tumor_by_regimen.sem()

In [None]:
# Keep the row-count column plus the five statistics, in display order
summary_columns = ["Mouse ID", "Mean", "Median", "Standard Deviation", "Variance", "SEM"]
summarystats = summarystats.loc[:, summary_columns]

In [None]:
# "Mouse ID" holds the per-regimen count produced by count(), so label it as the number of trials
summarystats = summarystats.rename(columns={"Mouse ID": "Trials"})

In [None]:
# Preview the first rows of the summary table
summarystats.head()

In [None]:
# Method 2(optional): Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function. (https://pandas.pydata.org/pandas-docs/version/0.22.0/generated/pandas.core.groupby.DataFrameGroupBy.agg.html)


## BAR AND PIE CHARTS

In [None]:
# Use Pandas to generate a bar plot showing the total number of mice in each treatment regimen throughout the course of the study. 

## Note: this plot will be identical to the one that uses Pyplot

In [None]:
# Single-column frame of trial counts, indexed by drug regimen
datapoints = summarystats.loc[:, ["Trials"]]

In [None]:
# Bar chart of the number of trials per drug regimen, drawn through the pandas plotting API
datapoints.plot(kind="bar", figsize=(6,4), color = "b", legend=False)
plt.title("Trials per Drug Regime")
# tight_layout() has to run before show(): adjusting the layout after the
# figure has been rendered does not change what was displayed.
plt.tight_layout()
plt.show()

In [None]:
# Use Pyplot to generate a bar plot showing the total number of mice in each treatment regimen throughout the course of the study.

##  Note: this plot will be identical to the one that uses Pandas

In [None]:
# One bar position per regimen; the tick marks sit at the same positions
x_axis = np.arange(len(datapoints))
tick_locations = list(x_axis)

In [None]:
plt.figure(figsize=(6, 4))

# Bars at the integer positions, labelled with the regimen names from the index
trial_counts = datapoints["Trials"]
plt.bar(x_axis, trial_counts, color="b", width=0.5)
plt.xticks(tick_locations, datapoints.index.values, rotation="vertical")

plt.title("Trials per Drug Regime")
plt.xlabel("Drug Regime")

# Leave a margin on both sides of the bars and 10 units of headroom above the tallest one
plt.xlim(-0.75, len(x_axis) - 0.25)
plt.ylim(0, max(trial_counts) + 10)

plt.show()

In [None]:
# Use Pandas to generate a pie plot showing the distribution of female versus male mice

## Note: this plot will be identical to the one that uses Pyplot

In [None]:
# Re-select the "Trials" column of the summary table as the data for the pie chart below
datapoints = summarystats[["Trials"]]

In [None]:
# Pie chart of the "Trials" column, one wedge per drug regimen.
# `explode` and `colors` are deliberately not passed: neither name is defined in
# the cells above, so referencing them raises NameError on a fresh kernel.
# Matplotlib's default colours and un-exploded wedges are used instead.
# NOTE(review): the heading for this cell asks for the female-versus-male
# distribution, but the data plotted here is trials per regimen — TODO confirm
# which one is intended.
datapoints.plot(kind="pie", labels=datapoints.index.values,
        autopct="%1.1f%%", shadow=True, startangle=140, subplots=True, legend=False)

plt.title("Trials per Drug Regime")

plt.show()

In [None]:
# Use Pyplot to generate a pie plot showing the distribution of female versus male mice

##  Note: this plot will be identical to the one that uses Pandas

## QUARTILES, OUTLIERS AND BOXPLOTS

In [None]:
# Calculate the final tumor volume of each mouse across four of the drug regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (latest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Keep only the four regimens of interest: Capomulin, Ramicane, Infubinol, and Ceftamin
regimens_of_interest = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
filtered_df = combined_data.loc[combined_data["Drug Regimen"].isin(regimens_of_interest), :]

In [None]:
# Latest timepoints first, so that the first row seen for each mouse is its last measurement
filtered_df = filtered_df.sort_values(by="Timepoint", ascending=False)

In [None]:
# Keep a single row per mouse: the first one in the current row order
filtered_df = filtered_df.drop_duplicates(subset=["Mouse ID"], keep="first")

In [None]:
# 25th, 50th and 75th percentiles of the selected tumor volumes
quartiles = filtered_df['Tumor Volume (mm3)'].quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
# Interquartile range
iqr = upperq - lowerq

In [None]:
# Values further than 1.5 * IQR outside the quartiles count as outliers
whisker = 1.5 * iqr
lower_bound = lowerq - whisker
upper_bound = upperq + whisker

In [None]:
# Rows whose tumor volume falls outside the [lower_bound, upper_bound] interval
selected_volumes = filtered_df['Tumor Volume (mm3)']
is_outlier = (selected_volumes > upper_bound) | (selected_volumes < lower_bound)
outliers_df = filtered_df.loc[is_outlier, :]
outliers_df

Did not find outliers from above calculations

In [None]:
# Box plot of the selected tumor volumes (all selected regimens pooled into one box)
Tumor_Volume = filtered_df['Tumor Volume (mm3)']
fig1, ax1 = plt.subplots()
ax1.boxplot(Tumor_Volume)
ax1.set(title='Tumor Volume of Mice', ylabel='Tumor Volume')
plt.show()

## LINE AND SCATTER PLOTS

In [None]:
# Generate a line plot of tumor volume vs. timepoint for a mouse treated with Capomulin

In [None]:

# All rows for mice treated with Capomulin (reused by the later scatter and regression cells)
Capomulin_df = combined_data.loc[combined_data["Drug Regimen"] == "Capomulin", :]

# Plot a single mouse. Drawing every Capomulin row as one line would join the
# last timepoint of one mouse to the first timepoint of the next.
# The first mouse in the frame is used as the example.
example_mouse_id = Capomulin_df["Mouse ID"].iloc[0]
example_mouse_df = Capomulin_df.loc[Capomulin_df["Mouse ID"] == example_mouse_id, :]
# Sort so the line is drawn in chronological order
example_mouse_df = example_mouse_df.sort_values("Timepoint")

timepoint = example_mouse_df["Timepoint"]
tumor_volume = example_mouse_df["Tumor Volume (mm3)"]

tumor_volume_line, = plt.plot(timepoint, tumor_volume)

plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume')
plt.title(f'Tumor Volume over Time for Capomulin Mouse {example_mouse_id}')
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen. 
# Note: this means mouse weight goes on the x-axis, with average tumor volume on the y-axis. 

In [None]:
# Per-mouse averages for the Capomulin regimen: weight on x, tumor volume on y
capomulin_by_mouse = Capomulin_df.groupby("Mouse ID")
mouse_weight = capomulin_by_mouse["Weight (g)"].mean()
tumor_volume = capomulin_by_mouse["Tumor Volume (mm3)"].mean()

plt.scatter(mouse_weight, tumor_volume)
plt.ylabel("Tumor Volume")
plt.xlabel("Weight of Mouse")
plt.show()

## CORRELATION AND REGRESSION

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen.

In [None]:

# x values: mean weight per mouse; y values: mean tumor volume per mouse
capomulin_by_mouse = Capomulin_df.groupby("Mouse ID")
mouse_weight = capomulin_by_mouse["Weight (g)"].mean()
tumor_volume = capomulin_by_mouse["Tumor Volume (mm3)"].mean()

# Linear regression of average tumor volume on mouse weight.
# The intercept is bound to `intercept` rather than `int`, so the builtin int()
# is not shadowed for any code that runs later in the same kernel.
slope, intercept, r, p, std_err = st.linregress(mouse_weight, tumor_volume)

# Fitted tumor volume at each observed weight
fit = slope * mouse_weight + intercept

# Scatter of the observations with the fitted line drawn over it
plt.scatter(mouse_weight,tumor_volume)
plt.xlabel("Weight of Mouse")
plt.ylabel("Tumor Volume")
plt.plot(mouse_weight,fit,"--")
plt.xticks(mouse_weight, rotation=90)
plt.show()

# Calculate correlation coefficient
corr = round(st.pearsonr(mouse_weight,tumor_volume)[0],2)
print(f'The correlation between weight and tumor value is {corr}')