## Observations

In [None]:
# Capomulin and Ramicane are the most effective drugs.
# Propiva and Stelasyn had one fewer mouse than the other drugs, which results in less accurate data.
# Propiva was the least effective drug: it had the lowest average timepoint per mouse,
#  which means its mice are dying faster than the mice on the other drugs.

## Code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as sts

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame (metadata first, then study results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)


In [None]:
# Preview the first rows of the mouse metadata table.
mouse_metadata.head()

In [None]:
# Dimensions of the mouse metadata table as (rows, columns).
mouse_metadata.shape

In [None]:
# Print a summary of the mouse metadata table (columns, dtypes, non-null counts).
mouse_metadata.info()

In [None]:
# Preview the first rows of the study results table.
study_results.head()

In [None]:
# Dimensions of the study results table as (rows, columns).
study_results.shape

In [None]:
# Print a summary of the study results table (columns, dtypes, non-null counts).
study_results.info()

In [None]:
# Join every study measurement with the metadata of its mouse, matching on
# "Mouse ID" and keeping only IDs present in both tables.
df = pd.merge(study_results, mouse_metadata, how="inner", on="Mouse ID")
# Show the first ten rows of the combined table
df.head(10)

In [None]:
# Print a summary of the merged table (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of rows recorded for each Mouse ID in the merged table.
df["Mouse ID"].value_counts()

In [None]:
# Count the distinct Mouse IDs in the merged table.
df["Mouse ID"].nunique()

In [None]:
# Count the rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
# The vectorized Series.sum() avoids iterating the Series in Python; int() gives
# a plain Python integer for display.
int(df.duplicated(subset=['Mouse ID', 'Timepoint']).sum())

In [None]:
# Count the rows recorded for each (Mouse ID, Timepoint) pair, largest first;
# any count above 1 marks a duplicated measurement. Grouping on both columns
# keeps the two keys separate instead of gluing them into one ambiguous string.
df.groupby(['Mouse ID', 'Timepoint']).size().sort_values(ascending=False)

In [None]:
# Flag every row that repeats an earlier (Mouse ID, Timepoint) pair, then show them
duplicate_keys = ['Mouse ID', 'Timepoint']
mask = df.duplicated(subset=duplicate_keys, keep='first')
df[mask]

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Select by the IDs that actually have a repeated (Mouse ID, Timepoint) pair
# rather than by fixed row positions, which depend on the row order of the merge.
duplicate_mouse_ids = df.loc[
    df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()
df.loc[df['Mouse ID'].isin(duplicate_mouse_ids)]

In [None]:
# Create a clean DataFrame by dropping every mouse that has duplicated data.
# The IDs are derived from the duplicate check instead of being hard-coded, so
# the cleaning still works if the input data changes.
duplicate_mouse_ids = df.loc[
    df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()
mask2 = ~df["Mouse ID"].isin(duplicate_mouse_ids)
the_df = df.loc[mask2].reset_index(drop=True)
the_df.info()

In [None]:
# Descriptive statistics for the cleaned table.
the_df.describe()

In [None]:
# Count the distinct Mouse IDs remaining in the cleaned DataFrame.
the_df['Mouse ID'].nunique()

In [None]:
# Preview the first rows of the cleaned table.
the_df.head()

## Summary Statistics

In [None]:
# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and standard error of the mean (SEM).
# Each statistic is computed as its own series from the grouped data, and the
# series are then assembled into a single summary DataFrame.

drug_grouped = the_df.groupby('Drug Regimen')
tumor_by_regimen = drug_grouped['Tumor Volume (mm3)']

avg_tumor = tumor_by_regimen.mean()
med_tumor = tumor_by_regimen.median()
var_tumor = tumor_by_regimen.var()
stdev_tumor = tumor_by_regimen.std()
sem_tumor = tumor_by_regimen.sem()

# Dict insertion order fixes the column order of the table
tumor_table = pd.DataFrame({
    'Mean Tumor Volume': avg_tumor,
    'Median Tumor Volume': med_tumor,
    'Variance of Tumor Volume': var_tumor,
    'Standard Deviation of Tumor Volume': stdev_tumor,
    'Standard Error of Tumor Volume': sem_tumor,
})
tumor_table

In [None]:
# Number of rows (recorded measurements) in the cleaned table for each drug regimen.
the_df.groupby("Drug Regimen").size()

In [None]:
# Same per-regimen summary (mean, median, variance, standard deviation, SEM of
# tumor volume), produced with a single aggregation call on the grouped column.
summary_functions = ['mean', 'median', 'var', 'std', 'sem']
drug_grouped['Tumor Volume (mm3)'].agg(summary_functions)

## Bar and Pie Charts

In [None]:
# Bar plot (pandas) of the number of timepoints recorded for each drug regimen.
# value_counts() counts rows, i.e. one per recorded measurement, so the labels
# describe recorded timepoints rather than unique mice or unique timepoints.
plt.figure(figsize=(10,6))

the_df['Drug Regimen'].value_counts().plot(kind = 'bar', color = "indigo")

plt.title("Number of Timepoints Recorded per Drug", fontsize=18, fontweight = 'bold')
plt.xlabel('Drug Regimen', fontsize=16)
plt.ylabel('Number of Timepoints Recorded', fontsize=16)
plt.show()

In [None]:
# Bar plot (pyplot) of the number of timepoints recorded for each drug regimen.
# value_counts() counts rows, i.e. one per recorded measurement, so the labels
# describe recorded timepoints rather than unique mice or unique timepoints.
mice_drugged = the_df['Drug Regimen'].value_counts()

x = mice_drugged.index
y = mice_drugged.values

plt.figure(figsize=(10,6))
plt.bar(x, y, color='indigo')
plt.title("Number of Timepoints Recorded per Drug", fontsize=18, fontweight = 'bold')
plt.xlabel('Drug Regimen', fontsize=16)
plt.ylabel('Number of Timepoints Recorded', fontsize=16)
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas.
# nunique() counts each mouse once; count() would count every measurement row and
# so weight mice with more recorded timepoints more heavily.
the_df.groupby("Sex")["Mouse ID"].nunique().plot(kind='pie')

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot.
# nunique() counts each mouse once; count() would count every measurement row and
# so weight mice with more recorded timepoints more heavily.
genders = the_df.groupby("Sex")["Mouse ID"].nunique()
gender = list(genders.index)
gender_total = list(genders.values)

colors = ['indigo', 'forestgreen']
plt.pie(gender_total, labels=gender, colors=colors)
# Equal axis scaling keeps the pie circular
plt.axis("equal")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse (later filtered to four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin).

# Step 1: the last (greatest) timepoint recorded for each mouse, as a two-column frame
max_tumor = the_df.groupby(['Mouse ID'])['Timepoint'].max().reset_index()

# Step 2: look up the full measurement row at that last timepoint, so the tumor
# volume column holds each mouse's final tumor volume
merged_df = pd.merge(max_tumor, the_df, how='left', on=['Mouse ID', 'Timepoint'])
merged_df = merged_df.rename(columns={"Tumor Volume (mm3)": "Final Tumor Volume (mm3)"})
merged_df.head(10)

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
drug_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in treatments:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    data = merged_df.loc[merged_df["Drug Regimen"] == drug, "Final Tumor Volume (mm3)"]

    # pandas gives the quartiles directly from the Series
    quartiles = data.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    outliers = data[(data < lower_bound) | (data > upper_bound)]

    print(drug)
    print()
    print(f"The lower quartile of the tumor volumes is: {lowerq}")
    print(f"The upper quartile of the tumor volumes is: {upperq}")
    print(f"The interquartile range of the tumor volumes is: {iqr}")
    print(f"The median of the tumor volumes is: {quartiles[0.5]} ")
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")
    print(f"Potential outliers: {outliers.tolist()}")

    print()
    print()

    # add subset
    drug_data.append(data.values)

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
plt.figure(figsize=(10,6))
plt.boxplot(drug_data)
# Label the boxes through the x ticks (boxes sit at positions 1..N by default).
# This avoids the `labels=` keyword, which Matplotlib 3.9 deprecated in favor of
# `tick_labels=`, and works on both older and newer versions.
plt.xticks(range(1, len(treatments) + 1), treatments)

plt.title("Final Tumor Volume per Regimen", fontsize=18, fontweight = 'bold')
plt.ylabel('Tumor Volume')
plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against time point for mouse "x401"
mouse = the_df[the_df['Mouse ID'] == "x401"].reset_index(drop=True)

# Time on the horizontal axis, tumor volume on the vertical axis
x, y = mouse["Timepoint"], mouse["Tumor Volume (mm3)"]

plt.figure(figsize=(10,6))
plt.plot(x, y, color='indigo')
plt.title("Mouse X401 Tumor Size", fontsize=18, fontweight = 'bold')
plt.xlabel('Time', fontsize=16)
plt.ylabel('Size of Tumor', fontsize=16)
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
plt.figure(figsize=(10,6))

# Per-mouse averages of weight and tumor volume for Capomulin-treated mice.
# The columns are selected with a list: selecting several groupby columns with a
# bare tuple was deprecated in pandas 1.0 and removed in pandas 2.0.
capomulin_rows = the_df.loc[the_df['Drug Regimen'] == "Capomulin"]
cap_weight_df = (
    capomulin_rows.groupby('Mouse ID')[["Weight (g)", "Tumor Volume (mm3)"]]
    .mean()
    .reset_index()
)

x = cap_weight_df['Weight (g)']
y = cap_weight_df['Tumor Volume (mm3)']

plt.scatter(x, y, color='indigo')
plt.title("Average Tumor Volume vs Mouse Weight on Capomulin", fontsize=18, fontweight = 'bold')
plt.xlabel('Weight (g)', fontsize=16)
plt.ylabel('Size of Tumor', fontsize=16)
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen.
# Only the two numeric columns are correlated: cap_weight_df also carries the
# string "Mouse ID" column, which makes DataFrame.corr() raise on pandas 2.x.
cap_weight_df[["Weight (g)", "Tumor Volume (mm3)"]].corr()

In [None]:
# Pearson correlation between per-mouse weight and tumor volume in cap_weight_df,
# computed with scipy.stats.
sts.pearsonr(cap_weight_df['Weight (g)'], cap_weight_df["Tumor Volume (mm3)"])

In [None]:
# Linear regression of tumor volume on mouse weight, using the per-mouse values
# in cap_weight_df.
# x and y are rebound here explicitly: several earlier cells reuse these names, so
# relying on leftover values would make the result depend on cell execution order.
x = cap_weight_df['Weight (g)']
y = cap_weight_df['Tumor Volume (mm3)']

(slope, intercept, rvalue, pvalue, stderr) = sts.linregress(x, y)
regress_values = x * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
print(line_eq)
print(f"The r-squared is: {rvalue**2}")

plt.figure(figsize=(10,6))

# Scatter of the data with the fitted regression line drawn on top
plt.scatter(x, y, color = "indigo")
plt.title("Capomulin Mice Weight vs Tumor Size", fontsize=18, fontweight="bold")
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Tumor Size (mm3)")
plt.plot(x, regress_values, color="forestgreen")
# Place the fitted equation at data coordinates (22, 36)
plt.annotate(line_eq,(22,36),fontsize=15,color="indigo")

plt.show()