# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two CSV inputs (mouse metadata and study results)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Right-join the metadata onto the study results; with no explicit key,
# pandas joins on the column names the two frames have in common
combined_data = pd.merge(mouse_metadata, study_results, how="right")

# Keep these columns, in this order
column_order = [
    "Mouse ID", "Timepoint", "Tumor Volume (mm3)", "Metastatic Sites",
    "Drug Regimen", "Sex", "Age_months", "Weight (g)",
]
combined_data = combined_data[column_order]

# Preview the first rows of the combined table
combined_data.head()

In [None]:
# Count the distinct Mouse IDs present in the combined data.
mice = combined_data["Mouse ID"].value_counts()  # one entry per distinct Mouse ID
num_mice = mice.shape[0]
num_mice

In [None]:
# Each (Mouse ID, Timepoint) pair should identify exactly one row.
# Flag the rows that repeat an earlier (Mouse ID, Timepoint) pair, then collect
# the distinct Mouse IDs those rows belong to.
is_repeated_row = combined_data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = combined_data.loc[is_repeated_row, "Mouse ID"].unique()
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Select every row whose Mouse ID is in duplicate_mice rather than hard-coding
# a single ID, so this cell stays correct if the input data changes.
duplicate_mice_DataFrame = combined_data.loc[combined_data["Mouse ID"].isin(duplicate_mice), :]
duplicate_mice_DataFrame

In [None]:
# Create a clean DataFrame by dropping every row of the duplicated mouse ID(s).
# `~` negates the boolean mask, keeping only rows whose Mouse ID is NOT in
# duplicate_mice.
clean_data = combined_data[~combined_data["Mouse ID"].isin(duplicate_mice)]
clean_data.head()

In [None]:
# Count the distinct Mouse IDs that remain in the clean DataFrame.
clean_mice = clean_data["Mouse ID"].value_counts()  # one entry per distinct Mouse ID
clean_num_mice_df = clean_mice.shape[0]
clean_num_mice_df

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.

# Group the tumor volumes by regimen once and reuse the grouped column for
# every statistic.
tumor_by_regimen = clean_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
var = tumor_by_regimen.var()
std = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# Assemble the five series into a single table with one row per regimen.
summary_columns = {
    "Mean Tumor Volume": mean,
    "Median Tumor Volume": median,
    "Tumor Volume Variance": var,
    "Tumor Volume Std. Dev.": std,
    "Tumor Volume Std. Err.": sem,
}
summary_stats = pd.DataFrame(summary_columns)

summary_stats

In [None]:
# Alternative way to build the per-regimen tumor-volume summary (mean, median,
# variance, standard deviation, SEM); only one of the two methods is required.

# A single groupby + agg call produces all five statistics at once.
summary_aggregation = (
    clean_data
    .groupby(["Drug Regimen"])[["Tumor Volume (mm3)"]]
    .agg(["mean", "median", "var", "std", "sem"])
)
summary_aggregation

## Bar and Pie Charts

In [None]:
# Bar plot (via the pandas plotting API) of the number of rows
# (Mouse ID/Timepoint observations) recorded for each drug regimen.
count_of_mice = clean_data["Drug Regimen"].value_counts()

# Series.plot(kind="bar") is the same plot as Series.plot.bar()
bar_plot_pandas = count_of_mice.plot(kind="bar", color="tab:blue")

# Label the axes of the current figure
plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")

plt.show()

In [None]:
# Bar plot (via pyplot) of the number of rows (Mouse ID/Timepoint
# observations) recorded for each drug regimen.
count_of_mice = clean_data["Drug Regimen"].value_counts()

# Regimen names go on the x-axis, row counts give the bar heights
regimen_names = count_of_mice.index.values
row_counts = count_of_mice.values

plt.bar(regimen_names, row_counts, color="tab:blue")
plt.xticks(rotation=90)  # vertical tick labels

plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")

plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
# clean_data holds one row per (Mouse ID, Timepoint), so counting rows would
# weight each mouse by how many timepoints it has. Reduce to one row per
# Mouse ID first so that every mouse is counted exactly once.
gender_counts = clean_data.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()
gender_counts.plot.pie(autopct="%1.1f%%")
# Set the y-label explicitly so it does not depend on the name pandas gives
# the value_counts() result.
plt.ylabel("Sex")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# clean_data holds one row per (Mouse ID, Timepoint), so counting rows would
# weight each mouse by how many timepoints it has. Reduce to one row per
# Mouse ID first so that every mouse is counted exactly once.
gender_counts = clean_data.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()

plt.pie(gender_counts.values, labels=gender_counts.index.values, autopct="%1.1f%%")
plt.ylabel("Sex")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse for four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
treatment_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Rows of the cleaned data that belong to one of those regimens
in_selected_regimen = clean_data["Drug Regimen"].isin(treatment_regimens)
final_tumor_volume = clean_data[in_selected_regimen]

# Greatest timepoint recorded for each mouse in those regimens
last_timepoint = final_tumor_volume.groupby("Mouse ID")["Timepoint"].max()
last_timepoint_DataFrame = pd.DataFrame(last_timepoint)

# Inner-join back onto the cleaned data on (Mouse ID, Timepoint) so that only
# each mouse's last-timepoint row, with its tumor volume, is kept
merged_data = pd.merge(
    last_timepoint_DataFrame,
    clean_data,
    on=["Mouse ID", "Timepoint"],
    how="inner",
)

In [None]:
# Regimens to analyse; the same list is reused later for the plot labels
treatment_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Collects one Series of tumor volumes per regimen (for plotting)
tumor_vol_data = []

# For every regimen, compute the IQR and report values outside the
# 1.5 * IQR fences as potential outliers.
for treatment in treatment_regimens:
    # Tumor volumes of the merged_data rows that belong to this regimen
    on_this_treatment = merged_data["Drug Regimen"] == treatment
    volumes = merged_data.loc[on_this_treatment, "Tumor Volume (mm3)"]
    tumor_vol_data.append(volumes)

    # First and third quartiles and the spread between them
    q1 = volumes.quantile(0.25)
    q3 = volumes.quantile(0.75)
    spread = q3 - q1

    # Fences: anything strictly below or strictly above is a potential outlier
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    too_low = volumes < low_fence
    too_high = volumes > high_fence
    outliers = volumes[too_low | too_high]
    print(f"{treatment}'s potential outliers: {outliers}")

In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.
flierprops = {"markerfacecolor": "red", "markersize": 10}

# Draw the boxes first and label the ticks separately: the `labels` keyword of
# boxplot is deprecated as of Matplotlib 3.9, and its replacement
# (`tick_labels`) is not available in older releases. plt.xticks works on both.
plt.boxplot(tumor_vol_data, flierprops=flierprops)
# boxplot places the boxes at x = 1..N by default
plt.xticks(range(1, len(treatment_regimens) + 1), treatment_regimens)
plt.ylabel("Final Tumor Volume (mm3)")
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
Capomulin_DataFrame = clean_data.loc[clean_data["Drug Regimen"] == "Capomulin", :]

# Single source for the mouse ID so the row filter and the plot title cannot
# drift apart.
mouse_id = "l509"
forline_DataFrame = Capomulin_DataFrame.loc[Capomulin_DataFrame["Mouse ID"] == mouse_id, :]
# Sort by timepoint so the line is drawn left-to-right regardless of row order
forline_DataFrame = forline_DataFrame.sort_values("Timepoint")
x_axis = forline_DataFrame["Timepoint"]
tumor_size = forline_DataFrame["Tumor Volume (mm3)"]

fig1, ax1 = plt.subplots()
plt.plot(x_axis, tumor_size, linewidth=2, color="tab:blue")
plt.title(f"Capomulin treatment of mouse {mouse_id}")
plt.xlabel("Timepoint (days)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
Capomulin_DataFrame = clean_data.loc[clean_data["Drug Regimen"] == "Capomulin", :]

# One row per mouse: mean weight and mean tumor volume over that mouse's rows
capm_vol = Capomulin_DataFrame.groupby(["Mouse ID"])[["Weight (g)", "Tumor Volume (mm3)"]].mean()

plt.scatter(capm_vol["Weight (g)"], capm_vol["Tumor Volume (mm3)"], color="tab:blue")
plt.xlabel("Weight (g)")
# Label spelled with a space before the unit, matching the regression plot
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
weights = capm_vol["Weight (g)"]
average_volumes = capm_vol["Tumor Volume (mm3)"]

# pearsonr returns the coefficient first and the p-value second
correlation_coefficient = st.pearsonr(weights, average_volumes)
print(f"The correlation between mouse weight and the average tumor volume is {round(correlation_coefficient[0], 2)}")

# Least-squares fit of average tumor volume against weight
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weights, average_volumes)
regress_values = weights * slope + intercept
line_eq = f"y = {round(slope, 2)} x + {round(intercept, 2)}"

# Scatter of the data with the fitted line drawn on top
plt.scatter(weights, average_volumes, color="tab:blue")
plt.plot(weights, regress_values, color="red")
# Show the fitted equation on the plot; axes-fraction coordinates keep the
# text in the upper-left corner whatever the data range is
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", color="red")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()