## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

%matplotlib notebook

# Locations of the two CSV inputs
mouse_metadata_path = "Pymaceuticals/data/Mouse_metadata.csv"
study_results_path = "Pymaceuticals/data/Study_results.csv"

# Load both files into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Uncomment to review the raw inputs
# print(mouse_metadata, study_results)
# mouse_metadata.info()
# study_results.info()

# Join the two tables on "Mouse ID"; the outer join keeps IDs that appear
# in only one of the files
merge_df = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")
# merge_df.describe()

# Uncomment to preview the combined table
# merge_df.head()


In [None]:
# Checking the number of mice.
# merge_df["Mouse ID"].value_counts()


In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# merge_df.loc[merge_df.duplicated(['Mouse ID','Timepoint'])]


In [None]:
# Optional: Get all the data for the duplicate mouse ID. 
# merge_df.loc[merge_df['Mouse ID'] == 'g989']


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse that has more than one row for the same timepoint cannot be trusted,
# so ALL of its rows are removed -- not only the rows that are repeated.
dup_rows = merge_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
dup_mouse_ids = merge_df.loc[dup_rows, "Mouse ID"].unique()
clean_df = merge_df[~merge_df["Mouse ID"].isin(dup_mouse_ids)]
clean_df.head()


In [None]:
# Checking the number of mice in the clean DataFrame.
# NOTE:  Compare this count with the merged dataset to see how cleaning affected the number of mice
# clean_df["Mouse ID"].value_counts()

# NOTE:  Confirms absence of any duplicate data 
#clean_df.loc[clean_df.duplicated(['Mouse ID','Timepoint'])]

## Summary Statistics

In [None]:
# Explore the difference between a Series and a DataFrame for the mean values:
# start with the per-regimen mean tumor volume as a Series, rounded to 2 decimals
tumor_volume_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
mean = tumor_volume_by_regimen.mean().round(2)
mean


In [None]:
# Look the value up by its regimen label. `mean` is indexed by drug-regimen
# names, and using an integer key with [] on a label-indexed Series relies on a
# deprecated positional fallback.
ketmean = mean.loc["Ketapril"]
ketmean


In [None]:
# Same means, now as a one-column DataFrame instead of a Series
mean_df = mean.to_frame()
mean_df

# Selecting a row label from the DataFrame returns a Series (one entry per
# column) rather than a single number
ketmean1 = mean_df.loc["Ketapril"]
ketmean1



In [None]:
# Summary table of mean, median, variance, standard deviation, and SEM of the
# tumor volume for each regimen

# Group once and reuse the grouped column for every statistic
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

mean = tumor_by_regimen.mean().round(2)
median = tumor_by_regimen.median().round(2)
var = tumor_by_regimen.var().round(2)
std = tumor_by_regimen.std().round(2)
sem = tumor_by_regimen.sem().round(2)

# One column per statistic, one row per drug regimen
stats = {
    "Mean": mean,
    "Median": median,
    "Variance": var,
    "Std. Dev.": std,
    "Sem": sem,
}

statsum1_df = pd.DataFrame(stats)
statsum1_df


In [None]:
# Alternative method: build the same table with the agg function
# statsum2_df holds the statistics per drug regimen; statsum3_df holds the
# same statistics across all regimens combined

columns = ["mean", "median", "var", "std", "sem"]
tumor_stats = {"Tumor Volume (mm3)": columns}

statsum2_df = clean_df.groupby("Drug Regimen").agg(tumor_stats).round(2)
statsum3_df = clean_df.agg(tumor_stats).round(2)
statsum2_df, statsum3_df


## Bar and Pie Charts

In [None]:
## Calculate values for charts
# Per regimen: the distinct mouse IDs (.unique()), how many distinct mice
# (.nunique()), and how many rows/timepoints were recorded (.count())
mice_by_regimen = clean_df.groupby("Drug Regimen")["Mouse ID"]
mouse_count_df = mice_by_regimen.unique()
mouse_count = mice_by_regimen.nunique()
time_count = mice_by_regimen.count()
timepermouse = (time_count / mouse_count).round(1)

# Summarize data available for bar chart
experiment = {
    "# of Mice": mouse_count,
    "Tot Timepoints": time_count,
    "Timepoints per Mouse": timepermouse,
}
pd.DataFrame(experiment)


In [None]:
# Keep the index labels of mouse_count (one entry per "Drug Regimen" group)
# so later cells can reuse them, e.g. as x-axis tick labels
regimen = mouse_count.index
regimen

In [None]:
## Panda chart:  Create bar chart with total number of mice / treatment throughout the course of study using pandas 
# NOTE:  Using timepoints vs. mice, since there were only 24-25 unique mice used in each study
# TODO: the pandas chart itself is not drawn in this cell yet

# One x position per drug regimen (the bars are per regimen, not per statistic)
x_axis = np.arange(len(regimen))

In [None]:
## Pyplot chart:  Create bar chart with total number of mice / treatment throughout the course of study using pyplot 
# Enlarge the default figure so the x-axis has room for every regimen label
fig_size = [10, 5.5]  # first entry replaces the default width, second the default height
plt.rcParams["figure.figsize"] = fig_size
print(fig_size)


In [None]:
# Draw one bar per regimen; bar height is the number of recorded rows (time_count)
x_axis = np.arange(regimen.size)
plt.bar(x=x_axis, height=time_count, color="b", align="center")


In [None]:
# Format matplotlib chart
# The bars plot time_count (rows/timepoints recorded per regimen), so the title
# and y-axis label name timepoints rather than unique mice.

tick_locations = list(x_axis)
plt.xticks(tick_locations, regimen)
plt.title("# of Timepoints per Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("# of Timepoints")


In [None]:
# Data for the female-versus-male pie plot (the pie itself is not drawn in this cell)

# Keep one row per mouse so each animal is counted once
sex_df = clean_df.drop_duplicates(subset=["Mouse ID"], keep="first")
sex_column = sex_df["Sex"]
female = (sex_column == "Female").sum()
male = (sex_column == "Male").sum()
male


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Step 1:  Calculate final tumor volume 
# "Final" is the volume recorded at each mouse's last timepoint (not the
# smallest volume), so order the rows by timepoint and keep the last
# measurement for every (regimen, mouse) pair.
finaltumor_df = (
    clean_df.sort_values("Timepoint")
    .groupby(["Drug Regimen", "Mouse ID"])["Tumor Volume (mm3)"]
    .last()
)
# Previous name kept so any cell that still refers to it keeps working
mintumor_df = finaltumor_df

pd.DataFrame(finaltumor_df)

# Selecting on the first index level leaves one value per mouse for that regimen
captumor = finaltumor_df["Capomulin"].round(3)
ramtumor = finaltumor_df["Ramicane"].round(3)
inftumor = finaltumor_df["Infubinol"].round(3)
ceftumor = finaltumor_df["Ceftamin"].round(3)

# Calculate quartiles and print summary output  
quartile_levels = [.25, .5, .75]

capquart = captumor.quantile(quartile_levels)
ramquart = ramtumor.quantile(quartile_levels)
infquart = inftumor.quantile(quartile_levels)
cefquart = ceftumor.quantile(quartile_levels)

sumout = {"Capomulin Qrtl": capquart, "Ramicane Qrtl": ramquart, "Infubinol Qrtl": infquart, "Ceftamin Qrtl": cefquart}
pd.DataFrame(sumout)



In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Prepare the data for a line plot of time point versus tumor volume for one
# mouse treated with Capomulin

# Step 1:  Restrict to the Capomulin trial and count rows per mouse
# (looking for a mouse with many time points to graph)
is_capomulin = clean_df["Drug Regimen"] == "Capomulin"
capomulin_df = clean_df.loc[is_capomulin]
capomulin_df[["Mouse ID", "Tumor Volume (mm3)", "Timepoint"]].groupby("Mouse ID").count()

# Step 2:  Pick one mouse; timepoints go on the x-axis, tumor volume on the y-axis
is_chosen_mouse = capomulin_df["Mouse ID"] == "b128"
mouse = capomulin_df.loc[is_chosen_mouse]
x_axis, y_axis = mouse["Timepoint"], mouse["Tumor Volume (mm3)"]

In [None]:
# Select a mouse and review its data

is_b128 = capomulin_df["Mouse ID"] == "b128"
mouse = capomulin_df.loc[is_b128]
mouse


In [None]:
# Create data for x-axis (timepoints) and y-axis (tumor volume)

x_axis, y_axis = mouse["Timepoint"], mouse["Tumor Volume (mm3)"]

# Inspect both columns; the last expression (x_axis) is the one the cell displays
x_axis
y_axis
# pd.DataFrame(x_axis)
x_axis

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
# NOTE(review): the call below plots hard-coded sample values (1-5 against 10-14),
# not weight or tumor-volume data from the study -- placeholder still to be replaced.
plt.plot([1, 2, 3, 4, 5], [10, 11, 12, 13, 14])


In [None]:
# Show the graph that was created by the preceding pyplot calls
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
