## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Echo the mouse metadata as this cell's output
mouse_metadata

In [None]:
# Echo the study results DataFrame as this cell's output
study_results

In [None]:
# Join the mouse metadata with the study results on the "Mouse ID" column
mouse_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the merged table
mouse_df.head()

In [None]:
# Count the non-null "Mouse ID" entries in the merged data.
# This is a per-row count: an ID is counted every time it appears.
mouse_count = mouse_df.loc[:, "Mouse ID"].count()
mouse_count

In [None]:
# Number of distinct "Mouse ID" values in the merged data
distinctmouse_count = mouse_df.loc[:, "Mouse ID"].nunique()
distinctmouse_count

In [None]:
# Rows whose ("Mouse ID", "Timepoint") pair repeats an earlier row
duplicate_mice = mouse_df.loc[mouse_df.duplicated(subset=["Mouse ID", "Timepoint"])]
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# A mouse counts as a duplicate when one of its (Mouse ID, Timepoint) pairs occurs
# more than once. Collect those IDs, then keep every row that belongs to them.
# Filtering on a repeated "Mouse ID" alone would instead match every mouse that
# has more than one measurement.
duplicate_ids = mouse_df.loc[
    mouse_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
allduplicate_mice = mouse_df[mouse_df["Mouse ID"].isin(duplicate_ids)]
allduplicate_mice

In [None]:
# Keep only the first row seen for each ("Mouse ID", "Timepoint") pair
uniquemice_df = mouse_df.loc[~mouse_df.duplicated(subset=["Mouse ID", "Timepoint"])]
uniquemice_df

In [None]:
# Count the non-null "Mouse ID" entries left after de-duplication.
# This is a per-row count, not a count of distinct mice.
uniquemice_count = uniquemice_df.loc[:, "Mouse ID"].count()
uniquemice_count

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation, and SEM.

# Start by narrowing the cleaned data to the two columns the summary needs:
# the regimen (grouping key) and the tumor volume (value being summarised).
summary_data = uniquemice_df.loc[:, ["Drug Regimen", "Tumor Volume (mm3)"]]
summary_data

In [None]:
# Group by regimen and focus on the tumor volume column
grouped_data = summary_data.groupby(["Drug Regimen"])
tumor_volume = grouped_data["Tumor Volume (mm3)"]

# Column heading -> name of the groupby method that produces it
stat_methods = {
    "Mean": "mean",
    "Median": "median",
    "Variance": "var",
    "Standard Deviation": "std",
    "SEM": "sem",
}

# Compute each statistic per regimen, formatted as a two-decimal string
summary_df = pd.DataFrame({
    heading: getattr(tumor_volume, method)().map('{:.2f}'.format)
    for heading, method in stat_methods.items()
})

summary_df

In [None]:
# Same per-regimen summary (mean, median, variance, standard deviation, SEM),
# this time produced with a single aggregation call on the tumor volume column.
aggdata = summary_data.groupby(["Drug Regimen"])
altsummary = aggdata["Tumor Volume (mm3)"].agg(['mean', 'median', 'var', 'std', 'sem'])
altsummary


## Bar and Pie Charts

In [None]:
# Bar chart (pandas plotting API): number of measurements recorded per drug regimen.

# Count the rows under each regimen and keep the result as a one-column frame
regimen_group = uniquemice_df.groupby('Drug Regimen')
regimen_count = regimen_group['Drug Regimen'].count().to_frame()

# Draw the bars
bar_chart = regimen_count.plot.bar(title="Total Measurements Taken by Drug Regimen", color="blue")

# Axis labels
bar_chart.set(xlabel="Drug Regimen", ylabel="Count")
plt.tight_layout()

# Write the figure to disk, then display it
plt.savefig("Charts/BarChartPandas.png")

plt.show()


In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.

# One bar position (and tick) per regimen
X_Axis = np.arange(len(regimen_count))
tick_locations = list(X_Axis)

# Figure size, bars, and regimen names as horizontal tick labels.
# rotation must be a number (or 'horizontal' / 'vertical'), not a numeric string.
plt.figure(figsize=(16,7))
plt.bar(X_Axis, regimen_count['Drug Regimen'], color='green', alpha=0.4)
plt.xticks(tick_locations, list(regimen_count.index), rotation=0)

# Set Limits
plt.xlim(-0.75, len(X_Axis) - 0.25)
plt.ylim(0, 275)

# Title and axis labels; the bars are measurement counts, so the y axis is a count
plt.title("Total Measurements Taken by Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Count")
plt.tight_layout()

# Save chart
plt.savefig("Charts/BarChartPyplot.png")

plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Grouping by sex (not used by the chart below)
mice_gender = uniquemice_df.groupby('Sex')

# Number of rows per sex.
# NOTE(review): uniquemice_df is de-duplicated per (Mouse ID, Timepoint), so a mouse
# with several timepoints is counted several times here -- confirm that
# per-measurement shares (rather than per-mouse shares) are intended.
mice_gender_count = uniquemice_df["Sex"].value_counts()

plt.figure()
mice_gender_count.plot(kind="pie", autopct='%1.1f%%',shadow=False, startangle=140, fontsize=14, colors =["gray","purple"], legend =False)
plt.axis("equal")
plt.title("Distribution of Male versus Female Mice")
# Lay out once, after the title and aspect have been set
plt.tight_layout()
plt.savefig("Charts/PieChartPandas.png")
plt.show()

In [None]:
# Pie chart (pyplot API): distribution of female versus male mice.

# Slice labels come from the index of the per-sex counts
micegender = mice_gender_count.index.tolist()

# One colour per slice
colors = ["gray", "purple"]

# Draw the pie; autopct prints each slice's percentage share
plt.pie(
    mice_gender_count,
    labels=micegender,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=140,
)
plt.rcParams['font.size'] = 16
plt.title("Distribution of Male versus Female Mice")
plt.ylabel("Sex")
plt.axis("equal")
plt.savefig("Charts/PieChartPyplot.png")
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Split the cleaned data into one frame per treatment regimen of interest:
# Capomulin, Ramicane, Infubinol, and Ceftamin
regimen_column = uniquemice_df["Drug Regimen"]

Capomulin_df = uniquemice_df[regimen_column == "Capomulin"]
Ramicane_df = uniquemice_df[regimen_column == "Ramicane"]
Infubinol_df = uniquemice_df[regimen_column == "Infubinol"]
Ceftamin_df = uniquemice_df[regimen_column == "Ceftamin"]

In [None]:
# Capomulin: measurements taken at each mouse's final timepoint.

# Last recorded timepoint per mouse. The column is selected before aggregating so
# that only "Timepoint" is reduced, instead of taking the max of every column.
Capomulinlast = Capomulin_df.groupby('Mouse ID')['Timepoint'].max()
Capomulinlastvol = pd.DataFrame(Capomulinlast)

# Left-join back onto the cleaned data to pull in the row recorded at that
# (Mouse ID, Timepoint) pair.
Capomulinlastmerge = pd.merge(Capomulinlastvol, uniquemice_df, on=["Mouse ID", "Timepoint"], how="left")
Capomulinlastmerge.head(5)

In [None]:
def _final_timepoint_rows(regimen_df, all_df):
    """Return the rows of ``all_df`` recorded at each mouse's last timepoint.

    ``regimen_df`` supplies the mice of interest: for every "Mouse ID" in it the
    largest "Timepoint" is looked up and left-joined onto ``all_df`` on
    ("Mouse ID", "Timepoint").
    """
    last_timepoints = regimen_df.groupby("Mouse ID")["Timepoint"].max().to_frame()
    return pd.merge(last_timepoints, all_df, on=["Mouse ID", "Timepoint"], how="left")


def _report_outlier_bounds(regimen, volumes):
    """Print and return the IQR-based outlier bounds for ``volumes``.

    Returns ``(lowerq, upperq, iqr, lower_bound, upper_bound)`` where the bounds
    are ``lowerq - 1.5 * iqr`` and ``upperq + 1.5 * iqr``. Values of ``volumes``
    outside those bounds are listed in the printed report.
    """
    lowerq, upperq = volumes.quantile([0.25, 0.75])
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]
    print(f"{regimen}: values below {lower_bound} or above {upper_bound} could be outliers.")
    print(f"{regimen}: {len(outliers)} potential outlier(s) found: {outliers.tolist()}")
    return lowerq, upperq, iqr, lower_bound, upper_bound


# Capomulin (final-timepoint rows were built in the previous cell)
Capomulintumors = Capomulinlastmerge["Tumor Volume (mm3)"]
lowerq, upperq, iqr, lower_bound, upper_bound = _report_outlier_bounds("Capomulin", Capomulintumors)

# Ramicane
Ramicanelastmerge = _final_timepoint_rows(Ramicane_df, uniquemice_df)
Ramicanetumors = Ramicanelastmerge["Tumor Volume (mm3)"]
lowerq2, upperq2, iqr2, lower_bound2, upper_bound2 = _report_outlier_bounds("Ramicane", Ramicanetumors)

# Infubinol
Infubinollastmerge = _final_timepoint_rows(Infubinol_df, uniquemice_df)
Infubinoltumors = Infubinollastmerge["Tumor Volume (mm3)"]
lowerq3, upperq3, iqr3, lower_bound3, upper_bound3 = _report_outlier_bounds("Infubinol", Infubinoltumors)

# Ceftamin
Ceftaminlastmerge = _final_timepoint_rows(Ceftamin_df, uniquemice_df)
Ceftamintumors = Ceftaminlastmerge["Tumor Volume (mm3)"]
lowerq4, upperq4, iqr4, lower_bound4, upper_bound4 = _report_outlier_bounds("Ceftamin", Ceftamintumors)

In [None]:
# Box-and-whisker plot of each mouse's final tumor volume for the four regimens of interest
regimen_labels = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
plot_tumor = [Capomulintumors, Ramicanetumors, Infubinoltumors, Ceftamintumors]

fig1, ax1 = plt.subplots()
ax1.set(
    title='Final Tumor Volume by Drug Regimen',
    ylabel='Final Tumor Volume (mm3)',
    xlabel='Drug Regimen',
)
plt.rcParams['font.size'] = 14
ax1.boxplot(plot_tumor, labels=regimen_labels)

# Write the figure to disk, then display it
plt.savefig("Charts/BoxWhisker.png")

plt.show()

## Line and Scatter Plots

In [None]:
# Select the Capomulin rows of the cleaned data
is_capomulin = uniquemice_df['Drug Regimen'] == 'Capomulin'
capomulin_treatment = uniquemice_df[is_capomulin]

capomulin_treatment.head()

In [None]:
# Mean tumor volume over the Capomulin rows at each timepoint, rounded to 3 decimals.
# The aggregation is named by string ("mean") rather than passed as np.mean:
# recent pandas versions warn when a NumPy callable is given to a groupby aggregation.
capomulin_vol_tumor = (
    capomulin_treatment
    .groupby(['Timepoint'])
    .agg(Tumor_Vol_Mean=('Tumor Volume (mm3)', 'mean'))
    .round(3)
)

# View the groupby dataframe
capomulin_vol_tumor.head(10)


In [None]:
# Timepoints for the x axis
x_series = list(capomulin_vol_tumor.index.values)

# Plot the mean tumor volume as a blue line with square markers
plt.errorbar(
    x_series,
    capomulin_vol_tumor['Tumor_Vol_Mean'],
    label="Time Series of Tumor Volume for Capomulin",
    fmt="bs-",
    linewidth=2
    )

# Add the descriptive title, x labels and y labels
plt.title("Time Series of Tumor Volume for Capomulin")
plt.xlabel("Time (days)")
plt.ylabel("Tumor Volume (mm3)")

# Pad the axis limits by 5% so the first and last points do not sit on the frame.
# Both x limits are derived from x_series.
plt.xlim(min(x_series)-max(x_series)*0.05, max(x_series)*1.05)
plt.ylim(min(capomulin_vol_tumor['Tumor_Vol_Mean'])*0.95, max(capomulin_vol_tumor['Tumor_Vol_Mean'])*1.05)
# Set after this figure already exists, so it applies to figures created afterwards
plt.rcParams["figure.figsize"] = [8,7]

# Save the figure
plt.savefig("Charts/LineTumorVolTime.png")

plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Per-mouse mean weight and mean tumor volume, rounded to 3 decimals.
# Aggregations are named by string ("mean") because recent pandas versions warn
# when a NumPy callable such as np.mean is given to a groupby aggregation.
capomulin_mouse_id = (
    capomulin_treatment
    .groupby(['Mouse ID'])
    .agg(Mouse_weight=('Weight (g)', 'mean'), Tumor_vol_avg=('Tumor Volume (mm3)', 'mean'))
    .round(3)
)

# Scatter plot from the above dataframe; marker size follows the average tumor volume
plt.scatter(
    capomulin_mouse_id['Mouse_weight'],
    capomulin_mouse_id['Tumor_vol_avg'],
    marker='o',
    facecolors='red',
    edgecolors='black',
    s=capomulin_mouse_id['Tumor_vol_avg'],
    alpha=.75)

# Create a title, x label, and y label for our chart
plt.title("Mouse Weight vs. Avg. Tumor Volume")
plt.xlabel("Mouse weight (g)")
plt.ylabel("Tumor Volume (mm3)")
# Save the figure
plt.savefig("Charts/ScatterWeightTumor.png")

plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# Pearson correlation; element 0 of the result is the coefficient r
correlationcoeff = st.pearsonr(capomulin_mouse_id['Mouse_weight'],capomulin_mouse_id['Tumor_vol_avg'])
print(f"The correlation between both factors is {round(correlationcoeff[0],2)}")

# Least-squares line: Tumor_vol_avg = slope * Mouse_weight + intercept
regression = st.linregress(capomulin_mouse_id['Mouse_weight'], capomulin_mouse_id['Tumor_vol_avg'])
print(f"The linear regression model is y = {round(regression.slope, 2)}x + {round(regression.intercept, 2)}")
