## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two raw study inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared 'Mouse ID' column
# (default join type, metadata on the left)
merged_data = mouse_metadata.merge(study_results, on='Mouse ID')

# Preview the first rows of the combined table
merged_data.head()

In [None]:
# Checking the number of mice.
# The merged table holds one row per observation, and a mouse can appear at
# several timepoints, so count distinct IDs rather than rows.
merged_data['Mouse ID'].nunique()

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# A mouse is a duplicate when the same (Mouse ID, Timepoint) pair occurs more
# than once; keep=False flags every occurrence of such a pair.
duplicate_rows = merged_data.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
merged_data.loc[duplicate_rows, 'Mouse ID'].unique()

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

# IDs that have more than one record for the same timepoint
duplicate_mouse_ids = merged_data.loc[
    merged_data.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()

# Drop every row belonging to those mice (not just the repeated rows), so the
# affected animals are excluded from all later statistics.
clean_study_data_complete = merged_data[~merged_data['Mouse ID'].isin(duplicate_mouse_ids)]
clean_study_data_complete.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct IDs: each mouse contributes one row per recorded timepoint.
clean_study_data_complete['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Select the tumor-volume column BEFORE aggregating so the text columns
# (Mouse ID, Sex, ...) are never fed to the numeric statistics.
tumor_by_regimen = clean_study_data_complete.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
var = tumor_by_regimen.var()
std_dev = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# Start the table from the mean and rename its single column
sumstat = pd.DataFrame(mean)
sumstats = sumstat.rename(columns={"Tumor Volume (mm3)": "Mean"})

# Attach the remaining statistics; all share the 'Drug Regimen' index
sumstats["Median"] = median
sumstats["Variance"] = var
sumstats["Standard Deviation"] = std_dev
sumstats["SEM"] = sem

sumstats
# This method is the most straightforward, creating multiple series and putting them all together at the end.



In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

stats_data = clean_study_data_complete.groupby('Drug Regimen')
# Pick the tumor-volume column first so the aggregations only ever see numeric
# data; the result has one column per statistic.
stat_sum = stats_data["Tumor Volume (mm3)"].agg(['mean','median','var','std','sem'])
stat_sum
# This method produces everything in a single groupby function


## Bar and Pie Charts

In [None]:
# Bar chart (drawn through the pandas plotting API) of how many rows were
# recorded under each drug regimen over the course of the study.

# Non-null 'Mouse ID' entries per regimen
regimen_data_points = clean_study_data_complete.groupby("Drug Regimen")["Mouse ID"].count()

regimen_data_points.plot.bar()

# Axis labels and title for the chart just drawn
plt.xlabel("Drug Regimen")
plt.ylabel("Total Number of Mice")
plt.title("Mice Trials per Drug Regimen")

plt.show()


In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_axis = np.arange(len(regimen_data_points))
tick_locations = list(x_axis)

# Size the figure BEFORE drawing: plt.figure() opens a new, empty figure, so
# calling it after the bars are drawn would leave the chart unsized and add a
# blank second figure to the output.
plt.figure(figsize=(6,4))

plt.bar(x_axis, regimen_data_points, color = "g", width = .5)
plt.xticks(tick_locations, regimen_data_points.index.values, rotation="vertical")

plt.title("Mice Trials per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Total Number of Mice")

# Half a bar-slot of padding on each side, and a little headroom on top
plt.xlim(-0.5, len(x_axis)-.5)
plt.ylim(0, max(regimen_data_points)+5)

plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
cir_data = clean_study_data_complete.groupby(["Sex"]).count()["Mouse ID"]
color = ["yellow","green"]
explodes = (0,0)

# Draw through the pandas plotting API, as the heading asks. Wedge labels come
# from the Series index; the styling keywords are passed on to matplotlib's pie.
cir_data.plot(kind="pie", explode=explodes, colors=color,
              autopct="%1.1f%%", shadow=True, startangle=140)

# Set a title for the chart
plt.title("Distribution by Sex")
# Replace the default y-label (the Series name) with a meaningful one
plt.ylabel('Sex')


plt.show()



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
gender = clean_study_data_complete.groupby("Sex").count()
# Row count per sex, indexed by the sex label
gender_cnt = gender["Mouse ID"]

# Draw directly with pyplot, as the heading asks; the figure must be created
# before plotting for the size to apply to this chart.
plt.figure(figsize=(5, 5))
plt.pie(gender_cnt, labels=gender_cnt.index, autopct="%1.1f%%")
plt.title('Distribution by Sex')
plt.ylabel('Sex')

plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume per mouse for four regimens of interest:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# One sub-table per regimen, selected with a boolean mask on 'Drug Regimen'
capomulin_df = clean_study_data_complete[clean_study_data_complete["Drug Regimen"] == "Capomulin"]
ramicane_df = clean_study_data_complete[clean_study_data_complete["Drug Regimen"] == "Ramicane"]
infubinol_df = clean_study_data_complete[clean_study_data_complete["Drug Regimen"] == "Infubinol"]
ceftamin_df = clean_study_data_complete[clean_study_data_complete["Drug Regimen"] == "Ceftamin"]

# Greatest (i.e. last) timepoint recorded for each Capomulin mouse
capomulin_last = capomulin_df.groupby('Mouse ID')['Timepoint'].max()
capomulin_lastvol = capomulin_last.to_frame()

# Look up the full study row matching each mouse's last timepoint
capomulin_lastmerge = capomulin_lastvol.merge(
    clean_study_data_complete, on=("Mouse ID","Timepoint"), how="left"
)
capomulin_lastmerge.head()

In [None]:
# Greatest (i.e. last) timepoint recorded for each Ramicane mouse
ramicane_last = ramicane_df.groupby('Mouse ID')['Timepoint'].max()
ramicane_lastvol = ramicane_last.to_frame()

# Look up the full study row matching each mouse's last timepoint
ramicane_lastmerge = ramicane_lastvol.merge(
    clean_study_data_complete, on=("Mouse ID","Timepoint"), how="left"
)
ramicane_lastmerge.head()

In [None]:
# Greatest (i.e. last) timepoint recorded for each Infubinol mouse
infubinol_last = infubinol_df.groupby('Mouse ID')['Timepoint'].max()
infubinol_lastvol = infubinol_last.to_frame()

# Look up the full study row matching each mouse's last timepoint
infubinol_lastmerge = infubinol_lastvol.merge(
    clean_study_data_complete, on=("Mouse ID","Timepoint"), how="left"
)
infubinol_lastmerge.head()

In [None]:
# Greatest (i.e. last) timepoint recorded for each Ceftamin mouse
ceftamin_last = ceftamin_df.groupby('Mouse ID')['Timepoint'].max()
ceftamin_lastvol = ceftamin_last.to_frame()

# Look up the full study row matching each mouse's last timepoint
ceftamin_lastmerge = ceftamin_lastvol.merge(
    clean_study_data_complete, on=("Mouse ID","Timepoint"), how="left"
)
ceftamin_lastmerge.head()

In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers.


def _iqr_summary(volumes):
    """Summarise the spread of a Series of tumor volumes.

    Returns a tuple ``(quartiles, lower_q, upper_q, iqr, lower_bound,
    upper_bound)``: the 25/50/75 % quantiles, the first and third quartile,
    their difference, and the 1.5 * IQR fences rounded to two decimals.
    """
    quartiles = volumes.quantile([.25, .5, .75])
    lower_q = quartiles[0.25]
    upper_q = quartiles[0.75]
    iqr = upper_q - lower_q
    lower_bound = round(lower_q - (1.5 * iqr), 2)
    upper_bound = round(upper_q + (1.5 * iqr), 2)
    return quartiles, lower_q, upper_q, iqr, lower_bound, upper_bound


def _potential_outliers(volumes, lower_bound, upper_bound):
    """Return the entries of *volumes* lying outside the given bounds."""
    return volumes[(volumes < lower_bound) | (volumes > upper_bound)]


#Capomulin IQR and tumor volume
captumors = capomulin_lastmerge["Tumor Volume (mm3)"]
(cap_quartiles, cap_lower, cap_upper, cap_iqr,
 cap_lower_bound, cap_upper_bound) = _iqr_summary(captumors)

#Ramicane IQR and tumor volume
ramtumors = ramicane_lastmerge["Tumor Volume (mm3)"]
(ram_quartiles, ram_lower, ram_upper, ram_iqr,
 ram_lower_bound, ram_upper_bound) = _iqr_summary(ramtumors)

#Infubinol IQR and tumor volume
inftumors = infubinol_lastmerge["Tumor Volume (mm3)"]
(inf_quartiles, inf_lower, inf_upper, inf_iqr,
 inf_lower_bound, inf_upper_bound) = _iqr_summary(inftumors)

#Ceftamin IQR and tumor volume
ceftumors = ceftamin_lastmerge["Tumor Volume (mm3)"]
(cef_quartiles, cef_lower, cef_upper, cef_iqr,
 cef_lower_bound, cef_upper_bound) = _iqr_summary(ceftumors)

# Determine outliers using upper and lower bounds
print(f"The Capomulin outliers are values below {cap_lower_bound} and above {cap_upper_bound}.")
print(f"The Ramicane outliers are values below {ram_lower_bound} and above {ram_upper_bound}.")
print(f"The Infubinol outliers are values below {inf_lower_bound} and above {inf_upper_bound}.")
print(f"The Ceftamin outliers are values below {cef_lower_bound} and above {cef_upper_bound}.")

# Apply the printed bounds to the data so the cell actually reports which
# final tumor volumes (if any) are potential outliers.
for regimen_name, regimen_volumes, regimen_low, regimen_high in (
    ("Capomulin", captumors, cap_lower_bound, cap_upper_bound),
    ("Ramicane", ramtumors, ram_lower_bound, ram_upper_bound),
    ("Infubinol", inftumors, inf_lower_bound, inf_upper_bound),
    ("Ceftamin", ceftumors, cef_lower_bound, cef_upper_bound),
):
    flagged = _potential_outliers(regimen_volumes, regimen_low, regimen_high)
    print(f"{regimen_name} potential outliers: {flagged.round(2).tolist()}")

In [None]:
# Box plot comparing the final tumor volume of each mouse across the four
# regimens of interest (one box per regimen, in the order listed below)
tumor_vol = [captumors, ramtumors, inftumors, ceftumors]

fig1, ax1 = plt.subplots()
ax1.boxplot(tumor_vol, labels=["Capomulin","Ramicane","Infubinol","Ceftamin",])

# Title and axis labels
ax1.set(
    title='Tumors Volume per Regimen',
    xlabel='Drug Regimen',
    ylabel='Tumor Volume (mm3)',
)

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

#This is a line graph for all mice on the Capomulin regimen
# Draw one line per mouse. Plotting the whole table as a single series would
# join one mouse's last timepoint to the next mouse's first, adding segments
# that do not correspond to any measurement.
for _, mouse_rows in capomulin_df.groupby("Mouse ID"):
    mouse_rows = mouse_rows.sort_values("Timepoint")
    plt.plot(mouse_rows["Timepoint"], mouse_rows["Tumor Volume (mm3)"],
             linewidth=2, markersize=12)

plt.title('Capomulin Tumor Growth')
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')

# Save before show(): the figure is written out while it is still current
plt.savefig('linechart')
plt.show()


#This is a line graph for one mouse on the Capomulin regimen
line_df = capomulin_df.loc[capomulin_df["Mouse ID"] == "b128",:]

x_axis1 = line_df["Timepoint"]
tumvol1 = line_df["Tumor Volume (mm3)"]

plt.title('Capomulin Tumor Growth: Mouse B128')
plt.plot(x_axis1, tumvol1,linewidth=2, markersize=12)
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Scatter plot: mouse weight against mean tumor volume, Capomulin mice only.
# Both Series are per-mouse means indexed by 'Mouse ID'.
mouse_weight = capomulin_df.groupby("Mouse ID")["Weight (g)"].mean()
tumor_volume = capomulin_df.groupby("Mouse ID")["Tumor Volume (mm3)"].mean()

# One point per mouse
plt.scatter(x=mouse_weight, y=tumor_volume)
plt.ylabel("Tumor Volume")
plt.xlabel("Weight of Mouse")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model

# Calculate the Pearson correlation coefficient (first element of the result)
correlation = round(st.pearsonr(mouse_weight,tumor_volume)[0],2)
print(f'The correlation between weight and tumor value is {correlation}')

# Perform a linear regression of average tumor volume on mouse weight.
# The intercept is bound to `intercept`, not `int`, so the builtin int() is
# not shadowed for the rest of the session.
slope, intercept, r, p, std_err = st.linregress(mouse_weight, tumor_volume)

# Fitted tumor volume for each observed weight
y_value = slope * mouse_weight + intercept

# Plot the linear model on top of scatter plot
plt.scatter(mouse_weight,tumor_volume)
plt.xlabel("Weight of Mouse")
plt.ylabel("Tumor Volume")
plt.plot(mouse_weight,y_value,color = 'g')
# Put a tick at every observed weight
plt.xticks(mouse_weight, rotation=90)
plt.show()


