## Observations and Insights

## Dependencies and starter code

In [116]:
%matplotlib notebook

In [117]:
## Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np


In [118]:
# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"


In [119]:
# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

In [120]:
#Combine data together: 

merged_data_df=pd.merge(mouse_metadata,study_results, how = "outer", on = "Mouse ID")



In [121]:
merged_data_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


## Summary statistics

In [122]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
#SEM of the tumor volume for each regimen

#I created a groupby to get the data sorted into categories by each Drug Regiment so that I can now run statistical 
#analysis on

summary_stats = pd.DataFrame(merged_data_df.groupby("Drug Regimen").count())


In [123]:
#summary_stats["Mean"] = pd.DataFrame(merged_data_df.groupby(['Drug Regimen','Tumor Volume (mm3)']).mean())
#summary_stats["Median"] = pd.DataFrame(merged_data_df.groupby(["Drug Regimen","Tumor Volume (mm3)"]).median())
#summary_stats["Standard Deviation"] = pd.DataFrame(merged_data_df.groupby(["Drug Regimen","Tumor Volume (mm3)"]).std())
#summary_stats["Variance"] = pd.DataFrame(merged_data_df.groupby(["Drug Regimen","Tumor Volume (mm3)"]).var())
#summary_stats["SEM"] = pd.DataFrame(merged_data_df.groupby(["Drug Regimen","Tumor Volume (mm3)"]).sem())
#summary_stats = summary_stats[["Mouse ID", "Mean", "Median", "Standard Deviation", "Variance", "SEM"]]

#Rename count column
#summarystats = summarystats.rename(columns = {"Mouse ID" : "Trials"})

#summarystats.head()

In [124]:
dict(list (summary_stats))

ValueError: dictionary update sequence element #0 has length 8; 2 is required

In [125]:
#Gives group list of treatments
summary_stats.groups.keys()

AttributeError: 'DataFrame' object has no attribute 'groups'

In [126]:
# By doing a .describe I am able to get a summary of statistics (mean, median, variance, standard deviation) for each regimen 
summary_stats.describe()

Unnamed: 0,Mouse ID,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,189.3,189.3,189.3,189.3,189.3,189.3,189.3
std,22.146231,22.146231,22.146231,22.146231,22.146231,22.146231,22.146231
min,161.0,161.0,161.0,161.0,161.0,161.0,161.0
25%,178.75,178.75,178.75,178.75,178.75,178.75,178.75
50%,181.5,181.5,181.5,181.5,181.5,181.5,181.5
75%,187.5,187.5,187.5,187.5,187.5,187.5,187.5
max,230.0,230.0,230.0,230.0,230.0,230.0,230.0


## Bar plots

In [127]:
x = ['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol']

In [128]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas 

#First, I counted the datapoints for each treatment regimen and called it my y for the y-axis

y=merged_data_df['Drug Regimen'].value_counts()

In [129]:
y

Capomulin    230
Ramicane     228
Ketapril     188
Naftisol     186
Zoniferol    182
Stelasyn     181
Placebo      181
Ceftamin     178
Infubinol    178
Propriva     161
Name: Drug Regimen, dtype: int64

In [130]:
#creating this bar chart through pandas allows us to see the data points/amounts for each treatment regimen

merged_data_df['Drug Regimen'].value_counts().plot.bar()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a1b551198>

In [133]:
plt.show()

In [134]:
np.arange(0,len('Drug Regimen'))

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [138]:
def plot_bar_x():
    index = ['Capomulin','Ramicane','Ketapril','Naftisol','Zoniferol','Placebo','Stelasyn','Infubinol','Ceftamin','Propriva']
    y_axis = [230,228,188,186,182,181,181,178,178,161]
    plt.bar(index,y_axis)
    plt.xticks(y_axis,index)
    plt.title("The Amount of Drug Regimen")
    plt.xlabel("Drug Regimen")
    plt.ylabel("The Amount per Drug Regimen")
    plt.show()

In [136]:
#plot_bar_x()

## Pie plots

In [60]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
merged_data_df['Sex'].value_counts()

Male      958
Female    935
Name: Sex, dtype: int64

In [61]:
merged_data_df['Sex'].value_counts().plot.pie()


<matplotlib.axes._subplots.AxesSubplot at 0x1a19a23390>

In [139]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
def pie_plot(): 
    labels = ["Females", "Males"]
    counts= [935,958]
    colors = ["pink","blue"]
    plt.title("Female Mice vs. Male Mice")
    plt.pie(counts,labels=labels,colors=colors,shadow=True,startangle=90)
    plt.axis('equal')
    plt.show()
    

In [140]:
pie_plot()

## Quartiles, outliers and boxplots

In [64]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. 

#Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [71]:
filtered_df = merged_data_df.loc[(["Drug Regimen"] == "Capomulin")| (merged_data_df["Drug Regimen"]=="Ramicane") 
| (merged_data_df["Drug Regimen"] == "Infubinol") | (merged_data_df["Drug Regimen"] == "Ceftamin"), :]

In [72]:
filtered_df = filtered_df.sort_values("Timepoint", ascending = False)

In [None]:
# getting rid of any duplicates
filtered_df = filtered_df.drop_duplicates(subset="Mouse ID", keep='first')

In [73]:
# Determine quartiles
quartiles = filtered_df['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

In [74]:
#upper and lower bounds
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

In [75]:
#outliers
outliers_df = filtered_df.loc[(filtered_df['Tumor Volume (mm3)'] > upper_bound) | (filtered_df['Tumor Volume (mm3)'] < lower_bound), :]
outliers_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
637,c139,Infubinol,Male,11,28,45,72.226731,2
9,k403,Ramicane,Male,21,16,45,22.050126,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
636,c139,Infubinol,Male,11,28,40,69.428141,2
7,k403,Ramicane,Male,21,16,35,26.546993,1


In [None]:
# I did not see any outliers

In [76]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

Tumor_Volume = filtered_df['Tumor Volume (mm3)']
fig1, ax1 = plt.subplots()
ax1.set_title('Tumor Volume of Mice')
ax1.set_ylabel('Tumor Volume')
ax1.boxplot(Tumor_Volume)
plt.show()

<IPython.core.display.Javascript object>

## Line and scatter plots

In [100]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
Capomulin_df = merged_data_df.loc[(merged_data_df["Drug Regimen"] == "Capomulin"),:]

In [101]:
Capomulin_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
10,s185,Capomulin,Female,3,17,0,45.0,0
11,s185,Capomulin,Female,3,17,5,43.878496,0
12,s185,Capomulin,Female,3,17,10,37.614948,0
13,s185,Capomulin,Female,3,17,15,38.177232,0
14,s185,Capomulin,Female,3,17,20,36.866876,0


In [102]:
timepoint = Capomulin_df["Timepoint"]
tumor_volume = Capomulin_df["Tumor Volume (mm3)"]

In [103]:
def line_plot():
    plt.xlabel('Timepoint')
    plt.ylabel('Tumor Volume')
    plt.title('Tumor Volume over Time for Capomulin Mice')
    plt.show()

In [104]:
line_plot()

<IPython.core.display.Javascript object>

In [100]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regime
scatter= merged_data_df.plot(kind='scatter',x='Weight (g)',y='Tumor Volume (mm3)',color='red')
plt.show()

<IPython.core.display.Javascript object>

In [11]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume 
#for the Capomulin regimen

In [106]:

# values for x and y values
mouse_weight = Capomulin_df.groupby(Capomulin_df["Mouse ID"])["Weight (g)"].mean()
tumor_volume = Capomulin_df.groupby(Capomulin_df["Mouse ID"])["Tumor Volume (mm3)"].mean()

# linear regression o
slope, int, r, p, std_err = st.linregress(mouse_weight, tumor_volume)
      

fit = slope * mouse_weight + int


In [110]:
# Plot the linear model on top of scatter plot 
def cor_graph():
    plt.scatter(mouse_weight,tumor_volume)
    plt.xlabel("Weight of Mouse")
    plt.ylabel("Tumor Volume")
    plt.plot(mouse_weight,fit,"--")
    plt.xticks(mouse_weight, rotation=90)
    plt.show()

In [115]:
cor_graph()