## Observations and Insights

## Dependencies and starter code

In [1]:
%matplotlib notebook

In [2]:
## Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np


In [3]:
# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"


In [4]:
# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

In [5]:
#Combine data together: 

merged_data_df=pd.merge(mouse_metadata,study_results, how = "outer", on = "Mouse ID")



In [6]:
merged_data_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


## Summary statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and 
#SEM of the tumor volume for each regimen

#I created a groupby to get the data sorted into categories by each Drug Regiment so that I can now run statistical 
#analysis on

summary_stats = pd.DataFrame(merged_data_df.groupby("Drug Regimen").count())


In [35]:
Mean=np.mean(summary_stats)
# take values and make into a column
print(Mean.tolist())

[189.3, 189.3, 189.3, 189.3, 189.3, 189.3, 189.3]


In [174]:

#summarystats = summarystats.rename(columns = {"Mouse ID" : "Trials"})

#summarystats.head()

Mean = pd.DataFrame(merged_data_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"] ).mean())
Mean_= Mean.rename(columns={"Tumor Volume (mm3)":"Tumor Volume Mean"})

In [175]:
Median = pd.DataFrame(merged_data_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"] ).median())

In [176]:
Median_= Median.rename(columns={"Tumor Volume (mm3)":"Tumor Volume Median"})

In [177]:
Standard_deviation= pd.DataFrame(merged_data_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"] ).std())

In [178]:
Standard_deviation_=Standard_deviation.rename(columns={"Tumor Volume (mm3)":"Tumor Volume SD"})

In [179]:
Variance=pd.DataFrame(merged_data_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"] ).var())

In [180]:
Variance_= Variance.rename(columns={"Tumor Volume (mm3)":"Tumor Volume Variance"})

In [181]:
SEM=pd.DataFrame(merged_data_df[["Drug Regimen","Tumor Volume (mm3)"]].groupby(["Drug Regimen"] ).sem())

In [184]:
SEM_= SEM.rename(columns={"Tumor Volume (mm3)":"Tumor Volume SEM"})

In [187]:
all_=[Mean_,Median_,Standard_deviation_,Variance_,SEM_]

In [189]:
all_

[              Tumor Volume Mean
 Drug Regimen                   
 Capomulin             40.675741
 Ceftamin              52.591172
 Infubinol             52.884795
 Ketapril              55.235638
 Naftisol              54.331565
 Placebo               54.033581
 Propriva              52.322552
 Ramicane              40.216745
 Stelasyn              54.233149
 Zoniferol             53.236507,               Tumor Volume Median
 Drug Regimen                     
 Capomulin               41.557809
 Ceftamin                51.776157
 Infubinol               51.820584
 Ketapril                53.698743
 Naftisol                52.509285
 Placebo                 52.288934
 Propriva                50.854632
 Ramicane                40.673236
 Stelasyn                52.431737
 Zoniferol               51.818479,               Tumor Volume SD
 Drug Regimen                 
 Capomulin            4.994774
 Ceftamin             6.268188
 Infubinol            6.567243
 Ketapril             8.27970

In [149]:
all_df=[Mean,Median,Standard_deviation,Variance,SEM]

In [42]:
# By doing a .describe I am able to get a summary of statistics (mean, median, variance, standard deviation) for each regimen 
summary_stats.describe()

Unnamed: 0,Mouse ID,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,189.3,189.3,189.3,189.3,189.3,189.3,189.3
std,22.146231,22.146231,22.146231,22.146231,22.146231,22.146231,22.146231
min,161.0,161.0,161.0,161.0,161.0,161.0,161.0
25%,178.75,178.75,178.75,178.75,178.75,178.75,178.75
50%,181.5,181.5,181.5,181.5,181.5,181.5,181.5
75%,187.5,187.5,187.5,187.5,187.5,187.5,187.5
max,230.0,230.0,230.0,230.0,230.0,230.0,230.0


## Bar plots

In [None]:
x = ['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo', 'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol']

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas 

#First, I counted the datapoints for each treatment regimen and called it my y for the y-axis

y=merged_data_df['Drug Regimen'].value_counts()

In [None]:
y

In [None]:
#creating this bar chart through pandas allows us to see the data points/amounts for each treatment regimen

merged_data_df['Drug Regimen'].value_counts().plot.bar()

In [None]:
plt.show()

In [None]:
np.arange(0,len('Drug Regimen'))

In [None]:
def plot_bar_x():
    index = ['Capomulin','Ramicane','Ketapril','Naftisol','Zoniferol','Placebo','Stelasyn','Infubinol','Ceftamin','Propriva']
    y_axis = [230,228,188,186,182,181,181,178,178,161]
    plt.bar(index,y_axis)
    plt.xticks(y_axis,index)
    plt.title("The Amount of Drug Regimen")
    plt.xlabel("Drug Regimen")
    plt.ylabel("The Amount per Drug Regimen")
    plt.show()

In [None]:
#plot_bar_x()

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
merged_data_df['Sex'].value_counts()

In [None]:
merged_data_df['Sex'].value_counts().plot.pie()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
def pie_plot(): 
    labels = ["Females", "Males"]
    counts= [935,958]
    colors = ["pink","blue"]
    plt.title("Female Mice vs. Male Mice")
    plt.pie(counts,labels=labels,colors=colors,shadow=True,startangle=90)
    plt.axis('equal')
    plt.show()
    

In [None]:
pie_plot()

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. 

#Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [None]:
filtered_df = merged_data_df.loc[(["Drug Regimen"] == "Capomulin")| (merged_data_df["Drug Regimen"]=="Ramicane") 
| (merged_data_df["Drug Regimen"] == "Infubinol") | (merged_data_df["Drug Regimen"] == "Ceftamin"), :]

In [None]:
filtered_df = filtered_df.sort_values("Timepoint", ascending = False)

In [None]:
# getting rid of any duplicates
filtered_df = filtered_df.drop_duplicates(subset="Mouse ID", keep='first')

In [None]:
# Determine quartiles
quartiles = filtered_df['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

In [None]:
#upper and lower bounds
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

In [None]:
#outliers
outliers_df = filtered_df.loc[(filtered_df['Tumor Volume (mm3)'] > upper_bound) | (filtered_df['Tumor Volume (mm3)'] < lower_bound), :]
outliers_df

In [None]:
# I did not see any outliers

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

Tumor_Volume = filtered_df['Tumor Volume (mm3)']
fig1, ax1 = plt.subplots()
ax1.set_title('Tumor Volume of Mice')
ax1.set_ylabel('Tumor Volume')
ax1.boxplot(Tumor_Volume)
plt.show()

## Line and scatter plots

In [13]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
Capomulin_df = merged_data_df.loc[(merged_data_df["Drug Regimen"] == "Capomulin"),:]

In [14]:
Capomulin_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
10,s185,Capomulin,Female,3,17,0,45.0,0
11,s185,Capomulin,Female,3,17,5,43.878496,0
12,s185,Capomulin,Female,3,17,10,37.614948,0
13,s185,Capomulin,Female,3,17,15,38.177232,0
14,s185,Capomulin,Female,3,17,20,36.866876,0


In [15]:
timepoint = Capomulin_df["Timepoint"]
tumor_volume = Capomulin_df["Tumor Volume (mm3)"]

In [16]:
def line_plot():
    plt.xlabel('Timepoint')
    plt.ylabel('Tumor Volume')
    plt.title('Tumor Volume over Time for Capomulin Mice')
    plt.show()

In [17]:
line_plot()

<IPython.core.display.Javascript object>

In [18]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regime
scatter= merged_data_df.plot(kind='scatter',x='Weight (g)',y='Tumor Volume (mm3)',color='red')
plt.show()

<IPython.core.display.Javascript object>

In [19]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume 
#for the Capomulin regimen

In [20]:

# values for x and y values
mouse_weight = Capomulin_df.groupby(Capomulin_df["Mouse ID"])["Weight (g)"].mean()
tumor_volume = Capomulin_df.groupby(Capomulin_df["Mouse ID"])["Tumor Volume (mm3)"].mean()

# linear regression o
slope, int, r, p, std_err = st.linregress(mouse_weight, tumor_volume)
      

fit = slope * mouse_weight + int


In [21]:
# Plot the linear model on top of scatter plot 
def cor_graph():
    plt.scatter(mouse_weight,tumor_volume)
    plt.xlabel("Weight of Mouse")
    plt.ylabel("Tumor Volume")
    plt.plot(mouse_weight,fit,"--")
    plt.xticks(mouse_weight, rotation=90)
    plt.show()

In [22]:
cor_graph()