In [None]:
# Observations:
# There is a positive correlation between mouse weight and tumor volume
# Ketapril was the least effective regimen
# Ramicane was the most effective regimen

In [74]:
%matplotlib notebook

In [75]:
# Dependencies and Setup
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np

# Study data files
mouse_metadata = " Mouse_metadata.csv"
study_results = "Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset
new_table = pd.merge(mouse_metadata, study_results, on="Mouse ID")
merge_table = new_table.groupby('Drug Regimen')
merge_table.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
10,s185,Capomulin,Female,3,17,0,45.0,0
11,s185,Capomulin,Female,3,17,5,43.878496,0
12,s185,Capomulin,Female,3,17,10,37.614948,0
13,s185,Capomulin,Female,3,17,15,38.177232,0
14,s185,Capomulin,Female,3,17,20,36.866876,0


In [76]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
mean_df = merge_table["Tumor Volume (mm3)"].mean()
median_df = merge_table["Tumor Volume (mm3)"].median()
var_df = merge_table["Tumor Volume (mm3)"].var()
std_df = merge_table["Tumor Volume (mm3)"].std()
sem_df = merge_table["Tumor Volume (mm3)"].sem()
merge = pd.merge(mean_df, median_df, on="Drug Regimen")
merge2 = pd.merge(merge, var_df, on="Drug Regimen")
merge3 = pd.merge(merge2, std_df, on="Drug Regimen")
summary = pd.merge(merge3, sem_df, on="Drug Regimen")



summary.columns = ['Mean Tumor Vol.','Median Tumor Vol.','Tumor Vol. Variance','Tumor Vol. Standard Deviation','SEM of Drugs']
summary



Unnamed: 0_level_0,Mean Tumor Vol.,Median Tumor Vol.,Tumor Vol. Variance,Tumor Vol. Standard Deviation,SEM of Drugs
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.322552,50.854632,42.35107,6.50777,0.512884
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [149]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas 
multi_plot = summary.plot(kind="bar", figsize=(7,10))

plt.title("Drug Regimen Statistics")
plt.ylabel("Tumor Vol.")

plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [148]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot 
x_axis = np.arange(len(summary))
tick_locations = [value for value in x_axis] 
w = .1

plt.figure(figsize=(7,10))
plt.bar(x_axis, summary["Mean Tumor Vol."], w, color='tab:blue', alpha=1, align="center", label="Mean Tumor Vol.")
plt.bar(x_axis+w, summary["Median Tumor Vol."], w, color='tab:orange', alpha=1, align="center", label="Median Tumor Vol.")
plt.bar(x_axis+w*2, summary["Tumor Vol. Variance"], w, color='tab:green', alpha=1, align="center", label="Tumor Vol. Variance")
plt.bar(x_axis+w*3, summary["Tumor Vol. Standard Deviation"], w, color='tab:red', alpha=1, align="center", label="Tumor Vol. Standard Deviation")
plt.bar(x_axis+w*4, summary["SEM of Drugs"], w, color='tab:purple', alpha=1, align="center", label="SEM of Drugs")

plt.xlim(-0.25, len(x_axis)-0.25)
plt.ylim(0, max(summary["Tumor Vol. Variance"])+4)
plt.title("Drug Regimen Statistics")
plt.xlabel("Drug Regimen")
plt.ylabel("Tumor Vol.")
plt.legend(loc="upper right")

plt.xticks(tick_locations, summary.index, rotation="vertical")

plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [128]:
# Generate a pie plot showing the distribution of female versus male mice using pandas 
df = pd.DataFrame({'Sex': [958, 935],},
                  index=['Male', 'Female',])
plot = df.plot.pie(y="Sex", figsize=(5, 5), autopct="%1.1f%%", startangle=140, legend=False)
plt.axis("equal")
plt.title("Male Vs. Female")



<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Male Vs. Female')

In [127]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot 
sex = new_table.groupby('Sex')

labels = ["Male", "Female"]
sizes = [958, 935]
colors = ["steelblue", "darkorange"]
explode = (0, 0,)

plt.title("Male Vs. Female")
plt.pie(sizes, explode=explode, labels=labels, colors=colors, 
        autopct="%1.1f%%", shadow=False, startangle=140)

plt.axis("equal")
plt.show()


<IPython.core.display.Javascript object>

In [85]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.
last = new_table.loc[new_table["Timepoint"] == 45, ["Mouse ID", "Drug Regimen", "Tumor Volume (mm3)"]]
drug = last.loc[(last["Drug Regimen"] == "Capomulin") | (last["Drug Regimen"] == "Ramicane") | (last["Drug Regimen"] == "Infubinol") | (last["Drug Regimen"] == "Ceftamin"), :]
drug.drop_duplicates(subset ="Mouse ID", 
                     keep = False, inplace = True)
drug_merge = drug.groupby('Drug Regimen')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [86]:
capo = drug.loc[drug["Drug Regimen"] == "Capomulin", :]

quartiles = capo['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
print('Capomulin IQR')
print(f"The lower quartile of tumor volume is: {lowerq}")
print(f"The upper quartile of tumor volume is: {upperq}")
print(f"The interquartile range of tumor volume is: {iqr}")
print(f"The the median of tumor volume is: {quartiles[0.5]} ")
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

Capomulin IQR
The lower quartile of tumor volume is: 32.37735684
The upper quartile of tumor volume is: 40.1592203
The interquartile range of tumor volume is: 7.781863460000004
The the median of tumor volume is: 37.31184577 
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [87]:
rami = drug.loc[drug["Drug Regimen"] == "Ramicane", :]

quartiles = rami['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
print('Ramicane IQR')
print(f"The lower quartile of tumor volume is: {lowerq}")
print(f"The upper quartile of tumor volume is: {upperq}")
print(f"The interquartile range of tumor volume is: {iqr}")
print(f"The the median of tumor volume is: {quartiles[0.5]} ")
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

Ramicane IQR
The lower quartile of tumor volume is: 30.981175224999998
The upper quartile of tumor volume is: 38.508305307499995
The interquartile range of tumor volume is: 7.527130082499998
The the median of tumor volume is: 34.848627300000004 
Values below 19.690480101250003 could be outliers.
Values above 49.79900043124999 could be outliers.


In [88]:
infu = drug.loc[drug["Drug Regimen"] == "Infubinol", :]

quartiles = infu['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
print('Infubinol IQR')
print(f"The lower quartile of tumor volume is: {lowerq}")
print(f"The upper quartile of tumor volume is: {upperq}")
print(f"The interquartile range of tumor volume is: {iqr}")
print(f"The the median of tumor volume is: {quartiles[0.5]} ")
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

Infubinol IQR
The lower quartile of tumor volume is: 62.75445141
The upper quartile of tumor volume is: 67.68556862
The interquartile range of tumor volume is: 4.9311172099999965
The the median of tumor volume is: 66.08306589 
Values below 55.35777559500001 could be outliers.
Values above 75.08224443499999 could be outliers.


In [89]:
ceft = drug.loc[drug["Drug Regimen"] == "Ceftamin", :]

quartiles = ceft['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq
print('Ceftamin IQR')
print(f"The lower quartile of tumor volume is: {lowerq}")
print(f"The upper quartile of tumor volume is: {upperq}")
print(f"The interquartile range of tumor volume is: {iqr}")
print(f"The the median of tumor volume is: {quartiles[0.5]} ")
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

Ceftamin IQR
The lower quartile of tumor volume is: 61.43389223
The upper quartile of tumor volume is: 67.52748237
The interquartile range of tumor volume is: 6.093590140000003
The the median of tumor volume is: 64.29983003 
Values below 52.29350701999999 could be outliers.
Values above 76.66786758 could be outliers.


In [95]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest 
tumor = drug['Tumor Volume (mm3)']

fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume')
ax1.set_ylabel('Volume (mm3)')
ax1.boxplot(tumor)
plt.show()

<IPython.core.display.Javascript object>

In [112]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin 
line = new_table.loc[new_table["Mouse ID"] == "s185", ["Mouse ID", "Timepoint", "Drug Regimen", "Tumor Volume (mm3)"]]
line

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Tumor Volume (mm3)
10,s185,0,Capomulin,45.0
11,s185,5,Capomulin,43.878496
12,s185,10,Capomulin,37.614948
13,s185,15,Capomulin,38.177232
14,s185,20,Capomulin,36.866876
15,s185,25,Capomulin,33.94994
16,s185,30,Capomulin,32.959671
17,s185,35,Capomulin,28.328531
18,s185,40,Capomulin,25.472143
19,s185,45,Capomulin,23.343598


In [113]:
x_axis = np.arange(0,46,5)
x_axis

array([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45])

In [114]:
points = line["Tumor Volume (mm3)"]
points

10    45.000000
11    43.878496
12    37.614948
13    38.177232
14    36.866876
15    33.949940
16    32.959671
17    28.328531
18    25.472143
19    23.343598
Name: Tumor Volume (mm3), dtype: float64

In [115]:
tumor_volume, = plt.plot(x_axis, points, marker="+",color="blue", linewidth=1)
plt.title("s185 Data")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")


<IPython.core.display.Javascript object>

Text(0, 0.5, 'Tumor Volume (mm3)')

In [100]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen 
scatter = new_table.loc[new_table["Drug Regimen"] == "Capomulin", ["Weight (g)", "Drug Regimen", "Tumor Volume (mm3)"]]
scatter = scatter.groupby("Weight (g)")
scatter.mean()

Unnamed: 0_level_0,Tumor Volume (mm3)
Weight (g),Unnamed: 1_level_1
15,36.18204
17,37.214133
19,41.182391
20,39.141053
21,42.0887
22,43.28849
23,43.341051
24,44.80581
25,44.062109


In [150]:
weight = [15, 17, 19, 20, 21, 22, 23, 24, 25]
volume = [36.182040, 37.214133, 41.182391, 39.141053, 42.088700, 43.288490, 43.341051, 44.805810, 44.062109]

plt.scatter(weight, volume, marker="o", facecolors="red", edgecolors="black")
plt.ylim(36, 45)
plt.xlim(14.9, 25.9)
plt.title("Capomulim: Mouse Weight vs. Average Tumor Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Volume (mm3)")


<IPython.core.display.Javascript object>

Text(0, 0.5, 'Volume (mm3)')

In [105]:
regression = pd.DataFrame({
    "Weight (g)": [15, 17, 19, 20, 21, 22, 23, 24, 25],
    "Tumor Volume (mm3)": [36.182040, 37.214133, 41.182391, 39.141053, 42.088700, 43.288490, 43.341051, 44.805810, 44.062109]
})

regression



Unnamed: 0,Weight (g),Tumor Volume (mm3)
0,15,36.18204
1,17,37.214133
2,19,41.182391
3,20,39.141053
4,21,42.0887
5,22,43.28849
6,23,43.341051
7,24,44.80581
8,25,44.062109


In [109]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen
from scipy.stats import linregress
from sklearn import datasets

x_values = regression['Weight (g)']
y_values = regression['Tumor Volume (mm3)']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values, y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(20,39.141053),fontsize=15,color="red")
plt.ylim(36, 45)
plt.xlim(14.9, 25.9)
plt.title("Capomulim: Mouse Weight vs. Average Tumor Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Volume (mm3)")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Volume (mm3)')