## Observations and Insights 

In [33]:
%matplotlib notebook
%time
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
from scipy.stats import linregress

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path, low_memory=False)
study_results = pd.read_csv(study_results_path, low_memory=False)

# Combine the data into a single dataset
mouse_study = pd.merge(mouse_metadata, study_results, how='left', on=['Mouse ID','Mouse ID'])
mouse_study.head()

Wall time: 0 ns


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [34]:
# Checking the number of mice in the DataFrame.
mouse_count = mouse_study['Mouse ID'].nunique()
mouse_count

249

In [35]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
mouse_study2 = mouse_study.assign(
    is_duplicate= lambda d: d.duplicated(
    )).sort_values(['Mouse ID','Timepoint']).reset_index(drop=True) 
mouse_study2

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,is_duplicate
0,a203,Infubinol,Female,20,23,0,45.000000,0,False
1,a203,Infubinol,Female,20,23,5,48.508468,0,False
2,a203,Infubinol,Female,20,23,10,51.852437,1,False
3,a203,Infubinol,Female,20,23,15,52.777870,1,False
4,a203,Infubinol,Female,20,23,20,55.173336,1,False
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,False
1889,z969,Naftisol,Male,9,30,30,65.841013,3,False
1890,z969,Naftisol,Male,9,30,35,69.176246,4,False
1891,z969,Naftisol,Male,9,30,40,70.314904,4,False


In [36]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_study[mouse_study.duplicated()]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0


In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_df = mouse_study.drop_duplicates()
mouse_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [6]:
# Checking the number of mice in the clean DataFrame.
mice = mouse_df['Mouse ID'].nunique()
mice #number of mice doesn't change - just the data points, overall...
print(f'There are {mice} mice.')

There are 249 mice.


## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen
# This method is the most straighforward, creating multiple series and putting them all together at the end.

regimen_gb = mouse_df.groupby(['Drug Regimen'])

tumor_mean = regimen_gb['Tumor Volume (mm3)'].mean()
tumor_median = regimen_gb['Tumor Volume (mm3)'].median()
tumor_var = regimen_gb['Tumor Volume (mm3)'].var()
tumor_std = regimen_gb['Tumor Volume (mm3)'].std()
tumor_sem = regimen_gb['Tumor Volume (mm3)'].sem()

mouse_study_df = pd.DataFrame({
                        'Mean': tumor_mean,
                        'Median': tumor_median,
                        'Variance': tumor_var,
                        'Std Dev': tumor_std,
                        'SEM': tumor_sem
                               })
mouse_study_df.style.format("{:,.2f}")
mouse_study_df= mouse_study_df.reset_index()
mouse_study_df

Unnamed: 0,Drug Regimen,Mean,Median,Variance,Std Dev,SEM
0,Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
1,Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
2,Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
3,Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
4,Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
5,Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
6,Propriva,52.368318,50.909965,42.27809,6.50216,0.514041
7,Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
8,Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
9,Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar Plots

In [8]:
# Generate a bar plot showing the number of mice per time point for 
# each treatment throughout the course of the study using pandas. 
plt.figure()
panda_bar = regimen_gb.size().sort_values(ascending=False).plot.bar(rot = 25,
                                                                    color='g', 
                                                                    title='No of Mice per Timepoint By Treatment')
xtickangle=45
panda_bar.set_xlabel("Drug Regimen Tested")
panda_bar.set_ylabel("Number of Time Points")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [9]:
# # Generate a bar plot showing the number of mice per time point for 
# # each treatment throughout the course of the study using pyplot.
plt.figure(figsize=(8.5,4))
axis = mouse_df.groupby(['Drug Regimen']).size().sort_values(ascending=False)
drug_list = axis.index[0:10]
drug_data = np.arange(len(drug_list))
plt.bar(drug_data, axis, color='g', align='center', width=0.5, tick_label=drug_list)
plt.xticks= (drug_data,(drug_list))
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

## Pie Plots

In [10]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas.
by_sex = mouse_df.groupby(['Sex'])
mice_values = by_sex['Mouse ID'].nunique()
sex_df = pd.DataFrame({'Mice ID': mice_values,},index=['Male','Female'])

sex_labels = ['Male', 'Female']
colors = ["green", "lightgreen"]
explode = (0.01, 0)

sex_df.plot(y='Mice ID',kind='pie',title='Gender Distribution',
            labels=sex_labels,autopct='%1.1f%%', colors=colors,explode=explode)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1c7b3a42188>

In [11]:
mice_val = mouse_df.drop_duplicates('Mouse ID')
mice_val2 = mice_val['Sex'].value_counts()

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
plt.figure()
labels_py=['Male','Female']
pyplot_pie = plt.pie(mice_val2, explode=explode, labels=labels_py, autopct='%1.1f%%',
        shadow=True,colors=colors)
plt.axis('equal')
plt.title('Gender Distribution')
plt.show()

<IPython.core.display.Javascript object>

## Quartiles, Outliers and Boxplots

In [32]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. 
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
# TEST FOR SURVIVING MICE?
mouse_df = mouse_df.sort_values(by = 'Tumor Volume (mm3)', ascending=True)
mouse_df2 = mouse_df[['Mouse ID','Drug Regimen','Timepoint','Tumor Volume (mm3)']]
test = mouse_df2.groupby(['Mouse ID']).max()
test=test.reset_index()
test

Wall time: 0 ns


In [14]:
test_mrg=test[['Mouse ID','Timepoint']].merge(mouse_df2,on=['Mouse ID','Timepoint'],how='left')
test_mrg

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Tumor Volume (mm3)
0,a203,45,Infubinol,67.973419
1,a251,45,Infubinol,65.525743
2,a262,45,Placebo,70.717621
3,a275,45,Ceftamin,62.999356
4,a366,30,Stelasyn,63.440686
...,...,...,...,...
244,z435,10,Propriva,48.710661
245,z578,45,Ramicane,30.638696
246,z581,45,Infubinol,62.754451
247,z795,45,Naftisol,65.741070


In [15]:
cap_test = test_mrg.loc[(test_mrg['Drug Regimen'] == 'Capomulin')]
cap_test

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Tumor Volume (mm3)
19,b128,45,Capomulin,38.982878
24,b742,45,Capomulin,38.939633
61,f966,20,Capomulin,30.485985
64,g288,45,Capomulin,37.074024
66,g316,45,Capomulin,40.15922
81,i557,45,Capomulin,47.685963
84,i738,45,Capomulin,37.311846
86,j119,45,Capomulin,38.125164
88,j246,35,Capomulin,38.753265
109,l509,45,Capomulin,41.483008


In [16]:
ram_test = test_mrg.loc[(test_mrg['Drug Regimen'] == 'Ramicane')]
ram_test.head()

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Tumor Volume (mm3)
6,a411,45,Ramicane,38.407618
7,a444,45,Ramicane,43.047543
10,a520,45,Ramicane,38.810366
12,a644,45,Ramicane,32.978522
33,c458,30,Ramicane,38.342008


In [17]:
ket_test = test_mrg.loc[(test_mrg['Drug Regimen'] == 'Ketapril')]
ket_test.head()

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Tumor Volume (mm3)
8,a457,10,Ketapril,49.783419
35,c580,30,Ketapril,58.046569
39,c819,40,Ketapril,62.175705
40,c832,45,Ketapril,65.415955
46,d474,40,Ketapril,60.233733


In [18]:
naf_test = test_mrg.loc[(test_mrg['Drug Regimen'] == 'Naftisol')]
naf_test.head()

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Tumor Volume (mm3)
16,a818,45,Naftisol,74.997764
23,b559,45,Naftisol,73.051363
47,e213,45,Naftisol,63.105696
49,e291,25,Naftisol,55.138953
51,e584,45,Naftisol,68.359777


In [19]:
quartiles = cap_test['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"Capomulin Test:")
print(f"The lower quartile for is: {lowerq}")
print(f"The upper quartile is: {upperq}")
print(f"The interquartile range is: {iqr}")
print(f"The the median is: {quartiles[0.5]}")

Capomulin Test:
The lower quartile for is: 32.37735684
The upper quartile is: 40.1592203
The interquartile range is: 7.781863460000004
The the median is: 38.125164399999996


In [20]:
quartiles = ram_test['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"Ramicane Test:")
print(f"The lower quartile is: {lowerq}")
print(f"The upper quartile is: {upperq}")
print(f"The interquartile range is: {iqr}")
print(f"The the median is: {quartiles[0.5]}")

Ramicane Test:
The lower quartile is: 31.56046955
The upper quartile is: 40.65900627
The interquartile range is: 9.098536719999998
The the median is: 36.56165229


In [21]:
quartiles = ket_test['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"Ketapril Test:")
print(f"The lower quartile is: {lowerq}")
print(f"The upper quartile is: {upperq}")
print(f"The interquartile range is: {iqr}")
print(f"The the median is: {quartiles[0.5]}")

Ketapril Test:
The lower quartile is: 56.72009545
The upper quartile is: 69.87225079
The interquartile range is: 13.152155339999993
The the median is: 64.48781246


In [22]:
quartiles = naf_test['Tumor Volume (mm3)'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"Naftisol Test:")
print(f"The lower quartile is: {lowerq}")
print(f"The upper quartile is: {upperq}")
print(f"The interquartile range is: {iqr}")
print(f"The the median is: {quartiles[0.5]}")

Naftisol Test:
The lower quartile is: 52.07951009
The upper quartile is: 69.56362076
The interquartile range is: 17.484110670000007
The the median is: 63.28328821


In [23]:
plt.figure()
plt.boxplot([cap_test['Tumor Volume (mm3)'], 
             ram_test['Tumor Volume (mm3)'],
             ket_test['Tumor Volume (mm3)'],
             naf_test['Tumor Volume (mm3)']],
            labels=['Capomulin','Ramicane','Ketapril','Naftisol'])
plt.show()

<IPython.core.display.Javascript object>

## Line and Scatter Plots

In [24]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
plt.figure()
mouse_df2 = mouse_df2.sort_values(by=['Timepoint'])
mouse_line = mouse_df2.loc[(mouse_df2['Mouse ID'] == 'l509') & (mouse_df2['Drug Regimen'] == 'Capomulin')]
mouse_line
x_axis = np.arange(0,50,5)
y_axis = mouse_line['Tumor Volume (mm3)']
mouse_line_chart, = plt.plot(x_axis, y_axis, marker=".",color="blue", linewidth=1, label="Mouse ID l509")
plt.legend(handles=[mouse_line_chart], loc="best")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Tumor Volume (mm3)')

In [25]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
rng = np.random.RandomState(0)
m_scatter = mouse_df.loc[(mouse_df['Drug Regimen'] == 'Capomulin')]
ave_tumor = m_scatter.groupby('Mouse ID')['Tumor Volume (mm3)'].mean().to_frame()
m_weights = m_scatter[['Mouse ID','Weight (g)']]
scatter_merge =  ave_tumor.merge(m_weights,on=['Mouse ID'],how='inner')
scatter_fin = scatter_merge.drop_duplicates()

fig, ax1 = plt.subplots()
ax1.set_ylabel('Ave Tumor Volume (mm3)')
ax1.set_xlabel('Mouse Weight (g)')
colors = 'green'
plt.scatter(scatter_fin['Weight (g)'], scatter_fin['Tumor Volume (mm3)'], c=colors, alpha=0.75,
            cmap='viridis')
plt.title('Mouse Weight v Average Tumor Volume')
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

## Correlation and Regression

In [26]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

In [27]:
weight = scatter_fin.iloc[:,2]
ave_tumor_size = scatter_fin.iloc[:,1]
corr = st.pearsonr(weight,ave_tumor_size)
print(f"The correlation between both factors is {round(corr[0],2)}")

The correlation between both factors is 0.84


In [28]:
plt.figure()
x_val = scatter_fin['Weight (g)']
y_val = scatter_fin['Tumor Volume (mm3)']

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_val, y_val)
regress_values = x_val * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_val,y_val)
plt.plot(x_val,regress_values,"r-")
plt.annotate(line_eq,(0,50),fontsize=15,color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
print(f"The r-squared is: {rvalue**2}")
plt.show()

<IPython.core.display.Javascript object>

The r-squared is: 0.708856804770873


[]

Wall time: 0 ns
