# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

Three observations from the data

1. Capomulin and Ramicane have lower tumor volumes than Infubinol and Ceftamin, as evidenced by the box plots of final tumor volume.
2. Capomulin appears to reduce tumor volume between days 20 and 40; however, this observation is based on a single mouse (l509).
3. The regression analysis shows a reasonably linear correlation between mouse weight and average tumor volume for the Capomulin regimen.


In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np


# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# "Mouse ID" is the column common to both files, so it is used as the join key.
combined_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview.
# head has to be *called*: the bare attribute `combined_df.head` only
# evaluates to the bound method and never produces the five-row preview.
combined_df.head()


In [None]:
# Checking the number of mice.
# nunique() returns the count of distinct Mouse IDs directly; value_counts()
# would instead list how many rows each individual mouse contributes.
combined_df['Mouse ID'].nunique()

In [None]:
# Find the rows whose (Mouse ID, Timepoint) pair has already appeared earlier
# in the combined data; those rows identify the duplicated mouse.
is_repeated_pair = combined_df.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicateRows = combined_df.loc[is_repeated_pair]
duplicateRows

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# The ID(s) to drop are derived from the data (any mouse with a repeated
# Mouse ID/Timepoint pair) rather than hard-coded as 'g989', so the cell
# keeps working if the input files change.
duplicate_ids = combined_df.loc[
    combined_df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()

# Remove *every* row belonging to a duplicated mouse, not just the repeated rows.
clnd_df = combined_df[~combined_df['Mouse ID'].isin(duplicate_ids)]

In [None]:
# Checking the number of mice in the clean DataFrame.
# nunique() gives the number of distinct Mouse IDs as a single value;
# value_counts() would list per-mouse row counts instead.
clnd_df['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Summary statistics table of tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.
# Approach: compute every statistic as its own Series (indexed by regimen),
# then assemble the Series into one table under readable column labels.
tumor_data = clnd_df.groupby(['Drug Regimen'])
volume_by_regimen = tumor_data['Tumor Volume (mm3)']

tumor_mean = volume_by_regimen.mean()
tumor_med = volume_by_regimen.median()
tumor_var = volume_by_regimen.var()
tumor_std = volume_by_regimen.std()
tumor_sem = volume_by_regimen.sem()

# Labels and Series are listed in the same order so zip() pairs them correctly.
column_labels = ["Mean", "Median", "Variance", "Standard Deviation", "SEM"]
stat_series = [tumor_mean, tumor_med, tumor_var, tumor_std, tumor_sem]

tumordata_df = pd.DataFrame(dict(zip(column_labels, stat_series)))
tumordata_df


In [None]:
# Same summary statistics table (mean, median, variance, standard deviation,
# SEM of tumor volume per regimen), this time produced by one agg() call.
summary_functions = ['mean', 'median', 'var', 'std', 'sem']
regimen_groups = clnd_df.groupby(['Drug Regimen'])

statstab2 = regimen_groups['Tumor Volume (mm3)'].agg(summary_functions)
statstab2



## Bar and Pie Charts

In [None]:
# Bar plot (Pandas plotting API) of the total number of timepoints recorded
# for all mice in each drug regimen, tallest bar first.

# Number of timepoint rows per regimen
intervalstep = clnd_df.groupby(['Drug Regimen'])['Timepoint'].count()

# plot.bar() returns the Axes it drew on; label that Axes directly
timepoint_axes = intervalstep.sort_values(ascending=False).plot.bar()
timepoint_axes.set_ylabel('Number of Timepoints', size=8)
timepoint_axes.set_xlabel('Drug Regimen', size=8)

plt.show()


In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.

# Reuse the per-regimen timepoint counts, ordered from most to fewest
micetotal = intervalstep.sort_values(ascending=False)

plt.bar(micetotal.index, micetotal.values)
plt.ylabel('Number of Timepoints', size=8)  # define y label
plt.xlabel('Drug Regimen', size=8)  # define x label
plt.xticks(rotation='vertical')  # rotate the regimen names so they do not overlap

# show must be called; the bare `plt.show` only references the function
plt.show()


In [None]:
# Pie plot (Pandas plotting API) of the female versus male distribution.

# Row count for each value of the Sex column
Gender = clnd_df.groupby(['Sex'])['Sex'].count()

# Percent labels with one decimal place on an 8x8 inch figure
pie_options = dict(startangle=185, autopct='%1.1f%%', figsize=(8, 8))
Gender.plot.pie(**pie_options)



In [None]:
# Pie plot (pyplot API) of the female versus male distribution.

# Tally the Sex column once and pull out both categories from the result
sex_counts = clnd_df['Sex'].value_counts()
malecount = sex_counts['Male']
femalecount = sex_counts['Female']

# Wedge sizes, labels and colours are listed in the same Male/Female order
gendercount = [malecount, femalecount]
gendernames = ['Male', 'Female']
colors = ['Blue', 'Red']
explode = (0, 0)  # no wedge is pulled away from the centre

plt.pie(
    gendercount,
    explode=explode,
    labels=gendernames,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=0,
)




## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse, later used for four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin.

# Work on a copy so the cleaned DataFrame itself is left untouched
newclnd_df = clnd_df.copy()

# Last (greatest) timepoint recorded for each mouse, indexed by Mouse ID
finltme = newclnd_df.groupby(['Mouse ID'])['Timepoint'].max()
finltme_df = finltme.to_frame()

# Right-join the full data onto the (Mouse ID, final Timepoint) pairs so that
# only the rows taken at each mouse's last timepoint remain, carrying the
# tumor volume measured at that point.
newmerged_df = pd.merge(
    left=newclnd_df,
    right=finltme_df,
    how='right',
    on=('Mouse ID', 'Timepoint'),
)
newmerged_df




In [None]:
# Quartiles, IQR and potential outliers of the final tumor volume for each of
# the four treatments of interest.

# Treatments to analyse, in reporting order (also the order of the plot labels)
treatlist = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volumes per treatment. A boolean mask replaces the row-by-row
# iterrows() loop and keeps the row order of newmerged_df; the index is reset
# so the four Series line up positionally when combined below.
final_volumes = {
    regimen: newmerged_df.loc[
        newmerged_df["Drug Regimen"] == regimen, "Tumor Volume (mm3)"
    ].reset_index(drop=True)
    for regimen in treatlist
}

# Plain lists kept under these names because the box-plot cell reads them.
cap_vol, ram_vol, inf_vol, cef_vol = (
    final_volumes[regimen].tolist() for regimen in treatlist
)

# One column per treatment. Building from Series (rather than raw lists) pads
# shorter columns with NaN instead of raising when group sizes differ.
regimen_df = pd.DataFrame(final_volumes)

# Calculate the IQR and quantitatively determine if there are any potential
# outliers, once per treatment instead of four copy-pasted sections.
for regimen in treatlist:
    # Drop any NaN padding so it cannot be mistaken for a measurement
    volumes = regimen_df[regimen].dropna()

    quartiles = volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    print(f"The lower quartile of {regimen} is: {lowerq}")
    print(f"The upper quartile of {regimen} is: {upperq}")
    print(f"The interquartile range of {regimen} is: {iqr}")
    print(f"The median of {regimen} is: {quartiles[0.5]} ")

    # Bounds at 1.5 * IQR beyond the quartiles
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")

    # Actually list the measurements that fall outside the bounds
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]
    print(f"Potential outliers for {regimen}: {outliers.tolist()}")
    print('-------------------------------------------------')
   


In [None]:
# Box plot showing the distribution of the tumor volume for each treatment group.

# Data and labels are listed in the same treatment order
tumvols = [cap_vol, ram_vol, inf_vol, cef_vol]
boxlabels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

fig1, ax1 = plt.subplots()
ax1.boxplot(tumvols)
ax1.set_title('Tumor Volumes For Drug Regimen')
ax1.set_ylabel('Tumor Volume (mm3)')

# Boxes are drawn at x = 1..4; replace those positions with the treatment names
ax1.set_xticks([1, 2, 3, 4])
ax1.set_xticklabels(boxlabels)

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# All measurements for mouse l509, ordered by time so the line is drawn
# left-to-right regardless of the row order in the source data.
l_plot_df = clnd_df.loc[clnd_df["Mouse ID"] == "l509", :].sort_values("Timepoint")

time_vals = l_plot_df['Timepoint']
vol_vals = l_plot_df['Tumor Volume (mm3)']

plt.plot(time_vals, vol_vals)
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')
# Title spelling corrected ("treatmeant" -> "treatment")
plt.title('Capomulin treatment of mouse l509')
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Rows for mice treated with Capomulin only
mousewt_df = clnd_df.loc[clnd_df["Drug Regimen"] == "Capomulin", :]

# Per-mouse averages. numeric_only=True restricts the mean to numeric columns;
# without it pandas >= 2.0 raises TypeError on the string columns
# ("Drug Regimen", "Sex") that are still present in the frame.
meantumvol = mousewt_df.groupby(['Mouse ID']).mean(numeric_only=True)

# The renamed column is what the correlation/regression cells look up
meantumvol.rename(columns={'Tumor Volume (mm3)': 'Tumor_Volume_(mm3)'}, inplace=True)
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

plt.scatter(meantumvol['Weight (g)'], meantumvol['Tumor_Volume_(mm3)'])


## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression model for mouse weight versus
# average tumor volume (Capomulin regimen).

# Predictor and response, named once and reused below
x_values = meantumvol['Weight (g)']
y_values = meantumvol['Tumor_Volume_(mm3)']

# Pearson r is the first element of the pearsonr result; report it to 2 decimals
corr = round(st.pearsonr(x_values, y_values)[0], 2)
print(f"The correlation between mouse weight and average tumor volume is {corr}")

# Least-squares fit; the individual results are exposed under the names that
# the regression plot cell uses
regmodel = st.linregress(x_values, y_values)
slope = regmodel.slope
intercept = regmodel.intercept
rvalue = regmodel.rvalue
pvalue = regmodel.pvalue
stderr = regmodel.stderr

# Print Summary Table
divider = '-------------------------------'
print()
print('Linear Regression Table')
print(divider)
for label, value in (
    ('Slope :', slope),
    ('Intercept :', intercept),
    ('r-value :', rvalue),
    ('p-value :', pvalue),
    ('stderr :', stderr),
):
    print(label, value)
print(divider)

                                         

In [None]:
# Scatter plot of mouse weight vs. average tumor volume with the fitted
# regression line drawn on top.

# Predicted tumor volume at each observed weight, from the fitted slope/intercept
line = slope * (meantumvol['Weight (g)']) + intercept

plt.scatter(meantumvol['Weight (g)'], meantumvol['Tumor_Volume_(mm3)'])
plt.plot(meantumvol['Weight (g)'], line, 'r', label='y={:.2f}x+{:.2f}'.format(slope, intercept))

# Axis labels corrected ("Weight(g)" spacing, "Tumore" typo) to match the
# labels used on the plain scatter plot
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.legend()
plt.show()
