# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two input CSV files, relative to the notebook
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata first, then the study measurements
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the metadata onto the study results: every study row is kept
# and the join key is whatever column(s) the two tables have in common
data_merged = study_results.merge(mouse_metadata, how="left")

# Preview the first rows of the combined table
data_merged.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/Mouse_metadata.csv'

In [None]:
# Count the distinct Mouse IDs present in the merged data.
unique_mouse_count = data_merged["Mouse ID"].nunique()
unique_mouse_count

In [None]:
# Each row should be uniquely identified by its (Mouse ID, Timepoint) pair.
# Flag every row whose pair has already appeared earlier in the table
# (duplicated() marks the second and later occurrences by default).
dup_data = data_merged.duplicated(subset=["Mouse ID", "Timepoint"])

# Distinct IDs of the mice that own at least one flagged row.
dup_ids = data_merged.loc[dup_data, "Mouse ID"].unique()

# Plain Python list with exactly one entry per duplicated Mouse ID.
dup_mice = dup_ids.tolist()

print(dup_mice)

In [None]:
# Optional: show every row recorded for the duplicate mouse ID.
is_duplicate_mouse = data_merged['Mouse ID'] == 'g989'
data_merged.loc[is_duplicate_mouse]

In [None]:
# Build the clean DataFrame by removing every row that belongs to the
# duplicate mouse (all of its rows are discarded, not only the repeats).
rows_for_g989 = data_merged['Mouse ID'] == 'g989'
data_complete = data_merged.loc[~rows_for_g989]

# Preview the cleaned table
data_complete.head()


In [None]:
# Count the distinct Mouse IDs remaining in the clean DataFrame.
clean_mouse_count = data_complete["Mouse ID"].nunique()
clean_mouse_count

## Summary Statistics

In [None]:
# Summary statistics table: mean, median, variance, standard deviation and
# SEM of the tumor volume for each drug regimen.

# Display title for each aggregate column
stat_column_titles = {
    'mean': 'Mean Tumor Volume',
    'median': 'Median Tumor Volume',
    'var': 'Tumor Volume Variance',
    'std': 'Tumor Volume Std. Dev.',
    'sem': 'Tumor Volume Std. Err.',
}

# Tumor volumes grouped by regimen; one aggregate column per statistic,
# then relabelled with the display titles above.
tumor_volume_by_regimen = data_complete.groupby('Drug Regimen')['Tumor Volume (mm3)']
summ_stats = tumor_volume_by_regimen.agg(['mean', 'median', 'var', 'std', 'sem'])
summ_stats = summ_stats.rename(columns=stat_column_titles)
summ_stats

In [None]:
# Alternative summary table (only one method is required in the solution):
# the same five statistics of tumor volume per regimen, produced by a single
# aggregation call and left with the default statistic names as columns.
summary_statistics = ['mean', 'median', 'var', 'std', 'sem']
summ_stats_agg = (
    data_complete
    .groupby('Drug Regimen')['Tumor Volume (mm3)']
    .agg(summary_statistics)
)
summ_stats_agg

## Bar and Pie Charts

In [None]:
# Bar plot (drawn through Pandas) of the total number of rows
# (Mouse ID/Timepoints) recorded for each drug regimen.

# Rows per regimen, ordered from the most to the fewest observations
rows_per_regimen = data_complete.groupby('Drug Regimen').size()
regimen_counts = rows_per_regimen.sort_values(ascending=False)

regimen_counts.plot.bar(color='blue')

# Axis titles are applied to the axes Pandas just drew on
plt.xlabel('Drug Regimen')
plt.ylabel('# of Observed Mouse Timepoints')

plt.show()

In [None]:
# Bar plot (drawn through pyplot) of the total number of rows
# (Mouse ID/Timepoints) recorded for each drug regimen.
regimen_counts2 = data_complete['Drug Regimen'].value_counts()

# Regimen names go on the x axis, their row totals set the bar heights
regimen_names = regimen_counts2.index
row_totals = regimen_counts2.values
plt.bar(regimen_names, row_totals)

# Turn the tick labels vertical, then title both axes
plt.xticks(rotation=90)
plt.xlabel('Drug Regimen')
plt.ylabel('# of Observed Mouse Timepoints')
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
gender_distribution = data_complete['Sex'].value_counts()

# Draw through the Series' own plotting interface (the pyplot variant lives
# in the next cell). Wedge labels come from the Series index, and the call
# returns the Matplotlib Axes it drew on.
pie_ax = gender_distribution.plot(kind='pie', autopct='%1.1f%%')

# Pandas titles the y axis with the Series name by default; set it
# explicitly so the label always reads 'Sex'.
pie_ax.set_ylabel('Sex', rotation=90)
plt.show()

In [None]:
# Pie plot (drawn through pyplot) of the female versus male distribution,
# reusing the per-sex counts held in gender_distribution.
sex_labels = gender_distribution.index
sex_counts = gender_distribution.values

plt.pie(sex_counts, labels=sex_labels, autopct='%1.1f%%')
plt.ylabel('Sex', rotation=90)
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse. The four regimens of interest
# (Capomulin, Ramicane, Infubinol, Ceftamin) are filtered in a later step.

# Last (greatest) timepoint recorded for every mouse, as a two-column table
last_timepoint_per_mouse = data_complete.groupby('Mouse ID')['Timepoint'].max()
max_timepoint = last_timepoint_per_mouse.reset_index()

# Keep only the rows of the clean data that match a mouse's last timepoint,
# which yields the tumor volume at that timepoint
data_comp_max = data_complete.merge(max_timepoint, on=['Mouse ID', 'Timepoint'])
data_comp_max.head()

In [None]:
# Treatments to analyse (used by the loop below and later as plot labels)
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
# Dict mapping each treatment to its final tumor volumes (for plotting)
tumor_vol_data = {}

# For every treatment, compute the IQR and report any potential outliers.
for treatment in treatments:

    # Final tumor volumes of the mice that received this drug
    on_this_drug = data_comp_max['Drug Regimen'] == treatment
    mice_per_drug = data_comp_max.loc[on_this_drug, 'Tumor Volume (mm3)']

    # Store the subset under its treatment name
    tumor_vol_data[treatment] = mice_per_drug

    # First and third quartiles, and the spread between them
    lower_q = mice_per_drug.quantile(0.25)
    upper_q = mice_per_drug.quantile(0.75)
    iqr = upper_q - lower_q

    # Fences sit 1.5 IQRs beyond the quartiles
    lower_bound = lower_q - 1.5 * iqr
    upper_bound = upper_q + 1.5 * iqr

    # A potential outlier lies strictly outside either fence
    below_fence = mice_per_drug < lower_bound
    above_fence = mice_per_drug > upper_bound
    outliers = mice_per_drug.loc[below_fence | above_fence]

    print(f"{treatment}'s potential outliers: {outliers}")


In [None]:
# Box plot showing the distribution of the final tumor volume for each
# treatment group, one box per treatment in the order given by `treatments`.
# The per-treatment volumes were stored in tumor_vol_data by the outlier cell.
volumes_by_treatment = [tumor_vol_data[treatment] for treatment in treatments]

plt.boxplot(volumes_by_treatment, labels=treatments)
plt.ylabel('Final Tumor Volume (mm3)')
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model 
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
