# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two input CSV files
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Load the per-mouse metadata and the per-timepoint study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the study results onto the mouse metadata, keyed on "Mouse ID"
Pymamceuticals_Data = mouse_metadata.merge(study_results, how="left", on="Mouse ID")

# Preview the first rows with the columns arranged in a reader-friendly order
preview_columns = [
    "Mouse ID",
    "Timepoint",
    "Tumor Volume (mm3)",
    "Metastatic Sites",
    "Drug Regimen",
    "Sex",
    "Age_months",
    "Weight (g)",
]
Pymamceuticals_Data[preview_columns].head()

In [None]:
# Checking the number of mice.
# The count is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
unique_mouse_count = Pymamceuticals_Data["Mouse ID"].nunique()
print(f"Number of Unique Mice: {unique_mouse_count}")

In [None]:
# Each row should be uniquely identified by its (Mouse ID, Timepoint) pair.
# Flag every row whose pair has already appeared earlier in the table, then
# keep only those repeated rows for inspection.
repeated_pair_mask = Pymamceuticals_Data.duplicated(subset=["Mouse ID", "Timepoint"])
duplicates = Pymamceuticals_Data[repeated_pair_mask]
duplicates

In [None]:
# Show every record (all columns) recorded for mouse "g989".
is_g989 = Pymamceuticals_Data["Mouse ID"] == "g989"
dup_mice = Pymamceuticals_Data.loc[is_g989, :]
dup_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID.


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Every row belonging to mouse "g989" is excluded, not only its repeated timepoints.
rows_to_keep = Pymamceuticals_Data["Mouse ID"] != "g989"
clean_study_df = Pymamceuticals_Data.loc[rows_to_keep]
clean_study_df

In [None]:
# Checking the number of mice in the clean DataFrame.
# The count is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
clean_mouse_count = clean_study_df["Mouse ID"].nunique()
print(f"Number of Unique Mice: {clean_mouse_count}")

## Summary Statistics

In [None]:
# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and standard error of the mean (SEM).

# Group the tumor volumes by regimen once and reuse the grouping for every statistic.
tumor_by_regimen = clean_study_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

mean_tumor = tumor_by_regimen.mean()
median_tumor = tumor_by_regimen.median()
variance_tumor = tumor_by_regimen.var()
stdev_tumor = tumor_by_regimen.std()
sem_tumor = tumor_by_regimen.sem()

# Assemble the individual series into one table, one column per statistic.
summary_columns = {
    "Mean Tumor Volume": mean_tumor,
    "Median Tumor Volume": median_tumor,
    "Tumor Volume Variance": variance_tumor,
    "Tumor Volume Std. Dev.": stdev_tumor,
    "Tumor Volume Std. Err.": sem_tumor,
}
summary_stats_df = pd.DataFrame(summary_columns)

summary_stats_df

In [None]:
# Alternative: build the same per-regimen summary (mean, median, variance,
# standard deviation, SEM of tumor volume) with a single aggregation call.
# Named aggregation sets the output column names; reset_index turns the
# "Drug Regimen" group key back into an ordinary column.
summary_stats_df2 = (
    clean_study_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .agg(
        Mean="mean",
        Median="median",
        Variance="var",
        Std_Dev="std",
        SEM="sem",
    )
    .reset_index()
)
summary_stats_df2

## Bar and Pie Charts

In [None]:
# Bar plot (Pandas plotting) of the total number of rows (Mouse ID/Timepoints)
# recorded for each drug regimen.

# Row count per regimen, ordered from most to fewest observations.
regimen_row_counts = clean_study_df["Drug Regimen"].value_counts()

# Series.plot returns the Axes the bars were drawn on.
mice_per_drug = regimen_row_counts.plot(kind="bar", figsize=(9.5, 7.5))

# Axis labels and title
mice_per_drug.set_xlabel("Drug Regimen", size=15)
mice_per_drug.set_ylabel("# of Observed Mice", size=15)
mice_per_drug.set_title("Number of Mice per Drug Regimen", size=22)

# Leave 20 units of headroom above the tallest bar.
mice_per_drug.set_ylim(0, regimen_row_counts.max() + 20)

# Tick label sizes
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Leave the Axes as the cell's final expression
mice_per_drug

In [None]:
# Bar plot (pyplot interface) of the total number of rows (Mouse ID/Timepoints)
# recorded for each drug regimen.
mice_per_drug = clean_study_df["Drug Regimen"].value_counts()

plt.figure(figsize=(9.5, 7.5))

# Regimen names go on the x-axis, their row counts become the bar heights.
regimen_names = mice_per_drug.index
row_counts = mice_per_drug.values
plt.bar(regimen_names, row_counts)

# Title and axis labels
plt.title("Number of Mice per Drug Regimen", size=20)
plt.xlabel("Drug Regimen", size=16)
plt.ylabel("# of Observed Mice", size=12)

# Leave 20 units of headroom above the tallest bar.
plt.ylim(0, mice_per_drug.max() + 20)

# Vertical regimen names so the tick labels do not overlap.
plt.xticks(rotation="vertical", fontsize=10)
plt.yticks(fontsize=10)

plt.show()

In [None]:
# Generate a pie chart, using Pandas, showing the distribution of unique female versus male mice used in the study

# Get the unique mice with their gender.
# clean_study_df holds one row per mouse per timepoint, so counting "Sex" on it
# directly would weight each mouse by its number of observations. Keeping a
# single row per Mouse ID makes every mouse count exactly once.
unique_mice = clean_study_df.drop_duplicates(subset="Mouse ID")
gender_distribution = unique_mice["Sex"].value_counts()

# Make the pie chart.
# The data is already a Series, so no `y` column selector is passed.
plt.figure()
figure = gender_distribution.plot(
    kind="pie",
    title="Gender Distribution",
    autopct="%1.1f%%",
    figsize=(8, 6),
    labels=gender_distribution.index,
    colors=["blue", "red"],
)
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')

plt.show()

In [None]:
# Generate a pie chart, using pyplot, showing the distribution of unique female versus male mice used in the study

# Get the unique mice with their gender.
# Computed here (rather than reusing a value from another cell) so the chart
# always reflects one row per Mouse ID: clean_study_df has one row per mouse
# per timepoint, and counting it directly would count observations, not mice.
gender_distribution = (
    clean_study_df.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()
)

colors = ["blue", "red"]
labels = gender_distribution.index

# Make the pie chart
plt.figure(figsize=(10, 8))
plt.pie(gender_distribution, labels=labels, colors=colors, autopct="%1.1f%%")
plt.title("Distribution of Gender in Test")
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# The four treatment regimens of interest
treatment_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Rows belonging to those four regimens (not used by the merge below).
in_selected_regimens = clean_study_df["Drug Regimen"].isin(treatment_regimens)
tumor_volume_regimen = clean_study_df[in_selected_regimens]

# Start by getting the last (greatest) timepoint for each mouse
last_timepoint_by_mouse = clean_study_df.groupby(["Mouse ID"])["Timepoint"].max()
greatest_timepoint = last_timepoint_by_mouse.reset_index()

# Merge back onto the cleaned data so that each mouse keeps only the row
# recorded at its last timepoint, which carries the final tumor volume.
final_tumor_volume_df = pd.merge(
    greatest_timepoint,
    clean_study_df,
    on=["Mouse ID", "Timepoint"],
    how="left",
)
final_tumor_volume_df


In [None]:
# Regimens to compare
treatment_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# One Series of final tumor volumes per regimen, collected for plotting
tumor_vol_data = []

for regimen in treatment_regimens:
    # Final tumor volumes of the mice treated with this regimen
    on_this_regimen = final_tumor_volume_df["Drug Regimen"] == regimen
    current_regimen_data = final_tumor_volume_df.loc[on_this_regimen, "Tumor Volume (mm3)"]
    tumor_vol_data.append(current_regimen_data)

    # First and third quartiles, and the interquartile range between them
    quartiles = current_regimen_data.quantile([0.25, 0.75])
    lower_quartile, upper_quartile = quartiles[0.25], quartiles[0.75]
    iqr = upper_quartile - lower_quartile

    # Values more than 1.5 * IQR outside the quartiles are potential outliers
    lower_bound = lower_quartile - 1.5 * iqr
    upper_bound = upper_quartile + 1.5 * iqr
    below_range = current_regimen_data < lower_bound
    above_range = current_regimen_data > upper_bound
    outliers = current_regimen_data[below_range | above_range]

    print(f"{regimen}'s potential outliers:{outliers}")

In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.

# Marker style for outlier points (fliers): large and filled red so they stand out.
# It must be defined before the plot call and passed via `flierprops`; otherwise
# the styling is never applied.
red_out = dict(markerfacecolor="red", markersize=14)

plt.figure()
plt.boxplot(tumor_vol_data, labels=treatment_regimens, flierprops=red_out)
plt.ylabel("Final Tumor Volume (mm3)")
plt.show()

## Line and Scatter Plots

## Correlation and Regression

In [132]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen


NameError: name 'stats' is not defined