# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two raw study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata table and the study results table
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the metadata onto the study results by "Mouse ID", so every
# study-results row is kept and gains the matching metadata columns
combined = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Display the data table for preview
combined.head()


In [None]:
# Count the distinct Mouse ID values present in the merged table.
combined.loc[:, "Mouse ID"].nunique()


In [None]:
# Each (Mouse ID, Timepoint) pair is expected to appear only once.
# Flag every row that repeats an earlier pair, then collect the distinct
# Mouse IDs those flagged rows belong to.
repeated_pair_mask = combined.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate = combined.loc[repeated_pair_mask, "Mouse ID"].unique()
duplicate



In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [None]:
# Build the clean DataFrame: keep only the rows whose Mouse ID is not
# among the IDs stored in `duplicate`.
is_duplicate_mouse = combined["Mouse ID"].isin(duplicate)
clean = combined.loc[~is_duplicate_mouse]
clean


In [None]:
# Count the distinct Mouse ID values left in the clean DataFrame.
clean.loc[:, "Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Mean tumor volume for each drug regimen
(
    clean
    .groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .mean()
)

In [None]:
# Summary statistics of tumor volume per drug regimen: mean, median,
# variance, standard deviation, and standard error of the mean (SEM).

# Group once and reuse the grouped tumor-volume column for every statistic.
tumor_by_regimen = clean.groupby("Drug Regimen")["Tumor Volume (mm3)"]
means = tumor_by_regimen.mean()
medians = tumor_by_regimen.median()
variances = tumor_by_regimen.var()
stds = tumor_by_regimen.std()
sems = tumor_by_regimen.sem()

# Assemble the five per-regimen series into a single table; each dict key
# becomes a column heading.
summary_columns = {
    "Mean Tumor Volume": means,
    "Median Tumor Volume": medians,
    "Tumor Volume Variance": variances,
    "Tumor Volume Std. Dev.": stds,
    "Tumor Volume Std. Err.": sems,
}
summary1 = pd.DataFrame(summary_columns)
summary1


In [None]:
# Alternative route to the same summary table (only one of the two methods
# is required): compute mean, median, variance, standard deviation, and SEM
# of tumor volume per regimen with a single aggregation call.

# Maps each aggregation name to the column heading it should carry; the key
# order also fixes the column order of the result.
stat_labels = {
    'mean': 'Mean Tumor Volume',
    'median': 'Median Tumor Volume',
    'var': 'Tumor Volume Variance',
    'std': 'Tumor Volume Std. Dev.',
    'sem': 'Tumor Volume Std. Err.',
}
summary1 = (
    clean.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .agg(list(stat_labels))
    .rename(columns=stat_labels)
)
summary1

## Bar and Pie Charts

In [None]:
# Bar chart, drawn with Pandas plotting, of the total number of rows
# (Mouse ID/Timepoints) recorded for each drug regimen.
import matplotlib.pyplot as plt

# Tally the rows belonging to each regimen
drug_regimen_counts = clean['Drug Regimen'].value_counts()

# Draw the bars through the Series' own plotting accessor
drug_regimen_counts.plot.bar(figsize=(10, 6), color='blue')

# Axis labels and title
plt.xlabel('Drug Regimen')
plt.ylabel("# of observed mouse time points")
plt.title('Total Number of Rows for Each Drug Regimen')

# Tilt the regimen names so they stay readable, then tighten the layout so
# the labels are not clipped
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
import matplotlib.pyplot as plt

# Count the occurrences of each drug regimen
drug_regimen_counts = clean['Drug Regimen'].value_counts()

# Draw the bars with pyplot itself (plt.bar) rather than the pandas
# .plot wrapper, so this cell really is the pyplot counterpart of the
# Pandas-based chart: regimen names on the x-axis, row counts as heights.
plt.figure(figsize=(10, 6))
plt.bar(drug_regimen_counts.index, drug_regimen_counts.values, color='blue')

# Add labels and title
plt.title('Total Number of Rows for Each Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Rows')

# Show plot
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent clipping of labels
plt.show()

In [None]:
# Pie chart, drawn with Pandas plotting, of how the rows of the clean data
# split between the values of the 'Sex' column.
gender_counts = clean['Sex'].value_counts()

# Draw the pie through the Series' own plotting accessor, labelling each
# wedge with its percentage
gender_counts.plot.pie(figsize=(8, 8), autopct='%1.1f%%', colors=['lightblue', 'lightpink'])

# Blank out the y-axis label and set the title
plt.ylabel('')
plt.title('Distribution of Female vs. Male Mice')

# Equal axis scaling keeps the pie circular
plt.axis('equal')
plt.show()

In [None]:
# Pie chart, drawn with pyplot, of the distribution of female versus male
# mice in the clean data.

# Tally the rows for each value of the 'Sex' column
gender_counts = clean['Sex'].value_counts()

# Split the tally into wedge labels and wedge sizes
gender_labels, gender_values = gender_counts.index, gender_counts.values

# Draw the pie with a percentage printed on every wedge
plt.figure(figsize=(8, 8))
plt.pie(gender_values, labels=gender_labels, autopct='%1.1f%%', colors=['blue', 'orange'])

# Place a rotated 'Sex' caption at the left edge of the pie; this acts as a
# side label rather than a title
plt.text(-1.1, 0, 'Sex', fontsize=16, rotation=90, verticalalignment='center')

# Equal axis scaling keeps the pie circular
plt.axis('equal')
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume per mouse for four selected regimens.

# 1. Greatest timepoint recorded for each mouse, as a two-column table
last_timepoint = clean.groupby('Mouse ID')['Timepoint'].max().reset_index()

# 2. Join back to the clean data on (Mouse ID, Timepoint) to pull in the
#    measurements taken at that last timepoint
merged_df = last_timepoint.merge(clean, on=['Mouse ID', 'Timepoint'], how='left')

# 3. Keep only the rows for the regimens of interest
selected_regimens = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
in_selected_regimen = merged_df['Drug Regimen'].isin(selected_regimens)
final_tumor_volume_df = merged_df[in_selected_regimen]

# 4. Narrow down to the columns needed for the later analysis
final_tumor_volume = final_tumor_volume_df[['Mouse ID', 'Drug Regimen', 'Tumor Volume (mm3)']]

# Show the result
print(final_tumor_volume)


In [None]:
# Treatments to analyse; the same list later supplies the plot labels
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Collects one Series of final tumor volumes per treatment (for plotting)
tumor_volume_data = []

# For every treatment: compute the quartiles and IQR, derive the 1.5*IQR
# bounds, and report any rows falling outside them as potential outliers
for treatment in treatments:
    # Rows of the final-tumor-volume table belonging to this treatment
    treatment_df = final_tumor_volume.loc[final_tumor_volume['Drug Regimen'] == treatment]
    tumor_volume = treatment_df['Tumor Volume (mm3)']
    tumor_volume_data.append(tumor_volume)

    # Quartiles and interquartile range
    quartiles = tumor_volume.quantile([0.25, 0.5, 0.75])
    lower_quartile, upper_quartile = quartiles[0.25], quartiles[0.75]
    iqr = upper_quartile - lower_quartile

    # Bounds sit 1.5 * IQR beyond the lower and upper quartiles
    lower_bound = lower_quartile - (1.5 * iqr)
    upper_bound = upper_quartile + (1.5 * iqr)

    # Anything strictly outside the bounds is a potential outlier
    below_lower = tumor_volume < lower_bound
    above_upper = tumor_volume > upper_bound
    outliers = treatment_df[below_lower | above_upper]

    # Report, one item per line; the last item ends with a blank line
    print(
        f"Results for {treatment}:",
        f"IQR: {iqr}",
        f"Lower Quartile: {lower_quartile}",
        f"Upper Quartile: {upper_quartile}",
        f"Lower Bound: {lower_bound}",
        f"Upper Bound: {upper_bound}",
        f"Potential Outliers: {outliers}\n",
        sep="\n",
    )


In [None]:
# Box plot showing the distribution of the tumor volume for each treatment group.
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))

# One box per entry of tumor_volume_data, labelled with the treatment name
# at the same position in `treatments`.
# NOTE(review): newer matplotlib releases rename the `labels` keyword to
# `tick_labels` — confirm against the installed version.
plt.boxplot(tumor_volume_data, labels=treatments)

# Axis labels, title and grid
plt.xlabel('Treatment Regimen')
plt.ylabel('Tumor Volume (mm3)')
plt.title('Distribution of Tumor Volume for Each Treatment Group')
plt.grid(True)
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
# Step 1: Filter the DataFrame to include only data for mice treated with Capomulin
capomulin_data = clean[clean['Drug Regimen'] == 'Capomulin']

# Step 2: Select data for a single mouse from the filtered DataFrame.
# The mouse used is whichever Mouse ID appears in the first Capomulin row.
mouse_id = capomulin_data['Mouse ID'].iloc[0]
single_mouse_data = capomulin_data[capomulin_data['Mouse ID'] == mouse_id]

# Step 3: Plot tumor volume vs. time point for the selected mouse
plt.figure(figsize=(10, 6))
plt.plot(single_mouse_data['Timepoint'], single_mouse_data['Tumor Volume (mm3)'], marker='o', linestyle='-', color='b')
# Interpolate the selected ID into the title so the caption always names
# the mouse that is actually plotted (it was previously a fixed string).
plt.title(f'Capomulin treatment of mouse {mouse_id}')
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
plt.grid(True)
plt.show()

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
# Step 1: Group the Capomulin data by mouse ID to calculate the average tumor volume for each mouse
capomulin_grouped = capomulin_data.groupby('Mouse ID')

# Step 2: Calculate the average tumor volume for each mouse
average_tumor_volume = capomulin_grouped['Tumor Volume (mm3)'].mean()

# Step 3: Merge the average tumor volume data with the mouse weight data.
# `clean` holds one row per mouse per timepoint, so reduce the weight lookup
# to a single row per Mouse ID first. Merging against the raw rows would
# repeat every mouse once per timepoint, stacking identical points on the
# scatter and weighting any later statistics by each mouse's row count.
mouse_weights = clean[['Mouse ID', 'Weight (g)']].drop_duplicates(subset='Mouse ID')
capomulin_avg_tumor_volume_df = pd.merge(average_tumor_volume, mouse_weights, on='Mouse ID', how='left')

# Step 4: Plot mouse weight vs. the average tumor volume
plt.figure(figsize=(10, 6))
plt.scatter(capomulin_avg_tumor_volume_df['Weight (g)'], capomulin_avg_tumor_volume_df['Tumor Volume (mm3)'], color='blue')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.grid(True)
plt.show()

## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression model for mouse weight
# versus average observed tumor volume over the entire Capomulin regimen
from scipy.stats import linregress, pearsonr
import matplotlib.pyplot as plt

# The two columns being related: x = weight, y = average tumor volume
weights = capomulin_avg_tumor_volume_df['Weight (g)']
avg_volumes = capomulin_avg_tumor_volume_df['Tumor Volume (mm3)']

# Pearson correlation between the two columns (the p-value is discarded)
correlation_coefficient, _ = pearsonr(weights, avg_volumes)
print("Correlation Coefficient:", correlation_coefficient)

# Least-squares line through the same points
slope, intercept, r_value, p_value, std_err = linregress(weights, avg_volumes)
fitted_volumes = slope * weights + intercept

# Scatter of the data with the fitted line drawn over it
plt.figure(figsize=(10, 6))
plt.scatter(weights, avg_volumes, color='blue', label='Data')
plt.plot(weights, fitted_volumes, color='red', label='Linear Regression')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.title('Mouse Weight vs. Average Tumor Volume (Capomulin Regimen)')
plt.legend()
plt.grid(True)
plt.show()