In [None]:
! pip install sweetviz

In [None]:
! pip install openpyxl

# 1. Read in the GDSC Dataset

In [None]:
import pandas as pd

# Read the GDSC workbook (path is relative to the working directory) into a DataFrame.
gdsc_path = 'gdsc.xlsx'
gdsc = pd.read_excel(gdsc_path)

In [None]:
# Columns excluded from the trimmed analysis frame (each label listed exactly once).
columns_to_drop = ['NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID', 'SANGER_MODEL_ID', 'COMPANY_ID', 'RMSE', 'DATASET', 'WEBRELEASE']
# drop() returns a new frame by default, so the raw `gdsc` frame is left untouched.
gdsc_alt = gdsc.drop(columns=columns_to_drop)


# 2. Explore Generally using SweetViz

In [None]:
# Render the raw GDSC frame as this cell's output.
gdsc

In [None]:
import sweetviz as sv
# Optional: uncomment the two lines below to build the SweetViz report for the raw
# `gdsc` frame and write it to 'gdsc.html'.
# analysis = sv.analyze(gdsc)
# analysis.show_html('gdsc.html', open_browser=True, layout='widescreen')

In [None]:
# Columns excluded from the trimmed analysis frame (each label listed exactly once).
columns_to_drop = ['NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID', 'SANGER_MODEL_ID', 'COMPANY_ID', 'RMSE', 'DATASET', 'WEBRELEASE']
# drop() returns a new frame by default, so the raw `gdsc` frame is left untouched.
gdsc_alt = gdsc.drop(columns=columns_to_drop)
# Optional: uncomment the two lines below to build the SweetViz report for the trimmed
# frame and write it to 'gdsc_alt.html'.
# analysis = sv.analyze(gdsc_alt)
# analysis.show_html('gdsc_alt.html', open_browser=True, layout='widescreen')

# 3. Explore Specifically using Pandas and Generate Insights

In [None]:
! pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Render the trimmed frame as this cell's output.
gdsc_alt

## Basic Summary Statistics

In [None]:
# Row count of the trimmed frame.
total_entries = len(gdsc_alt)
total_entries


In [None]:
# Number of distinct, non-missing drug names in the trimmed frame.
unique_drugs = gdsc_alt.loc[:, "DRUG_NAME"].nunique(dropna=True)
unique_drugs

## Average Values

In [None]:
# Column means, computed one Series at a time and unpacked in the order listed.
average_min_conc, average_max_conc, average_ln_ic50, average_auc = (
    gdsc_alt[metric].mean() for metric in ("MIN_CONC", "MAX_CONC", "LN_IC50", "AUC")
)

print(f"Average min conc: {average_min_conc}")
print(f"Average max conc: {average_max_conc}")
print(f"Average ln ic50: {average_ln_ic50}")
print(f"Average auc: {average_auc}")



## Visualisation and Correlation

In [None]:
# Tally how many rows each drug name has.
rows_per_drug = gdsc_alt["DRUG_NAME"].value_counts()

# Turn the tally into a two-column table with readable headers.
drug_counts = rows_per_drug.reset_index()
drug_counts.columns = ["Drug Name", "Count"]

# Show the table as plain text.
print(drug_counts)




In [None]:
# Summary statistics for LN_IC50; printed before the figure is rendered.
ln_ic50 = gdsc_alt["LN_IC50"]
mean_ln_ic50 = ln_ic50.mean()
std_ln_ic50 = ln_ic50.std()
var_ln_ic50 = ln_ic50.var()

print("Mean of LN_IC50:", mean_ln_ic50)
print("Standard Deviation of LN_IC50:", std_ln_ic50)
print("Variance of LN_IC50:", var_ln_ic50)

# One row, two panels: LN_IC50 on the left, AUC on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ln_ic50_ax, auc_ax = axes

# LN_IC50 histogram (default stat) with a KDE overlay.
sns.histplot(ln_ic50, kde=True, ax=ln_ic50_ax)
ln_ic50_ax.set_title("Distribution of LN_IC50")

# AUC histogram with stat="probability" and a KDE overlay.
sns.histplot(gdsc_alt["AUC"], kde=True, stat="probability", ax=auc_ax)
auc_ax.set_title("Distribution of AUC")

plt.tight_layout()
plt.show()


LN_IC50: The natural logarithm of the half-maximal inhibitory concentration (IC50), i.e. the drug concentration needed to reduce cell viability by half. Lower values mean a cell line is more sensitive to the drug, so the distribution shows how responses vary across drug and cell-line combinations.

AUC: The area under the fitted dose-response curve is a key summary metric in pharmacology. In GDSC the curve tracks cell viability across the screened concentration range, so values clustered towards the low end indicate that many drug and cell-line pairs responded strongly, whereas values clustered towards the high end indicate little response, i.e. lower efficacy.

In [None]:
# Coerce each metric column to float; a column that cannot be converted is reported
# and left as it was.
for column in ('LN_IC50', 'AUC', 'Z_SCORE'):
    try:
        converted = gdsc_alt[column].astype("float64")
        gdsc_alt[column] = converted
    except ValueError as e:
        # Some entries could not be parsed as numbers: name the column and show why.
        print(f"Column {column} has non-numeric data: {e}")



In [None]:
# Metric columns to correlate with one another.
numeric_cols = ['LN_IC50', 'AUC', 'Z_SCORE']

# Pairwise correlations between those columns, using corr()'s default method.
correlation_matrix = gdsc_alt.loc[:, numeric_cols].corr()

# Show the matrix as plain text.
print(correlation_matrix)


In [None]:
# Creates a Heatmap of the correlation matrix using plotly

! pip install plotly

In [None]:
! pip install --upgrade nbformat


In [None]:
! pip install --upgrade nbformat

In [None]:
import plotly.figure_factory as ff

# Recompute the correlations here so this cell does not rely on the printed-matrix cell.
numeric_cols = ['LN_IC50', 'AUC', 'Z_SCORE']
correlation_matrix = gdsc_alt[numeric_cols].corr()

# Pieces of the heatmap: raw coefficients for colour, 2-decimal text for the cell labels.
heatmap_values = correlation_matrix.values
heatmap_labels = correlation_matrix.round(2).values
column_names = list(correlation_matrix.columns)
row_names = list(correlation_matrix.index)

# Annotated heatmap; other colour scales such as 'Blues' or 'Reds' can be substituted.
fig = ff.create_annotated_heatmap(
    z=heatmap_values,
    x=column_names,
    y=row_names,
    annotation_text=heatmap_labels,
    colorscale='Viridis',
    hoverinfo='z',
)

# Layout: square canvas, titled axes, y axis reversed.
fig.update_layout(
    title='Correlation Matrix',
    xaxis={'title': 'Variable'},
    yaxis={'title': 'Variable', 'autorange': "reversed"},
    width=600,
    height=600,
)

# Render the figure.
fig.show()


In [None]:
# Scatter of every record: AUC on the x axis, LN_IC50 on the y axis.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="AUC", y="LN_IC50", data=gdsc_alt, ax=scatter_ax)
scatter_ax.set_title("AUC vs. LN_IC50")
plt.show()

In [None]:
! pip install statsmodels


In [None]:
! pip install ipywidgets

In [None]:
import ipywidgets as widgets
import plotly.express as px
from IPython.display import display

# Dropdown menu for drug selection.
# The option list is built once, as a plain list. Missing names are skipped because a
# NaN option can never match a row in the equality filter used by update_plot.
drug_names = list(gdsc_alt['DRUG_NAME'].dropna().unique())
drug_dropdown = widgets.Dropdown(
    options=drug_names,
    # Preselect the first drug in order of appearance; stay unselected if there are none.
    value=drug_names[0] if drug_names else None,
    description='Drug:',
)

# Callback that redraws the scatter plot for the drug chosen in the dropdown
def update_plot(drug):
    """Show an interactive AUC vs. LN_IC50 scatter for a single drug.

    Parameters
    ----------
    drug
        Value compared for equality against the ``DRUG_NAME`` column of
        ``gdsc_alt`` to select the rows that are plotted.

    Returns
    -------
    None
        The figure is displayed with ``show()``; nothing is returned.
    """
    is_selected = gdsc_alt['DRUG_NAME'] == drug
    drug_rows = gdsc_alt.loc[is_selected]

    scatter = px.scatter(
        drug_rows,
        x="AUC",
        y="LN_IC50",
        color="DRUG_NAME",
        hover_data=['CELL_LINE_NAME', 'DRUG_NAME'],
        color_discrete_sequence=px.colors.qualitative.Set1,
    )

    scatter.update_layout(
        title=f"AUC vs. LN_IC50 for {drug}",
        xaxis_title="AUC",
        yaxis_title="LN_IC50",
    )

    scatter.show()

# Bind the dropdown to update_plot's `drug` argument; the resulting widget is the cell output.
widgets.interactive(update_plot, drug=drug_dropdown)


# Potential Machine Learning Models

1. Predict LN_IC50 using AUC and other features of a drug
2. Predict AUC using LN_IC50 and other features of a drug
3. Predict drug response using LN_IC50 and other features of a drug
4. Predict LN_IC50 and AUC using other features of a drug