<a href="https://colab.research.google.com/github/Swathi2603/Mini-Projects/blob/main/Clinical%20Trial%20Site%20Initialisation%20Planning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import xgboost as xgb
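# Suffix 1 (X_train1, y_test1, y_pred1, ...): CTT-FP model, target column 'CTT-FP_weeks'
# Suffix 2 (X_train2, y_test2, y_pred2, ...): FP-FSI model, target column 'FP-FSI_weeks'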

In [None]:
# Location of the Excel workbook that holds the study and study/country sheets.
file_path = '/content/Sample Data (1) (2) (1) (2).xlsx'

# Open the workbook once and parse every sheet from the same handle instead of
# re-reading the file from disk for each sheet.
with pd.ExcelFile(file_path) as workbook:
    data = workbook.parse(0)  # first sheet, i.e. what read_excel returns by default
    study_df = workbook.parse('study')
    study_country_df = workbook.parse('study_country')

# Quick look at the default sheet and its column names.
print(data.head())
print(data.columns)

  StudyCode DevelopmentUnit StudyStatus      Phase PediatricOnly  \
0  CXD4368A     Respiratory    Complete    Phase I            No   
1  CXD1591A  Cardiovascular      Active   Phase IV            No   
2  CXD2664A        Oncology    Complete  Phase III           Yes   
3  CXD6887A     Respiratory    Complete   Phase IV           Yes   
4  CXD8680A  Cardiovascular      Active  Phase III            No   

  New Indication      Blinding  CTTActual ActualFinalProtocol  CTT-FP_weeks  
0            Yes  Single Blind 2018-05-12          2019-05-20     53.285714  
1             No    Open Label 2018-05-17          2019-06-16     56.428571  
2            Yes  Double Blind 2018-09-07          2019-05-25     37.142857  
3            Yes    Open Label 2018-04-11          2019-08-08     69.142857  
4             No    Open Label 2018-05-30          2019-01-11     32.285714  
Index(['StudyCode', 'DevelopmentUnit', 'StudyStatus', 'Phase', 'PediatricOnly',
       'New Indication', 'Blinding', 'CTTAc

In [None]:
# Attach the study-level attributes to each study/country row, matching on StudyCode.
merged_df = study_country_df.merge(study_df, on='StudyCode')

# Preview the joined frame and count missing values per column.
print(merged_df.head())
print(merged_df.isna().sum())


  StudyCode    Country CountryHASubActual Country1SIVActual  \
0  CXD4368A  Australia         2020-06-21        2021-07-19   
1  CXD4368A        USA         2019-12-08        2020-12-20   
2  CXD4368A      Spain         2019-10-21        2020-08-09   
3  CXD4368A     Brazil         2021-03-15        2020-06-05   
4  CXD4368A      Japan         2020-06-08        2020-10-04   

  ActualFinalProtocol_x  FP-FSI_weeks DevelopmentUnit StudyStatus    Phase  \
0            2019-05-20    113.000000     Respiratory    Complete  Phase I   
1            2019-05-20     82.857143     Respiratory    Complete  Phase I   
2            2019-05-20     63.857143     Respiratory    Complete  Phase I   
3            2019-05-20     54.571429     Respiratory    Complete  Phase I   
4            2019-05-20     71.857143     Respiratory    Complete  Phase I   

  PediatricOnly New Indication      Blinding  CTTActual ActualFinalProtocol_y  \
0            No            Yes  Single Blind 2018-05-12            2019

In [None]:
# CTT-FP model: build the feature matrix and target from the study sheet.
# Adjust these names if the column names in the sheet change.
target_column = 'CTT-FP_weeks'
feature_columns = ['DevelopmentUnit', 'Phase', 'New Indication', 'Blinding', 'PediatricOnly']

# Target series, plus one indicator column per category level (no level is dropped).
y = data[target_column]
X = pd.get_dummies(data[feature_columns], drop_first=False)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train1.columns

Index(['DevelopmentUnit_Cardiovascular', 'DevelopmentUnit_NeuroScience',
       'DevelopmentUnit_Oncology', 'DevelopmentUnit_Respiratory',
       'Phase_Phase I', 'Phase_Phase II', 'Phase_Phase III', 'Phase_Phase IV',
       'New Indication_No', 'New Indication_Yes', 'Blinding_Double Blind',
       'Blinding_Open Label', 'Blinding_Single Blind', 'PediatricOnly_No',
       'PediatricOnly_Yes'],
      dtype='object')

In [None]:
# FP-FSI model: build the feature matrix and target from the merged study/country frame.
# Adjust these names if the column names in the sheets change.
target_column = 'FP-FSI_weeks'
feature_columns = ['Country', 'DevelopmentUnit', 'Phase', 'New Indication', 'Blinding', 'PediatricOnly']

# Target series, plus one indicator column per category level (no level is dropped).
y = merged_df[target_column]
X = pd.get_dummies(merged_df[feature_columns], drop_first=False)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train2.columns



Index(['Country_Argentina', 'Country_Australia', 'Country_Brazil',
       'Country_Canada', 'Country_China', 'Country_France', 'Country_India',
       'Country_Italy', 'Country_Japan', 'Country_South Africa',
       'Country_Spain', 'Country_UK', 'Country_USA',
       'DevelopmentUnit_Cardiovascular', 'DevelopmentUnit_NeuroScience',
       'DevelopmentUnit_Oncology', 'DevelopmentUnit_Respiratory',
       'Phase_Phase I', 'Phase_Phase II', 'Phase_Phase III', 'Phase_Phase IV',
       'New Indication_No', 'New Indication_Yes', 'Blinding_Double Blind',
       'Blinding_Open Label', 'Blinding_Single Blind', 'PediatricOnly_No',
       'PediatricOnly_Yes'],
      dtype='object')

In [None]:
# CTT-FP: tune a random forest with an exhaustive grid search.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Fixed seed: without it the forest's bootstrap/feature sampling differs on every
# run, so the chosen parameters and downstream metrics would not be reproducible.
rf = RandomForestRegressor(random_state=42)

# No `scoring` is given, so candidates are ranked by the estimator's default
# score (R^2 for regressors), unlike the FP-FSI search which ranks by negative MSE.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train1, y_train1)
print("Best parameters found: ", grid_search.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# whole training split using the winning parameters; no extra training is needed.
best_rf = grid_search.best_estimator_



Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found:  {'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}


In [None]:
# Evaluate the tuned CTT-FP model on its held-out split.
y_pred1 = best_rf.predict(X_test1)
mse = mean_squared_error(y_test1, y_pred1)
print(f'Mean Squared Error: {mse}')
mae = mean_absolute_error(y_test1, y_pred1)
print(f'Mean Absolute Error (MAE): {mae}')
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Range of the CTT-FP target. The global `y` must not be used here: by the time
# this cell runs it has been overwritten with the FP-FSI target, which would
# normalize CTT-FP errors by the wrong scale. Rebuild the full CTT-FP target
# from its own train/test parts instead.
ctt_fp_target = pd.concat([y_train1, y_test1])
target_range = ctt_fp_target.max() - ctt_fp_target.min()

# NOTE: MSE is in squared weeks while the range is in weeks, so this ratio is
# not unit-free; it is kept for continuity with the existing report.
normalized_mse = mse / target_range
print(f'Normalized MSE: {normalized_mse}')
normalized_rmse = rmse / target_range
print(f'Normalized RMSE: {normalized_rmse}')

# "Accuracy" here is simply (1 - range-normalized RMSE) expressed as a percentage.
accuracy = (1 - normalized_rmse) * 100
print(f'Accuracy: {accuracy:.2f}%')


Mean Squared Error: 219.85321845274657
Mean Absolute Error (MAE): 11.782208347418972
Root Mean Squared Error (RMSE): 14.827448143653937
Normalized MSE: 2.249959837966704
Normalized RMSE: 0.1517428903590315
Accuracy: 84.83%


In [None]:
# FP-FSI: exhaustive hyper-parameter search for a gradient-boosted regressor.
param_grid = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)
xgb_reg = xgb.XGBRegressor(enable_categorical=True, objective='reg:squarederror')

# Rank candidates by (negative) mean squared error over 5 folds, using all cores.
grid_search = GridSearchCV(
    xgb_reg,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train2, y_train2)
print("Best parameters found: ", grid_search.best_params_)

# Keep the estimator that the search selected.
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}


In [None]:
# Evaluate the tuned FP-FSI model on its held-out split.
y_pred2 = best_model.predict(X_test2)
mse = mean_squared_error(y_test2, y_pred2)
print(f'Mean Squared Error(MSE): {mse}')
mae = mean_absolute_error(y_test2, y_pred2)
print(f'Mean Absolute Error (MAE): {mae}')
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Range of the FP-FSI target, rebuilt from this model's own train/test parts so
# the cell does not depend on whatever the shared global `y` currently holds.
fp_fsi_target = pd.concat([y_train2, y_test2])
target_range = fp_fsi_target.max() - fp_fsi_target.min()

# NOTE: MSE is in squared weeks while the range is in weeks, so this ratio is
# not unit-free; it is kept for continuity with the existing report.
normalized_mse = mse / target_range
print(f'Normalized MSE: {normalized_mse}')

# Reference MSE the model is compared against.
# NOTE(review): the origin of 482 is not shown in this notebook - confirm its source.
baseline_mse = 482
relative_improvement = (baseline_mse - mse) / baseline_mse * 100
print(f'Relative Improvement: {relative_improvement:.2f}%')

# RMSE relative to the target range (mse, rmse and target_range are reused from
# above rather than being recomputed).
normalized_rmse = rmse / target_range
print(f'Normalized RMSE: {normalized_rmse}')

# "Accuracy" here is simply (1 - range-normalized RMSE) expressed as a percentage.
accuracy = (1 - normalized_rmse) * 100
print(f'Accuracy: {accuracy:.2f}%')





Mean Squared Error(MSE): 309.3768297048761
Mean Absolute Error (MAE): 14.38391435317182
Root Mean Squared Error (RMSE): 17.589111111846332
Normalized MSE: 3.1661371461025327
Relative Improvement: 35.81%
Normalized RMSE: 0.18000552307445072
Accuracy: 82.00%


In [None]:
import numpy as np
import pandas as pd

# Function to get user inputs
def get_user_input():
    """Ask for the country and study attributes on stdin.

    Returns:
        dict: the raw answers keyed by 'Country', 'DevelopmentUnit', 'Phase',
        'New Indication', 'Blinding' and 'PediatricOnly', in that order.
    """
    # (result key, prompt text) pairs, asked in this order.
    questions = (
        ('Country', "Country (Argentina, Australia, Brazil, Canada, China, France, India, Italy, Japan, South Africa, Spain, UK, USA): "),
        ('DevelopmentUnit', "Development Unit (Cardiovascular, NeuroScience, Oncology, Respiratory): "),
        ('Phase', "Phase (Phase I, Phase II, Phase III, Phase IV): "),
        ('New Indication', "Is this a new indication? (Yes, No): "),
        ('Blinding', "Enter Blinding (Double Blind, Open Label, Single Blind): "),
        ('PediatricOnly', "Is this pediatric only (Yes, No): "),
    )
    return {field: input(prompt_text) for field, prompt_text in questions}

def convert_to_dataframe(user_input, columns):
    """Turn raw categorical answers into a single-row one-hot DataFrame.

    Args:
        user_input: mapping of feature name -> chosen category value
            (e.g. {'Phase': 'Phase I'}).
        columns: encoded column labels in the form '<feature>_<value>'
            (a list or a pandas Index).

    Returns:
        A one-row DataFrame with exactly `columns`; the column matching each
        answer is set to 1 and every other column is 0.

    An answer is matched exactly first. If that fails, a match that ignores
    letter case and surrounding/repeated whitespace is tried, so input such as
    ' usa ' still selects 'Country_USA'. Answers that match no column are
    reported on stdout and leave the row unchanged.
    """
    # Start from an all-zero row so every category that is not chosen stays 0.
    input_df = pd.DataFrame(np.zeros((1, len(columns))), columns=columns)

    # Normalized label -> real column label, used only as a fallback lookup.
    normalized_lookup = {" ".join(str(col).split()).casefold(): col for col in columns}

    for key, value in user_input.items():
        column_name = f"{key}_{value}"
        if column_name in columns:
            matched = column_name
        else:
            cleaned_value = " ".join(str(value).split())
            matched = normalized_lookup.get(f"{key}_{cleaned_value}".casefold())

        if matched is None:
            print(f"Column '{column_name}' not found in columns")
        else:
            input_df.at[0, matched] = 1

    return input_df

# Encoded feature layouts, taken directly from the frames the models were
# fitted on so the prediction rows can never drift from the training columns
# (a hand-copied list would have to be updated whenever the data changes).
columns1 = X_train1.columns  # CTT-FP model features
columns2 = X_train2.columns  # FP-FSI model features (adds the Country_* columns)

# Prompt user for inputs
user_input = get_user_input()

# The CTT-FP model uses every answer except the country.
ctt_fp_fields = ['DevelopmentUnit', 'Phase', 'New Indication', 'Blinding', 'PediatricOnly']
user_input_CTT_FP = {field: user_input[field] for field in ctt_fp_fields}

# The FP-FSI model uses the same answers plus the country.
user_input_FP_FSI = {'Country': user_input['Country'], **user_input_CTT_FP}

# Convert the answers to one-hot rows in each model's column layout.
input_df1 = convert_to_dataframe(user_input_CTT_FP, columns1)
input_df2 = convert_to_dataframe(user_input_FP_FSI, columns2)

# Make predictions.
# NOTE: y_pred1 / y_pred2 are reused here, replacing the test-set predictions
# computed in the evaluation cells.
y_pred1 = best_rf.predict(input_df1)
y_pred2 = best_model.predict(input_df2)

# Round each prediction to whole weeks.
rounded_y_pred1 = round(y_pred1[0])
rounded_y_pred2 = round(y_pred2[0])

# Output results
print("Predicted CTT-FP Weeks:", rounded_y_pred1, "Weeks")
print("Predicted FP-FSI Weeks:", rounded_y_pred2, "Weeks")
print("TOTAL WEEKS FOR CTT-FSI:", rounded_y_pred1 + rounded_y_pred2, "Weeks")


Country (Argentina, Australia, Brazil, Canada, China, France, India, Italy, Japan, South Africa, Spain, UK, USA): USA
Development Unit (Cardiovascular, NeuroScience, Oncology, Respiratory): Oncology
Phase (Phase I, Phase II, Phase III, Phase IV): Phase III
Is this a new indication? (Yes, No): Yes
Enter Blinding (Double Blind, Open Label, Single Blind): Open Label
Is this pediatric only (Yes, No): Yes
Predicted CTT-FP Weeks: 50 Weeks
Predicted FP-FSI Weeks: 70 Weeks
TOTAL WEEKS FOR CTT-FSI: 120 Weeks


In [None]:
# Collects one (country, total_weeks) tuple per candidate country.
results = list()

# Candidate countries to compare.
countries = [
    'Argentina', 'Australia', 'Brazil', 'Canada', 'China',
    'France', 'India', 'Italy', 'Japan', 'South Africa',
    'Spain', 'UK', 'USA',
]

def get_user_input():
    """Ask for the study attributes (no country) on stdin.

    Returns:
        dict: the raw answers keyed by 'DevelopmentUnit', 'Phase',
        'New Indication', 'Blinding' and 'PediatricOnly', in that order.
    """
    # (result key, prompt text) pairs, asked in this order.
    questions = (
        ('DevelopmentUnit', "Development Unit (Cardiovascular, NeuroScience, Oncology, Respiratory): "),
        ('Phase', "Phase (Phase I, Phase II, Phase III, Phase IV): "),
        ('New Indication', "Is this a new indication? (Yes, No): "),
        ('Blinding', "Enter Blinding (Double Blind, Open Label, Single Blind): "),
        ('PediatricOnly', "Is this pediatric only (Yes, No): "),
    )
    return {field: input(prompt_text) for field, prompt_text in questions}

def convert_to_dataframe(user_input, columns):
    """Turn raw categorical answers into a single-row one-hot DataFrame.

    Args:
        user_input: mapping of feature name -> chosen category value
            (e.g. {'Phase': 'Phase I'}).
        columns: encoded column labels in the form '<feature>_<value>'
            (a list or a pandas Index).

    Returns:
        A one-row DataFrame with exactly `columns`; the column matching each
        answer is set to 1 and every other column is 0.

    An answer is matched exactly first. If that fails, a match that ignores
    letter case and surrounding/repeated whitespace is tried, so input such as
    ' usa ' still selects 'Country_USA'. Answers that match no column are
    reported on stdout and leave the row unchanged.
    """
    # Start from an all-zero row so every category that is not chosen stays 0.
    input_df = pd.DataFrame(np.zeros((1, len(columns))), columns=columns)

    # Normalized label -> real column label, used only as a fallback lookup.
    normalized_lookup = {" ".join(str(col).split()).casefold(): col for col in columns}

    for key, value in user_input.items():
        column_name = f"{key}_{value}"
        if column_name in columns:
            matched = column_name
        else:
            cleaned_value = " ".join(str(value).split())
            matched = normalized_lookup.get(f"{key}_{cleaned_value}".casefold())

        if matched is None:
            print(f"Column '{column_name}' not found in columns")
        else:
            input_df.at[0, matched] = 1

    return input_df


# Encoded feature layouts, taken directly from the frames the models were
# fitted on so the prediction rows can never drift from the training columns
# (a hand-copied list would have to be updated whenever the data changes).
columns1 = X_train1.columns  # CTT-FP model features
columns2 = X_train2.columns  # FP-FSI model features (adds the Country_* columns)

# Ask once for the study attributes; the country is varied by the loop below.
user_input = get_user_input()

user_input_CTT_FP = {
    'DevelopmentUnit': user_input['DevelopmentUnit'],
    'Phase': user_input['Phase'],
    'New Indication': user_input['New Indication'],
    'Blinding': user_input['Blinding'],
    'PediatricOnly': user_input['PediatricOnly']
}

# The CTT-FP inputs contain no country, so this prediction is identical for
# every country: compute it once instead of once per loop iteration.
input_df1 = convert_to_dataframe(user_input_CTT_FP, columns1)
y_pred1 = best_rf.predict(input_df1)
rounded_y_pred1 = round(y_pred1[0])

for country in countries:
    # Same study attributes, with the candidate country placed first.
    user_input_FP_FSI = {'Country': country, **user_input_CTT_FP}

    input_df2 = convert_to_dataframe(user_input_FP_FSI, columns2)
    y_pred2 = best_model.predict(input_df2)
    rounded_y_pred2 = round(y_pred2[0])

    # Total duration = CTT-FP weeks + FP-FSI weeks for this country.
    total_weeks = rounded_y_pred1 + rounded_y_pred2
    results.append((country, total_weeks))

# Shortest total first; the sort is stable, so ties keep the order of `countries`.
results.sort(key=lambda x: x[1])

print("Top 5 countries:")
for country, weeks in results[:5]:
    print(f"{country}: {weeks} weeks")

Development Unit (Cardiovascular, NeuroScience, Oncology, Respiratory): Oncology
Phase (Phase I, Phase II, Phase III, Phase IV): Phase IV
Is this a new indication? (Yes, No): Yes
Enter Blinding (Double Blind, Open Label, Single Blind): Single Blind
Is this pediatric only (Yes, No): Yes
Top 5 countries:
Japan: 122 weeks
USA: 122 weeks
Australia: 123 weeks
Canada: 123 weeks
France: 123 weeks
