In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Training-data preparation: load the CSV, encode the target, build combined
# categorical features, one-hot encode them and create a stratified 80/20
# split exposed as NumPy arrays (X_train, X_test, y_train, y_test).
# ---------------------------------------------------------------------------

# Load the training data
data = pd.read_csv('/home/smayan/Desktop/train.csv')

# Mapping the target statuses to integer class ids.
# NOTE: the ids start at 1, not 0. A classifier trained with a sparse
# categorical loss on these labels therefore needs max(id) + 1 output units.
status_mapping = {
    'Approved': 1, 'Phase 2': 2, 'Phase 1': 3, 'Phase 3': 4,
    'Investigative': 5, 'Phase 1/2': 6, 'Discontinued in Phase 2': 7,
    'Terminated': 8, 'Patented': 9, 'Discontinued in Phase 3': 10,
    'Discontinued in Phase 1': 11, 'Preclinical': 12, 'Withdrawn from market': 13,
    'Phase 2/3': 14, 'Phase 4': 15, 'Clinical trial': 16,
    'Preregistration': 17, 'Phase 1b': 18, 'Phase 2a': 19,
    'Discontinued in Preregistration': 20, 'Discontinued in Phase 1/2': 21,
    'Registered': 22, 'Approved (orphan drug)': 23, 'Application submitted': 24,
    'IND submitted': 25, 'Discontinued in Phase 2/3': 26, 'Phase 2b': 27,
    'Phase 0': 28, 'Discontinued in Phase 4': 29, 'BLA submitted': 30,
    'Phase 1/2a': 31, 'Discontinued in Phase 2b': 32, 'Phase 1b/2a': 33
}

# Map target status
data['Target_Status'] = data['Target_Status'].map(status_mapping)

# `Series.map` produces NaN for every status that is missing from
# `status_mapping` (or missing in the CSV). NaN is not a usable class id, so
# such rows are removed up front and reported instead of silently flowing into
# the stratified split and the loss.
unmapped_rows = data['Target_Status'].isna()
if unmapped_rows.any():
    print(f"Dropping {int(unmapped_rows.sum())} training rows with an unmapped Target_Status")
    data = data.loc[~unmapped_rows].copy()
data['Target_Status'] = data['Target_Status'].astype('int64')

# Prepare features and labels (identifier / free-text columns are excluded)
X = data.drop(columns=['ID', 'Target_Status', 'TargetID', 'DRUGID', 'DRUGNAME', 'SEQUENCE', 'Accession Number'])
y_encoded = data['Target_Status']

# Feature engineering: concatenate related categorical columns into combined
# categories, using '_' as the separator.
X['DrugType_HighStatus'] = X['DRUGTYPE'].astype(str) + '_' + X['Drug_high_status'].astype(str)
X['DiseaseStatus_DrugStatus'] = X['Disease_of_highest_status'].astype(str) + '_' + X['Drug_Status'].astype(str)
X['Unique_TargetID'] = X['UNIPROID'].astype(str) + '_' + X['TARGNAME'].astype(str) + '_' + X['GENENAME'].astype(str)
X['BioClass_Function'] = X['BIOCLASS'].astype(str) + '_' + X['FUNCTION'].astype(str)

# Drop original categorical columns now that the combined ones exist
X = X.drop(columns=['DRUGTYPE', 'Drug_high_status', 'Disease_of_highest_status', 'Drug_Status', 'UNIPROID', 'TARGNAME', 'GENENAME', 'BIOCLASS', 'FUNCTION'])

# One-hot encoding for categorical features.
# An explicit float dtype is requested because pandas >= 2.0 emits bool dummy
# columns by default; mixed with numeric columns that turns `to_numpy()` into
# an object array, which cannot be converted to a tensor.
X = pd.get_dummies(X, dtype=np.float32)

# Split the data into training and testing sets (stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Convert to NumPy arrays; features are forced to a single float32 dtype
X_train = X_train.to_numpy(dtype=np.float32)
X_test = X_test.to_numpy(dtype=np.float32)
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Define the neural network model
def create_nn_model(input_dim):
    """Build and compile a small fully connected softmax classifier.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``keras.Sequential`` model with two hidden ReLU layers
        (64 and 32 units) and a softmax output layer, using the Adam
        optimizer, sparse categorical cross-entropy loss and an accuracy
        metric.
    """
    # Sparse categorical cross-entropy requires integer labels in
    # [0, output_units). The ids in `status_mapping` start at 1 (1..33), so
    # sizing the output by len(status_mapping) would leave the largest id out
    # of range. Size it by the largest id instead.
    num_classes = max(status_mapping.values()) + 1

    model = keras.Sequential([
        # Explicit Input object instead of the legacy `input_shape=` layer argument.
        keras.Input(shape=(input_dim,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# ---------------------------------------------------------------------------
# Train the network, apply the same preprocessing to the test CSV, predict a
# status for every test row and write the submission file.
# ---------------------------------------------------------------------------

# Train the neural network
input_dim = X_train.shape[1]
nn_model = create_nn_model(input_dim)
# The held-out split is passed as validation data so each epoch reports
# loss/accuracy on unseen rows; it does not influence the weight updates.
nn_model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1, validation_data=(X_test, y_test))

# Load and preprocess the test data (same combined features as for training)
test_df = pd.read_csv('/home/smayan/Desktop/test.csv')
test_df['DrugType_HighStatus'] = test_df['DRUGTYPE'].astype(str) + '_' + test_df['Drug_high_status'].astype(str)
test_df['DiseaseStatus_DrugStatus'] = test_df['Disease_of_highest_status'].astype(str) + '_' + test_df['Drug_Status'].astype(str)
test_df['Unique_TargetID'] = test_df['UNIPROID'].astype(str) + '_' + test_df['TARGNAME'].astype(str) + '_' + test_df['GENENAME'].astype(str)
test_df['BioClass_Function'] = test_df['BIOCLASS'].astype(str) + '_' + test_df['FUNCTION'].astype(str)

# Drop original categorical columns
test_df_processed = test_df.drop(columns=['DRUGTYPE', 'Drug_high_status', 'Disease_of_highest_status', 'Drug_Status', 'UNIPROID', 'TARGNAME', 'GENENAME', 'BIOCLASS', 'FUNCTION'])

# Also drop the identifier / free-text columns that were excluded from the
# training features. The reindex below would discard anything derived from
# them anyway, but one-hot encoding them first creates a dummy column per
# distinct value for no benefit. `errors='ignore'` because not every one of
# these columns is necessarily present in the test file.
test_df_processed = test_df_processed.drop(
    columns=['ID', 'Target_Status', 'TargetID', 'DRUGID', 'DRUGNAME', 'SEQUENCE', 'Accession Number'],
    errors='ignore',
)

# One-hot encoding for test data (explicit float dtype: pandas >= 2.0 would
# otherwise emit bool columns)
test_df_processed = pd.get_dummies(test_df_processed, dtype=np.float32)

# Align test data columns with training data: categories unseen in training
# are dropped, categories missing from the test set become all-zero columns.
test_df_processed = test_df_processed.reindex(columns=X.columns, fill_value=0)

# Reverse the status mapping (class id -> status name)
reverse_status_mapping = {v: k for k, v in status_mapping.items()}

# Make predictions on the test data
class_probabilities = nn_model.predict(test_df_processed.to_numpy(dtype=np.float32))

# Pick the most probable class among output units that correspond to a known
# status id. A plain argmax over all units could return an index without an
# entry in `status_mapping` (e.g. 0), which would become NaN in the submission.
known_class_ids = np.array(sorted(
    class_id for class_id in reverse_status_mapping
    if 0 <= class_id < class_probabilities.shape[1]
))
test_predictions = known_class_ids[np.argmax(class_probabilities[:, known_class_ids], axis=1)]

# Prepare submission DataFrame and translate class ids back to status names
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Prediction': test_predictions
})
submission['Prediction'] = submission['Prediction'].map(reverse_status_mapping)

# Save the submission DataFrame to a CSV file
submission.to_csv('submission_nn.csv', index=False)

print("Submission file created successfully!")


2024-09-28 12:05:08.026098: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-28 12:05:08.100201: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-28 12:05:08.174975: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-28 12:05:08.237696: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-28 12:05:08.255370: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-28 12:05:08.372706: I tensorflow/core/platform/cpu_feature_gu

: 

: 

: 