In [1]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from xgboost import XGBClassifier

2024-09-27 19:14:41.025157: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-27 19:14:41.028013: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-27 19:14:41.036978: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-27 19:14:41.052371: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-27 19:14:41.056865: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-27 19:14:41.067385: I tensorflow/core/platform/cpu_feature_gu

In [2]:
# Read the training and test CSV files into DataFrames.
# NOTE(review): these are absolute paths on one machine; adjust before running elsewhere.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/smayan/Desktop/train.csv', '/home/smayan/Desktop/test.csv')
)

In [3]:
def preprocess_text(text):
    """Normalise one cell value into lowercase alphanumeric text.

    Parameters
    ----------
    text : str, list, numpy.ndarray, scalar or missing value
        Raw cell content. Lists and arrays are flattened by joining the
        string form of their elements with single spaces.

    Returns
    -------
    str
        Lowercased text with every character other than ASCII letters,
        digits and whitespace removed, and whitespace runs collapsed to a
        single space. Missing values (``None``, ``NaN``, ...) and empty
        input give ``''``.
    """
    # Handle sequences first: pd.isna() on a list/array returns an element-wise
    # boolean array, and using that in an `if` raises "truth value is ambiguous".
    if isinstance(text, (list, np.ndarray)):
        text = ' '.join(map(str, text))
    elif pd.isna(text):
        return ''
    # Punctuation is dropped (not replaced by a space), e.g. "Foo-Bar" -> "foobar".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
    # split()/join collapses repeated whitespace and trims both ends.
    return ' '.join(text.split())

In [4]:
# Free-text columns that get normalised with preprocess_text.
text_columns = [
    'DRUGNAME', 'Disease_of_highest_status', 'TARGNAME', 'GENENAME',
    'SYNONYMS', 'FUNCTION', 'BIOCLASS', 'Disease',
]

# Apply the same cleaning to both splits so train and test text share one format.
for frame in (train_df, test_df):
    for col in text_columns:
        frame[col] = frame[col].apply(preprocess_text)

In [5]:
def _join_text_fields(row):
    """Concatenate the string form of every value in a row, separated by single spaces."""
    return ' '.join(row.values.astype(str))


# Build one space-separated document per row from the text columns, for both splits.
for frame in (train_df, test_df):
    frame['combined_text'] = frame[text_columns].apply(_join_text_fields, axis=1)

In [6]:
# Column groups; each group is routed to its own preprocessing branch later on.
numeric_columns = [
    'PUBCHCID',
]
categorical_columns = [
    'DRUGTYPE',
    'Drug_high_status',
    'Drug_Status',
]

In [7]:
# Model inputs: numeric + categorical columns plus the merged text document.
# .copy() detaches X from train_df so later column edits do not touch the source frame.
feature_columns = numeric_columns + categorical_columns + ['combined_text']
X = train_df[feature_columns].copy()

# Prediction target.
y = train_df['Target_Status']


In [8]:
# Number of distinct target labels.
distinct_targets = y.unique()
len(distinct_targets)

33

In [9]:
# Map target labels to integer class ids; the fitted encoder is kept so that
# predicted ids can be converted back to the original labels.
le_target = LabelEncoder().fit(y)
y_encoded = le_target.transform(y)

In [10]:
# Coerce the numeric feature columns to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
for col in numeric_columns:
    # Plain column assignment replaces the column, so the numeric dtype sticks.
    # `X.loc[:, col] = ...` writes into the existing column in place and can
    # leave an object-dtype column as object.
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Hold out 20% of the rows for validation; fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [11]:
class KerasSoftmaxClassifier(ClassifierMixin, BaseEstimator):
    """Scikit-learn style adapter around a dense Keras softmax network.

    A plain Keras ``Sequential`` does not behave like a scikit-learn
    classifier: its ``predict`` returns one row of class probabilities per
    sample instead of labels, and Keras layers expect a dense numeric array
    whereas the preprocessing step can emit a SciPy sparse matrix. This
    adapter densifies the input, builds and trains the network in ``fit`` and
    returns integer class ids from ``predict``.

    Parameters
    ----------
    n_classes : int
        Width of the softmax output layer. Training labels must be integers
        in ``[0, n_classes)``.
    hidden_units : tuple of int, default (256, 128, 64)
        Width of each hidden ReLU layer, in order.
    learning_rate : float, default 0.001
        Learning rate passed to the Adam optimizer.
    epochs : int, default 1000
        Number of training epochs.
    """

    def __init__(self, n_classes, hidden_units=(256, 128, 64), learning_rate=0.001, epochs=1000):
        self.n_classes = n_classes
        self.hidden_units = hidden_units
        self.learning_rate = learning_rate
        self.epochs = epochs

    @staticmethod
    def _to_dense(X):
        """Return ``X`` as a dense float32 array (sparse matrices are expanded)."""
        if hasattr(X, 'toarray'):
            X = X.toarray()
        return np.asarray(X, dtype='float32')

    def fit(self, X, y):
        """Build a fresh network and train it on ``X`` / integer labels ``y``."""
        layers = [Dense(units, activation='relu') for units in self.hidden_units]
        layers.append(Dense(self.n_classes, activation='softmax'))
        self.model_ = Sequential(layers)
        self.model_.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(self.learning_rate),
        )
        self.model_.fit(self._to_dense(X), np.asarray(y), epochs=self.epochs)
        # Class ids are the column indices of the softmax output.
        self.classes_ = np.arange(self.n_classes)
        return self

    def predict_proba(self, X):
        """Return the softmax probabilities, one row per sample."""
        return self.model_.predict(self._to_dense(X), verbose=0)

    def predict(self, X):
        """Return the most probable integer class id for each sample."""
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]


# Turn the mixed-type feature frame into one numeric matrix:
#   - numeric columns: median-impute missing values, then standardise
#   - categorical columns: one-hot encode (categories unseen at fit time are ignored)
#   - combined_text: TF-IDF over at most 1000 terms
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('text', TfidfVectorizer(max_features=1000), 'combined_text')
    ])

# Size the output layer from the label encoder instead of hard-coding the class count.
model = KerasSoftmaxClassifier(n_classes=len(le_target.classes_))

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('NN', model)
])

# Train through the pipeline so the network only ever sees the numeric feature
# matrix; fitting Keras directly on the raw DataFrame (which holds string
# columns) fails with "Failed to convert a NumPy array to a Tensor".
pipeline.fit(X_train, y_train)

In [None]:
# Score the pipeline on the held-out validation split.
y_pred = np.asarray(pipeline.predict(X_val))
# If the final estimator returns per-class probabilities (as Keras models do)
# rather than labels, take the most probable class; f1_score needs class ids.
if y_pred.ndim > 1:
    y_pred = y_pred.argmax(axis=1)
f1 = f1_score(y_val, y_pred, average='weighted')
print(f"Validation F1 Score: {f1}")

In [None]:
# Test feature matrix with the same columns, in the same order, as the training features.
X_test = test_df[numeric_columns + categorical_columns + ['combined_text']].copy()

# Coerce numeric columns; unparseable values become NaN (errors='coerce').
for col in numeric_columns:
    # Plain column assignment replaces the column so the numeric dtype sticks;
    # `X_test.loc[:, col] = ...` writes in place and can keep an object dtype.
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

In [None]:

# Predict encoded class ids for the test set.
test_predictions_encoded = np.asarray(pipeline.predict(X_test))
# If the final estimator returns per-class probabilities (as Keras models do)
# rather than labels, take the most probable class; the label encoder can only
# invert a 1-D array of integer class ids.
if test_predictions_encoded.ndim > 1:
    test_predictions_encoded = test_predictions_encoded.argmax(axis=1)

# Convert the integer class ids back to the original target labels.
test_predictions = le_target.inverse_transform(test_predictions_encoded)

# Write one (ID, Prediction) row per test sample.
submission = pd.DataFrame({'ID': test_df['ID'], 'Prediction': test_predictions})
submission.to_csv('submission.csv', index=False)

print("Submission file created.")