In [2]:
import re

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

In [5]:
# Directory that holds train.csv and test.csv. Kept in a single constant so the
# notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = '/home/smayan/Desktop/AI-ML-DS/NSDC'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [6]:
def preprocess_text(text):
    """Normalise one cell value into lowercase alphanumeric text.

    Parameters
    ----------
    text : object
        A scalar (string, number, ``None``/NaN, ...) or a ``list`` /
        ``numpy.ndarray`` whose items are joined with single spaces.

    Returns
    -------
    str
        The lowercased text with every character other than ASCII letters,
        digits and whitespace removed, and runs of whitespace collapsed to a
        single space. Missing scalars and empty input yield ``''``.
    """
    # Sequences must be handled before the missing-value check: pd.isna()
    # returns an element-wise array for list/ndarray input, and using that
    # array in a boolean context raises ValueError for more than one element.
    if isinstance(text, (list, np.ndarray)):
        text = ' '.join(map(str, text))
    elif pd.isna(text):
        return ''
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
    # split()/join collapses internal whitespace and strips both ends; an
    # empty string falls through to '' here as well.
    return ' '.join(text.split())

In [7]:
# Free-text columns that are cleaned with preprocess_text.
text_columns = [
    'DRUGNAME',
    'Disease_of_highest_status',
    'TARGNAME',
    'GENENAME',
    'SYNONYMS',
    'FUNCTION',
    'BIOCLASS',
    'Disease',
]

# Apply the same element-wise cleaning to both frames, one column at a time.
for frame in (train_df, test_df):
    for col in text_columns:
        frame[col] = frame[col].apply(preprocess_text)

In [8]:
def _join_text_fields(row):
    """Concatenate every value of a row, cast to str, separated by spaces."""
    return ' '.join(row.values.astype(str))


# Build one space-joined string per row from the text columns, for both frames.
for frame in (train_df, test_df):
    frame['combined_text'] = frame[text_columns].apply(_join_text_fields, axis=1)

In [9]:
# Columns coerced to numbers and routed to the numeric branch of the preprocessor.
numeric_columns = [
    'PUBCHCID',
]
# Columns routed to the one-hot-encoded branch of the preprocessor.
categorical_columns = [
    'DRUGTYPE',
    'Drug_high_status',
    'Drug_Status',
]

In [10]:
# Feature matrix: numeric + categorical columns plus the combined text column.
# Copied so later edits to X do not write through to train_df.
feature_columns = numeric_columns + categorical_columns + ['combined_text']
X = train_df[feature_columns].copy()

# Raw (un-encoded) target labels.
y = train_df['Target_Status']


In [12]:
# Learn the label -> integer mapping from the training target, then encode it.
# le_target is kept so predictions can be mapped back to the original labels.
le_target = LabelEncoder().fit(y)
y_encoded = le_target.transform(y)

In [13]:
# Coerce the numeric feature columns to numeric values; entries that cannot be
# parsed become NaN. Plain column assignment is used instead of
# `X.loc[:, col] = ...` because `.loc` writes into the existing column in place
# and can leave an object-dtype column as object, whereas `X[col] = ...`
# replaces the column with the newly converted one.
for col in numeric_columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [14]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories unseen at fit time are ignored.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Text branch: TF-IDF over the combined text column, capped at 1000 features.
text_transformer = TfidfVectorizer(max_features=1000)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
    # A single column name (not a list) is passed for the text branch.
    ('text', text_transformer, 'combined_text'),
])

# Full model: feature preprocessing followed by the XGBoost classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, n_jobs=-1)),
])

# Hyper-parameter grid; the `classifier__` prefix targets the pipeline step of
# that name.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 5],
    'classifier__learning_rate': [0.01, 0.1],
}

In [15]:
# 3-fold cross-validated search over param_grid, scored by weighted F1.
# error_score='raise' makes a failing fit raise instead of being recorded as a
# score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
    error_score='raise',
)
grid_search.fit(X_train, y_train)

In [None]:
# Score the searched model on the held-out validation rows using weighted F1.
y_pred = grid_search.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')
print(f"Validation F1 Score: {f1}")

In [None]:
# Refit the best configuration found by the search on all labelled rows.
# An unfitted clone is trained rather than `grid_search.best_estimator_` itself:
# that object is the model behind `grid_search.predict`, so fitting it in place
# on X (which contains the validation rows) would make any later re-run of the
# validation scoring use a model that has already seen X_val.
best_model = clone(grid_search.best_estimator_)
best_model.fit(X, y_encoded)

In [None]:
# Select the same feature columns from the test frame as were used for X;
# copied so the coercion below does not write through to test_df.
X_test = test_df[numeric_columns + categorical_columns + ['combined_text']].copy()

# Coerce numeric features to numeric values (unparseable entries become NaN).
# Plain column assignment replaces the column; `X_test.loc[:, col] = ...` would
# write in place and can leave an object-dtype column as object.
for col in numeric_columns:
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

In [None]:

# Predict integer-encoded classes for the test rows, then map them back to the
# original target labels with the fitted label encoder.
test_predictions_encoded = best_model.predict(X_test)
test_predictions = le_target.inverse_transform(test_predictions_encoded)

# Pair each test ID with its predicted label and write the result without the
# DataFrame index.
submission_columns = {'ID': test_df['ID'], 'Prediction': test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

print("Submission file created.")