In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.metrics import f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load train, test and feamat datasets
df_train = pd.read_csv("/kaggle/input/the-toxicity-prediction-challenge/train.csv")
df_feamat = pd.read_csv("/kaggle/input/the-toxicity-prediction-challenge/feamat.csv")
df_test = pd.read_csv("/kaggle/input/the-toxicity-prediction-challenge/test.csv")

# Select the prediction target. Here the prediction target is column 'Expected'
y = df_train["Expected"].to_numpy()

#split the Id column in train.csv and x column in test.csv into chemical id and assay id 
df_train[['ChemId','AssayId']] = df_train.Id.str.split(";",expand=True) 
df_test[['ChemId','AssayId']] = df_test.x.str.split(";",expand=True) 

# Rename V1 column in feamat to ChemId for mapping b/w train, test datasets with feamat
df_feamat.rename(columns = {'V1':'ChemId'}, inplace = True) 

# Remove all columns with more no of 0's and 1's
df_feamat = df_feamat.loc[:, (~df_feamat.isin([0,1])).any(axis = 0)]

# Replace Nan and infinte values with mean
df_feamat.replace([np.inf, -np.inf], np.nan, inplace =True)
df_feamat['V15'].fillna(value=df_feamat['V15'].mean(), inplace = True)

# Merge feamat with test and train
df_train = df_train.merge(df_feamat, on="ChemId", how="left")
df_test = df_test.merge(df_feamat, on="ChemId", how="left")

final_df_train = df_train.drop(['ChemId', 'Id','V2', 'Expected'], axis = 1).to_numpy()
final_df_test = df_test.drop(['ChemId','x', 'V2'], axis = 1).to_numpy()
df_train['AssayId'] = df_train.AssayId.astype(int)
df_test['AssayId'] = df_test.AssayId.astype(int)

folds = StratifiedKFold(n_splits = 10,random_state = None, shuffle = False) 
for train_index, test_index in folds.split(final_df_train, y):
    X_train, X_test, y_train, y_test = final_df_train[train_index], final_df_train[test_index], y[train_index], y[test_index]
   
model = XGBClassifier(n_estimators = 400, max_depth=8)
model.fit(X_train, y_train)
prediction_test = model.predict(X_test)
cv= cross_val_score(model,X_train,y_train,cv=folds)
print(cv.mean())
print(f1_score(y_test, prediction_test, average = "macro"))

prediction = model.predict(final_df_test)
output_file = pd.DataFrame({'Id':  df_test.x, 'Predicted': prediction})
print(output_file)
output_file.to_csv('XGBClassifierOutput.csv', index=False)
print("Ouput file created")



/kaggle/input/the-toxicity-prediction-challenge/sample_submission.csv
/kaggle/input/the-toxicity-prediction-challenge/feamat.csv
/kaggle/input/the-toxicity-prediction-challenge/features_id_name_mappings.csv
/kaggle/input/the-toxicity-prediction-challenge/train.csv
/kaggle/input/the-toxicity-prediction-challenge/test.csv












































0.9086002702258655
0.7995473277661882
                     Id  Predicted
0          88-60-8;1682          2
1      122931-48-0;1656          2
2        NOCAS_47311;36          2
3       55589-62-3;1850          2
4         79902-63-9;30          2
...                 ...        ...
11134    141517-21-7;38          2
11135        81-90-3;34          2
11136   74223-64-6;1640          2
11137        62-73-7;28          2
11138    2634-33-5;1855          2

[11139 rows x 2 columns]
Ouput file created
