In [1]:
# My First Kaggle Notebook -- 
# Started Kaggle now but not new to Data Science

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix
from sklearn.impute import KNNImputer

In [4]:
import lightgbm as lgb
from tqdm import tqdm

In [5]:
import optuna

In [6]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

In [7]:
train.rename(columns = {"Class" : "target"}, inplace = True)
train.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,target
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [8]:
one_hot_encoded = pd.get_dummies(train[['EJ']])
print(one_hot_encoded.head())

   EJ_A  EJ_B
0     0     1
1     1     0
2     0     1
3     0     1
4     0     1


In [9]:
for column in one_hot_encoded.columns:
   train.insert(len(train.columns), column , one_hot_encoded.loc[:, column])

In [10]:
train.drop(['EJ', 'Id'], axis=1, inplace=True)
target_column = train["target"]
train.drop(["target"], inplace=True, axis=1) # Remove target column for training the data


In [11]:
imputer = KNNImputer(n_neighbors=2)
imputed_to_fit_on_test = imputer.fit(train)
imputed_data = imputer.fit_transform(train)

In [12]:
train = pd.DataFrame(imputed_data, columns = train.columns)

In [13]:
scaled_to_fit_on_test =  MinMaxScaler().fit(train)

def standardizationMinMax(dataset) :
    scaler = MinMaxScaler()
    return scaler.fit_transform(dataset)

In [14]:
train = pd.DataFrame(standardizationMinMax(train) , columns = train.columns)

In [15]:
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
one_hot_encoded = pd.get_dummies(test[['EJ']])
print(one_hot_encoded.head())
test.drop(['EJ', 'Id'], axis=1, inplace=True)
for column in one_hot_encoded.columns:
   test.insert(len(test.columns), column , one_hot_encoded.loc[:, column])
if "EJ_B" not in test.columns:
    test["EJ_B"] = 0
if "EJ_A" not in test.columns:
    test["EJ_A"] = 1
    
test = pd.DataFrame(imputed_to_fit_on_test.transform(test), columns=test.columns) 


   EJ_A
0     1
1     1
2     1
3     1
4     1


In [16]:
def objective(trial) :
    params = {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "num_leaves": trial.suggest_int("num_leaves", 20, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    model = lgb.LGBMClassifier(**params, random_state=42, n_jobs=-1)
    
    scores = []
    for train_idx, test_idx in tqdm(skf.split(train,target_column)):
        x_train, x_test = train.loc[train_idx], train.loc[test_idx] 
        y_train, y_test = target_column.loc[train_idx] , target_column.loc[test_idx]

        model.fit(x_train, y_train)
        
        predictions = model.predict(x_test)
        score = accuracy_score(y_test, predictions)
        
        scores.append(score)
        
    
    return np.mean(scores)
    


In [17]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2023-08-05 17:06:48,969] A new study created in memory with name: no-name-8bc43187-08e7-4006-9b12-dcee2920dae8
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
5it [00:03,  1.42it/s]
[I 2023-08-05 17:06:52,516] Trial 0 finished with value: 0.9335562549173879 and parameters: {'learning_rate': 0.04338231150905962, 'n_estimators': 333, 'max_depth': 7, 'num_leaves': 549, 'subsample': 0.8740823916239701, 'colsample_bytree': 0.8174461653345658, 'min_child_samples': 17, 'min_split_gain': 0.31045733809815623, 'reg_alpha': 0.04933011958592881, 'reg_lambda': 0.738011712462248}. Best is trial 0 with value: 0.9335562549173879.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
5it [00:04,  1.18it/s]
[I 2023-08-05 17:06:56,766] Trial 1 finished with value: 0.8719381064778391 and parameters: {'learning_rate': 0.003372133369707105, 'n_estimators': 598, 'max_depth': 3, 'num_leaves': 122, 'subsample': 0.7344494189339308, 'colsample_bytree': 0.572141074

In [26]:
best_study = study.best_trial

FrozenTrial(number=2, state=TrialState.COMPLETE, values=[0.8249672174141096], datetime_start=datetime.datetime(2023, 8, 5, 17, 6, 56, 767749), datetime_complete=datetime.datetime(2023, 8, 5, 17, 7, 0, 64311), params={'learning_rate': 0.00010528656953854453, 'n_estimators': 465, 'max_depth': 4, 'num_leaves': 742, 'subsample': 0.5801452942400647, 'colsample_bytree': 0.6255791094574136, 'min_child_samples': 73, 'min_split_gain': 0.5803609086856948, 'reg_alpha': 0.38743183795216396, 'reg_lambda': 0.030194584054500706}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=True, low=0.0001, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'num_leaves': IntDistribution(high=1000, log=False, low=20, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'min_child_samples': IntDistribution(high=100, log=False, low=10, step=1), 'min_split_gain': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'reg_alpha': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'reg_lambda': FloatDistribution(high=1.0, log=False, low=0.0, step=None)}, trial_id=2, value=None)


{'learning_rate': 0.00010528656953854453,
 'n_estimators': 465,
 'max_depth': 4,
 'num_leaves': 742,
 'subsample': 0.5801452942400647,
 'colsample_bytree': 0.6255791094574136,
 'min_child_samples': 73,
 'min_split_gain': 0.5803609086856948,
 'reg_alpha': 0.38743183795216396,
 'reg_lambda': 0.030194584054500706}

In [34]:
best_params = study.best_params
# Create the final LGBM Classifier using the best hyperparameters
best_model = lgb.LGBMClassifier(**best_params, random_state=42, n_jobs=-1)

# Train the final model on the entire training dataset
best_model.fit(train, target_column)

# Make predictions on the test dataset
prediction = best_model.predict(test)

prediction

array([0, 0, 0, 0, 0])

In [35]:
y_proba = best_model.predict_proba(test)

In [37]:
# Calculate the probability for the positive class
positive_class_probability = y_proba[:, 1]

# Calculate the probability for the negative class
negative_class_probability = y_proba[:, 0]

# Print the probabilities
print("Positive class probability:", positive_class_probability)
print("Negative class probability:", negative_class_probability)

Positive class probability: [0.17692641 0.17692641 0.17692641 0.17692641 0.17692641]
Negative class probability: [0.82307359 0.82307359 0.82307359 0.82307359 0.82307359]


In [38]:
test_2 = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

In [39]:

submissions = pd.DataFrame(
    {"Id": test_2['Id'], "class_0": negative_class_probability, "class_1": 1 - negative_class_probability} 
)

In [40]:
submissions.to_csv("submission.csv", index=False)

In [41]:
pd.read_csv('/kaggle/working/submission.csv').head()

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.823074,0.176926
1,010ebe33f668,0.823074,0.176926
2,02fa521e1838,0.823074,0.176926
3,040e15f562a2,0.823074,0.176926
4,046e85c7cc7f,0.823074,0.176926
