In [1]:
# My First Kaggle Notebook -- 
# Started Kaggle now but not new to Data Science

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix
from sklearn.impute import KNNImputer

In [4]:
import lightgbm as lgb
from tqdm import tqdm

In [5]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

In [6]:
train.rename(columns = {"Class" : "target"}, inplace = True)
train.head()

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,target
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.2299,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.05481,3.396778,102.15198,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1


In [7]:
one_hot_encoded = pd.get_dummies(train[['EJ']])
print(one_hot_encoded.head())

   EJ_A  EJ_B
0     0     1
1     1     0
2     0     1
3     0     1
4     0     1


In [8]:
for column in one_hot_encoded.columns:
   train.insert(len(train.columns), column , one_hot_encoded.loc[:, column])

In [9]:
train.drop(['EJ', 'Id'], axis=1, inplace=True)
target_column = train["target"]
train.drop(["target"], inplace=True, axis=1) # Remove target column for training the data


In [10]:
imputer = KNNImputer(n_neighbors=2)
imputed_to_fit_on_test = imputer.fit(train)
imputed_data = imputer.fit_transform(train)

In [11]:
train = pd.DataFrame(imputed_data, columns = train.columns)

In [12]:
scaled_to_fit_on_test =  MinMaxScaler().fit(train)

def standardizationMinMax(dataset) :
    scaler = MinMaxScaler()
    return scaler.fit_transform(dataset)

In [13]:
train = pd.DataFrame(standardizationMinMax(train) , columns = train.columns)

## Cross-Validation 
## GridSearchCV

In [14]:
param_distributions = {
    "learning_rate": np.arange(0.01, 0.1, 0.01),
    "num_leaves": np.arange(31, 127, 7),
    "max_depth": np.arange(3, 10, 1),
    "min_child_samples": np.arange(20, 100, 10),
    "subsample": np.arange(0.6, 1.0, 0.05),
    "colsample_bytree": np.arange(0.6, 1.0, 0.05),
}

In [15]:
grid_search  = RandomizedSearchCV(lgb.LGBMClassifier(), param_distributions, n_iter=100 , scoring="neg_log_loss", cv=5, verbose=1)

In [16]:
grid_search.fit(train, target_column)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [17]:
#import sklearn
#sklearn.metrics.get_scorer_names()

In [18]:
best_params = grid_search.best_params_
best_lgb_model = grid_search.best_estimator_

In [19]:
best_params

{'subsample': 0.8500000000000002,
 'num_leaves': 73,
 'min_child_samples': 30,
 'max_depth': 6,
 'learning_rate': 0.06999999999999999,
 'colsample_bytree': 0.8000000000000002}

In [20]:
best_lgb_model

In [21]:
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

In [22]:
one_hot_encoded = pd.get_dummies(test[['EJ']])
print(one_hot_encoded.head())

   EJ_A
0     1
1     1
2     1
3     1
4     1


In [23]:
test.drop(['EJ', 'Id'], axis=1, inplace=True)

In [24]:
for column in one_hot_encoded.columns:
   test.insert(len(test.columns), column , one_hot_encoded.loc[:, column])

In [25]:
if "EJ_B" not in test.columns:
    test["EJ_B"] = 0
if "EJ_A" not in test.columns:
    test["EJ_A"] = 1

In [26]:
test = pd.DataFrame(imputed_to_fit_on_test.transform(test), columns=test.columns)  

In [27]:
test = pd.DataFrame(scaled_to_fit_on_test.transform(test), columns=test.columns)

In [28]:
predictions = best_lgb_model.predict(test)
predictions

array([0, 0, 0, 0, 0])

In [29]:
y_proba = best_lgb_model.predict_proba(test)
y_proba

array([[0.70242742, 0.29757258],
       [0.70242742, 0.29757258],
       [0.70242742, 0.29757258],
       [0.70242742, 0.29757258],
       [0.70242742, 0.29757258]])

In [30]:
# Calculate the probability for the positive class
positive_class_probability = y_proba[:, 1]

# Calculate the probability for the negative class
negative_class_probability = y_proba[:, 0]

# Print the probabilities
print("Positive class probability:", positive_class_probability)
print("Negative class probability:", negative_class_probability)

Positive class probability: [0.29757258 0.29757258 0.29757258 0.29757258 0.29757258]
Negative class probability: [0.70242742 0.70242742 0.70242742 0.70242742 0.70242742]


In [31]:
test_2 = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

In [32]:

submissions = pd.DataFrame(
    {"Id": test_2['Id'], "class_0": negative_class_probability, "class_1": 1 - negative_class_probability} 
)

In [33]:
submissions.to_csv("submission.csv", index=False)

In [34]:
pd.read_csv('/kaggle/working/submission.csv').head()

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.702427,0.297573
1,010ebe33f668,0.702427,0.297573
2,02fa521e1838,0.702427,0.297573
3,040e15f562a2,0.702427,0.297573
4,046e85c7cc7f,0.702427,0.297573
