<a href="https://colab.research.google.com/github/Tiabet/DACON_WebClick/blob/main/WebClick_v6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import random
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_parquet('/content/drive/MyDrive/data/train.parquet')
test =  pd.read_parquet('/content/drive/MyDrive/data/test.parquet')

In [3]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [4]:
test.drop('ID', axis = 1, inplace = True)
train.drop('ID', axis = 1, inplace = True)
train.head()

Unnamed: 0,Click,F01,F02,F03,F04,F05,F06,F07,F08,F09,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,1,NSLHFNS,AVKQTCL,DTZFPRW,114.0,ISVXFVA,1,PQZBVMG,LPYPUNA,IZYJZDA,...,NZGEZLW,GTISJWW,380.0,2.0,AXQFZWC,IRUDRFB,,TFJMLCZ,0.0,AURZYDY
1,0,VGIVWZQ,LSUSMVO,PQGWFJZ,26.0,NFRVLWS,43,IMPIGJT,MIGYEEG,NGODWIN,...,NZGEZLW,GTISJWW,466.0,1.0,DRVVDHZ,IRUDRFB,19.0,AUGTURV,0.0,LUZRMLU
2,0,JCDXFYU,PILDDJU,IAGJDOH,119.0,LFPUEOV,0,FFUTIRZ,OFKQGTY,BEZTQIO,...,VHXETCF,KHZNEZF,197.0,0.0,QMOULXS,IRUDRFB,8.0,ZVSTLNM,0.0,MHBRSQK
3,1,PSMFWTP,ZYAVJHP,,15.0,ATQPZSJ,26,ZDTZNSB,THBWWCD,LTETYBG,...,IVIRTPR,GTISJWW,8640.0,0.0,IZLJUJS,IRUDRFB,14.0,ZBSRLCQ,0.0,GAZBSSZ
4,0,SLCRICD,QPQWGXA,,13.0,CHZGJZR,20,PQZBVMG,MIGYEEG,LJBQPJW,...,NZGEZLW,WHSRKIM,41774.0,0.0,BHBIZCL,IRUDRFB,13.0,QHYLSBX,0.0,QTATWAY


In [5]:
# Step 1: Identify the categories in the train set that appear 100,000 times or more
category_thresholds = {}
for column in train.columns:
    if column != 'Click':
        value_counts = train[column].value_counts()

        categories_to_keep = value_counts[value_counts >= 10].index

        category_thresholds[column] = categories_to_keep

        train.loc[~train[column].isin(categories_to_keep), column] = pd.NA

# Step 2: Apply these categories to the test set
for column in test.columns:
    if column != 'Click' and column in category_thresholds:
        categories_to_keep = category_thresholds[column]

        # Replace values in test set that are not in the categories_to_keep with NaN
        test.loc[~test[column].isin(categories_to_keep), column] = pd.NA

In [6]:
import numpy as np

# Function to fill NaNs based on existing distribution
def fill_na_with_distribution(column):
    counts = column.dropna().value_counts(normalize=True)
    nans_to_fill = column.isna().sum()
    fill_values = np.random.choice(counts.index, size=nans_to_fill, p=counts.values)
    column.loc[column.isna()] = fill_values
    return column

# Apply the function to each column in DataFrame
train = train.apply(fill_na_with_distribution, axis=0)
test = test.apply(fill_na_with_distribution, axis=0)

In [7]:
train_x = train.drop('Click', axis = 1)
train_y = train['Click']

In [12]:
train_y = pd.DataFrame(train_y, columns = ['Click'])
train_y.head()

Unnamed: 0,Click
0,1
1,0
2,0
3,1
4,0


In [8]:
del train

In [9]:
# Label encode categorical columns
def label_encode_columns(train_df, test_df):
    le = LabelEncoder()
    for column in train_df.columns:
        if train_df[column].dtype == 'object' or isinstance(train_df[column].dtype, pd.CategoricalDtype):
            # Fit the LabelEncoder on the combined data to ensure consistency
            combined_data = pd.concat([train_df[column], test_df[column]], axis=0).astype(str)
            le.fit(combined_data)
            train_df[column] = le.transform(train_df[column].astype(str))
            test_df[column] = le.transform(test_df[column].astype(str))
    return train_df, test_df

# Apply label encoding to train_x and test_x
train_x, test_x = label_encode_columns(train_x, test)

In [14]:
train_x

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,44135,1565,5,114.0,43444,1.0,33683,24,6355,47639,...,9902,4,380.0,2.0,64175,0,7.0,5433,0.0,190
1,82375,23578,32,26.0,51232,43.0,18405,28,9474,20627,...,9902,4,466.0,1.0,14329,0,19.0,230,0.0,3031
2,35002,30739,18,119.0,43444,0.0,11330,33,858,82817,...,15150,7,197.0,0.0,64175,0,8.0,7436,0.0,3143
3,60698,52120,18,15.0,2896,26.0,54199,51,8350,20988,...,6275,4,8640.0,0.0,34892,0,14.0,7193,0.0,1567
4,75935,33242,18,13.0,37369,20.0,33683,28,8037,62701,...,9902,13,41774.0,0.0,4854,0,13.0,4610,0.0,4342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28605386,52076,39404,5,7.0,76144,1.0,37578,50,6355,86850,...,13501,4,43.0,3.0,60413,0,6.0,5433,0.0,6481
28605387,80815,29890,18,2.0,21119,26.0,13619,15,7399,82817,...,3647,4,4341.0,0.0,52547,0,6.0,304,0.0,3062
28605388,67747,43759,21,92.0,43444,3.0,21394,4,11462,70680,...,13069,12,25.0,2.0,11548,0,20.0,4130,0.0,6057
28605389,6418,10440,17,91.0,36176,0.0,45102,8,4484,60231,...,17205,4,1753.0,0.0,42899,0,6.0,3046,0.0,3866


In [17]:
train_y.index

MultiIndex([(0, 17941073),
            (0,  2759325),
            (0, 19499653),
            (0, 25864799),
            (0, 19896746),
            (0, 12507827),
            (0, 12185909),
            (0, 19312056),
            (0, 11372526),
            (0, 11114810),
            ...
            (1, 17044301),
            (1, 13616152),
            (1, 21378286),
            (1, 22142025),
            (1, 21366218),
            (1, 23347251),
            (1,  5993918),
            (1, 11148324),
            (1, 26676289),
            (1,  2360584)],
           names=['Click', None], length=11000000)

In [18]:
# Perform the sampling on train_y and preserve the original indices
train_y = train_y.groupby('Click').apply(lambda x: x.sample(min(len(x), 5500000)))

# Flatten the MultiIndex to use for selecting rows from train_x
sampled_indices = train_y.index.get_level_values(1)

# Select corresponding rows from train_x using the sampled indices
train_x = train_x.loc[sampled_indices]

# Ensure the indices match between train_x_sampled and train_y_sampled
train_y.index = train_x.index

In [29]:
train_y

Unnamed: 0,Click
17941073,0
2759325,0
19499653,0
25864799,0
19896746,0
...,...
23347251,1
5993918,1
11148324,1
26676289,1


In [None]:
# train_x.to_parquet('drive/MyDrive/data/train_v6.parquet', compression = 'gzip', engine = 'pyarrow', index = False)
# test_x.to_parquet('drive/MyDrive/data/test_v6.parquet', compression = 'gzip', engine = 'pyarrow', index = False)
# train_y.to_csv('drive/MyDrive/data/trainy_v6.csv', index = False)

In [23]:
!pip install flaml

Collecting flaml
  Downloading FLAML-2.1.2-py3-none-any.whl (296 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m286.7/296.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: flaml
Successfully installed flaml-2.1.2


In [24]:
from flaml import AutoML

In [25]:
# Initialize AutoML
automl = AutoML()

automl_settings = {
    "time_budget": 3600,  # Total time budget in seconds
    "metric": 'roc_auc',  # Evaluation metric
    "task": 'classification',  # Task type
    "log_file_name": 'automl.log',  # Log file
    #"estimator_list": ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'],  # List of estimators to use
    "estimator_list": ['lgbm'],
    "eval_method": "holdout",  # Use holdout validation method
    "split_ratio": 0.2,  # Ratio of data to be used as validation set
    "early_stop": 10
}


In [30]:
# Fit the model
automl.fit(X_train=train_x, y_train=train_y['Click'], **automl_settings)

[flaml.automl.logger: 06-02 13:21:09] {1680} INFO - task = classification
[flaml.automl.logger: 06-02 13:21:09] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-02 13:21:44] {1789} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 06-02 13:21:44] {1901} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 06-02 13:21:44] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-02 13:21:45] {2345} INFO - Estimated sufficient time budget=13754870s. Estimated necessary time budget=13755s.
[flaml.automl.logger: 06-02 13:21:45] {2392} INFO -  at 193.2s,	estimator lgbm's best error=0.3695,	best estimator lgbm's best error=0.3695
[flaml.automl.logger: 06-02 13:21:45] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-02 13:21:47] {2392} INFO -  at 194.7s,	estimator lgbm's best error=0.3695,	best estimator lgbm's best error=0.3695
[flaml.automl.logger: 06-02 13:21:47] {2219} INFO - iteration 2, current lea

KeyboardInterrupt: 

LGBMClassifier(colsample_bytree=0.24778382579757618,
               learning_rate=0.009464363295991879, max_bin=1023,
               min_child_samples=7, n_estimators=1, n_jobs=-1, num_leaves=92,
               reg_alpha=0.0034922118383222253, reg_lambda=0.017555446651738044,
               verbose=-1)

In [None]:
sample_submission = pd.read_csv('drive/MyDrive/data/sample_submission.csv')

# Predict probabilities on the test set
y_pred_test_proba = automl.predict_proba(test_x)[:, 1]

# Output the probabilities for test set
print(y_pred_test_proba)

In [None]:
sample_submission['Click'] = y_pred_test_proba
sample_submission

In [None]:
sample_submission.to_csv("drive/MyDrive/data/lgbm_prediction_v6-1.csv",index = False)