In [22]:
import pandas as pd
import numpy as np

data = pd.read_csv("./data/2_preprcossing_train.csv", index_col='id')
data.head(5)
X = data.iloc[:, :-1]
y = data['y']
y

id
0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
41183    1.0
41184    0.0
41185    0.0
41186    1.0
41187    0.0
Name: y, Length: 41188, dtype: float64

In [23]:
from sklearn.model_selection import train_test_split

random_seed = 42
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=random_seed, stratify=y)

In [24]:
from sklearn.impute import SimpleImputer

impute = SimpleImputer(strategy='most_frequent')
X_train = impute.fit_transform(X_train)
X_valid = impute.transform(X_valid)
X_train = pd.DataFrame(X_train, columns=X.columns)

In [82]:
def age_to_int(age, step=5, young_age=20, old_age=60, default=0, old_age_value=99):
    if age is None:
        return default
    if str(age).isnumeric():
        int_age = int(age)
        if int_age <= 0 or int_age >= 120:
            return default
        if int_age < young_age:
            return 1
        if int_age >= old_age:
            return old_age_value
        return (int_age - young_age + 1) // step + 1
    return default


X_train['age'] = X_train['age'].apply(lambda age: age_to_int(age))

In [25]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE

oversample = BorderlineSMOTE()
X_over, y_over = oversample.fit_resample(X_train, y_train)
y_over.value_counts()

0.0    27411
1.0    27411
Name: y, dtype: int64

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'min_samples_leaf': [150, 200, 300, 500, 1000, 1500, 2000, None]
}
rfc = RandomForestClassifier(
    bootstrap=True,
    max_samples=15000,
    max_depth=10,
    random_state=random_seed)
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(X_over, y_over)

from sklearn.metrics import f1_score

y_train_pred = grid_search.predict(X_over)
y_valid_pred = grid_search.predict(X_valid)

print(f1_score(y_over, y_train_pred))
print(f1_score(y_valid, y_valid_pred))

0.8926339641346103
0.5482534043812907




In [39]:
grid_search.best_params_

{'min_samples_leaf': 150}

In [26]:
from sklearn.ensemble import RandomForestClassifier

params = {'bootstrap': [True],
          'max_samples': [10000, 20000, 30000],
          'n_estimators': [100],
          'max_depth': [3,4,5,6,7,8,9, 10, 11, 12, None],
          'max_features': [10, 15, 20, 25],
          'min_samples_leaf': [50, 100, 150, 200, 300, 500, 1000]
          }

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV

rfc = RandomForestClassifier(random_state=random_seed)
random_search = HalvingRandomSearchCV(estimator=rfc, param_distributions=params,
                                      random_state=random_seed,
                                      scoring='f1')
random_search.fit(X_over, y_over)

4620 fits failed out of a total of 4620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
616 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda\lib\site-packages\sklearn\ensemble\_forest.py", line 379, in fit
    n_samples_bootstrap = _get_n_samples_bootstrap(
  File "D:\anaconda\lib\site-packages\sklearn\ensemble\_forest.py", line 111, in _get_n_samples_bootstrap
    raise ValueError(msg.format(n_samples, max_samples))
ValueError: `max_samples` must be in range 1 to 15 but got value 10000

----------------------------------------------------

HalvingRandomSearchCV(estimator=RandomForestClassifier(random_state=42),
                      param_distributions={'bootstrap': [True],
                                           'max_depth': [3, 4, 5, 6, 7, 8, 9,
                                                         10, 11, 12, None],
                                           'max_features': [10, 15, 20, 25],
                                           'max_samples': [10000, 20000, 30000],
                                           'min_samples_leaf': [50, 100, 150,
                                                                200, 300, 500,
                                                                1000],
                                           'n_estimators': [100]},
                      random_state=42, scoring='f1')

In [29]:
from sklearn.metrics import f1_score

y_train_pred = random_search.predict(X_over)
y_valid_pred = random_search.predict(X_valid)

print(f1_score(y_over, y_train_pred))
print(f1_score(y_valid, y_valid_pred))

0.8722417229640345
0.5224827586206896




In [13]:
random_search.best_params_

{'min_samples_leaf': 100,
 'max_leaf_nodes': 127,
 'max_depth': 7,
 'n_estimators': 16}

In [50]:
# param_grid = {
#     'min_samples_leaf': [100, 200, 300, 400, 500, 1000]
# }
from sklearn.experimental.enable_halving_search_cv import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier(max_depth=12, n_estimators=300, random_state=42)
gs = HalvingGridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_weighted', cv=5)
gs.fit(X_over, y_over)
print("best score: %.4f, and best params: %s" % (gs.best_score_, gs.best_params_))
y_pred = gs.predict(X_valid)
print("LightGBM f1_score: %.4f%%" % (f1_score(y_valid, y_pred) * 100))

best score: 0.8948, and best params: {'min_samples_leaf': 100}
LightGBM f1_score: 57.0847%


In [28]:
from lightgbm.sklearn import LGBMClassifier

lgb = LGBMClassifier(max_depth=10, n_estimators=500, random_state=42)
lgb.fit(X_over, y_over)

train_pred = lgb.predict(X_train)
test_pred = lgb.predict(X_valid)
print(f1_score(train_pred, y_train))
print(f1_score(y_valid, test_pred))


0.8613021916493361
0.5312810327706058


In [52]:
test = pd.read_csv("./data/2_preprcossing_test.csv", index_col='id')
test = test.iloc[:, :-1]
test_columns = test.columns
test_index = test.index
test = impute.transform(test)
test_y = gs.predict(test)
test_y = ['yes' if y == 1 else 'no' for y in test_y]
test_y = pd.DataFrame(test_y, index=test_index, columns=['Predicted_Results'])
test_y.to_csv("././data/方笠_result.csv")