In [79]:
import pandas as pd
import  numpy as np

data = pd.read_csv("./data/2_preprcossing_train.csv", index_col='id')
data.head(5)
X = data.iloc[:, :-1]
y = data['y']
y

id
0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
41183    1.0
41184    0.0
41185    0.0
41186    1.0
41187    0.0
Name: y, Length: 41188, dtype: float64

In [80]:
from sklearn.model_selection import train_test_split
random_seed = 42
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=random_seed, stratify=y)

In [81]:
from sklearn.impute import SimpleImputer
impute = SimpleImputer(strategy='most_frequent')
X_train = impute.fit_transform(X_train)
X_valid = impute.transform(X_valid)
X_train = pd.DataFrame(X_train, columns=X.columns)

In [82]:
def age_to_int(age, step=5, young_age=20, old_age=60, default=0, old_age_value=99):
    if age is None:
        return default
    if str(age).isnumeric():
        int_age = int(age)
        if int_age <= 0 or int_age >= 120:
            return default
        if int_age < young_age:
            return 1
        if int_age >= old_age:
            return old_age_value
        return (int_age - young_age + 1) // step + 1
    return default
X_train['age'] = X_train['age'].apply(lambda age: age_to_int(age))

In [83]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
oversample = BorderlineSMOTE()
X_over, y_over = oversample.fit_resample(X_train, y_train)
y_over.value_counts()

0.0    27411
1.0    27411
Name: y, dtype: int64

In [84]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=12, n_estimators=200, random_state=42)
model.fit(X_over, y_over)

y_train_pred = model.predict(X_over)
y_valid_pred = model.predict(X_valid)

from sklearn.metrics import f1_score
print(f1_score(y_over, y_train_pred))
print(f1_score(y_valid, y_valid_pred))

0.9512940919515125
0.5732038834951456


In [85]:
from sklearn.svm import SVC
svc = SVC(C=0.1, random_state=42)
svc.fit(X_over, y_over)

y_train_pred = model.predict(X_over)
y_valid_pred = model.predict(X_valid)

print(f1_score(y_over, y_train_pred))
print(f1_score(y_valid, y_valid_pred))

0.9512940919515125
0.5732038834951456


In [50]:
# param_grid = {
#     'min_samples_leaf': [100, 200, 300, 400, 500, 1000]
# }
from sklearn.experimental.enable_halving_search_cv import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV
model = RandomForestClassifier(max_depth=12, n_estimators=300, random_state=42)
gs = HalvingGridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_weighted', cv=5)
gs.fit(X_over, y_over)
print("best score: %.4f, and best params: %s" % (gs.best_score_, gs.best_params_))
y_pred = gs.predict(X_valid)
print("LightGBM f1_score: %.4f%%" % (f1_score(y_valid, y_pred) * 100))

best score: 0.8948, and best params: {'min_samples_leaf': 100}
LightGBM f1_score: 57.0847%


In [51]:
from lightgbm.sklearn import LGBMClassifier
lgb =LGBMClassifier(max_depth=10, n_estimators=500, random_state=42)
lgb.fit(X_over, y_over)

train_pred = lgb.predict(X_train)
test_pred = lgb.predict(X_valid)
print(f1_score(train_pred,y_train))
print(f1_score(y_valid, test_pred))


0.8558994197292069
0.512639029322548


In [52]:
test = pd.read_csv("./data/2_preprcossing_test.csv", index_col='id')
test = test.iloc[:, :-1]
test_columns = test.columns
test_index = test.index
test = impute.transform(test)
test_y = gs.predict(test)
test_y = ['yes' if y==1 else 'no' for y in test_y]
test_y = pd.DataFrame(test_y, index=test_index, columns=['Predicted_Results'])
test_y.to_csv("././data/方笠_result.csv")