In [None]:
from google.colab import drive
# Attach Google Drive to the Colab VM; the CSV files read later live under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV

# Read the train and test CSV files from the mounted Drive folder, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/samsung_blackbox_data/train.csv',
        '/content/drive/MyDrive/samsung_blackbox_data/test.csv',
    )
)

# "Version 1" preprocessing: drop x_6 and remove outliers
def remove_outliers(df, factor=1.5, columns=None):
    """Drop every row that holds an IQR outlier in at least one inspected column.

    For each inspected numeric column the quartiles Q1 and Q3 are computed and
    a value counts as an outlier when it lies below ``Q1 - factor * IQR`` or
    above ``Q3 + factor * IQR`` (``IQR = Q3 - Q1``). A row is removed as soon
    as any one of its inspected values is an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the lower/upper fences.
    columns : iterable of column labels, optional
        Restrict the check to these columns; non-numeric entries are ignored.
        When omitted, every numeric column of ``df`` is inspected.

    Returns
    -------
    pandas.DataFrame
        A copy of the retained rows, with all original columns and the
        original index labels.

    Raises
    ------
    KeyError
        If ``columns`` names a label that is not present in ``df``.
    """
    # Only numeric columns can be compared against quantile fences.
    candidates = df if columns is None else df[list(columns)]
    numeric_df = candidates.select_dtypes(include=[np.number])

    # Nothing to inspect: every row is kept.
    if numeric_df.shape[1] == 0:
        return df.copy()

    q1 = numeric_df.quantile(0.25)
    q3 = numeric_df.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr

    # Comparisons against NaN evaluate to False, so missing values never
    # mark a row as an outlier.
    has_outlier = ((numeric_df < lower_fence) | (numeric_df > upper_fence)).any(axis=1)

    # .copy() returns an independent frame so later column assignments on the
    # result do not raise pandas chained-assignment warnings.
    return df.loc[~has_outlier].copy()

# 1. Drop the `x_6` feature from both splits.
train_df_v1 = train_df.drop(columns=['x_6'])
test_df_v1 = test_df.drop(columns=['x_6'])

# 2. Remove outliers from the training rows only (every test row must be scored).
# NOTE(review): remove_outliers inspects all numeric columns, so a numeric target
# column is filtered as well and rows with extreme target values are dropped.
# Confirm this is intended, given that the submission ranks the highest predictions.
train_df_v1 = remove_outliers(train_df_v1)

# Features and target.
# Positional layout assumed here: first column = ID, last column = target.
X = train_df_v1.iloc[:, 1:-1]  # Features (x_0 to x_10 except x_6)
y = train_df_v1.iloc[:, -1]  # Target (label)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model
xgb_model = xgb.XGBRegressor(random_state=42)

# Define the parameter grid (7 parameters x 3 values each = 2187 combinations)
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Initialize GridSearchCV; scoring is negated MSE, so larger is better.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           cv=3, scoring='neg_mean_squared_error',
                           verbose=1, n_jobs=-1)

# Fit the grid search on the training split only; the validation split stays unseen.
grid_search.fit(X_train, y_train)

# Print the best parameters and score (sign flipped back to a plain MSE)
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {-grid_search.best_score_:.4f}")

# GridSearchCV refits the best configuration on X_train by default, so
# best_estimator_ is already trained and can be scored on the hold-out split.
best_model = grid_search.best_estimator_
val_pred = best_model.predict(X_val)
val_mse = float(np.mean((y_val.to_numpy() - val_pred) ** 2))
print(f"Validation MSE: {val_mse:.4f}")

# Train the model with the best parameters on the full training data
# (training + validation rows) before scoring the test set.
best_model.fit(X, y)

# Predict on the test data; select features by name so the column set and
# order match what the model was trained on.
X_test = test_df_v1.loc[:, X.columns]
y_pred = best_model.predict(X_test)

# Identify top 10% of predicted values (ties at the threshold are all included)
threshold = np.percentile(y_pred, 90)
top_10_percent_mask = y_pred >= threshold

# Create submission file with ID and y columns (y = 1 for top 10%, 0 otherwise)
submission_df = pd.DataFrame({
    'ID': test_df_v1.iloc[:, 0],  # Assuming first column is ID in test.csv
    'y': top_10_percent_mask.astype(int)  # 1 for top 10%, 0 otherwise
})

# Save the submission file
submission_df.to_csv('/content/drive/MyDrive/samsung_blackbox_data/xgboost_submission_after_data_preprocessing.csv', index=False)

print(f"Top 10% threshold: {threshold:.4f}")
print(f"Number of samples in top 10%: {int(top_10_percent_mask.sum())}")


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Best parameters found: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.7}
Best cross-validation score: 1.7038
Top 10% threshold: 85.8615
Number of samples in top 10%: 504
