In [22]:
!pip install --upgrade scikit-learn



In [17]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline

# Load the data
datafile_train = "/content/Property_train.csv"
datafile_test = "/content/Property_test_share.csv"
bd_train = pd.read_csv(datafile_train)
bd_test = pd.read_csv(datafile_test)

# Target column and feature engineering
target = 'Junk'
bd_train['data'] = 'train'
bd_test['data'] = 'test'
all_data = pd.concat([bd_train, bd_test], axis=0, sort=False)

# Handle missing numeric data and convert columns to numeric
cat_to_numeric = ['PriceIndex'+str(i) for i in range(1, 10)]
for col in cat_to_numeric:
    all_data[col] = pd.to_numeric(all_data[col], errors='coerce')
    all_data[col] = all_data[col].fillna(all_data.loc[all_data['data'] == 'train', col].median())

# Encoding categorical features
cat_cols = ['InteriorsStyle', 'ListDate', 'Material', 'Agency', 'AreaIncomeType',
            'EnvRating', 'PRIMEUNIT', 'Channel', 'PlotType', 'Architecture',
            'Region', 'SubModel', 'Facade', 'State', 'RegionType', 'Zip']

# Apply one-hot encoding for the categorical columns with more than 1000 occurrences
for col in cat_cols:
    k = all_data[col].value_counts()
    cats = k[k >= 1000].index[:-1]  # Keep categories with at least 1000 occurrences
    for cat in cats:
        name = col + '_' + str(cat)
        all_data[name] = (all_data[col] == cat).astype(int)

    # Drop original categorical column
    del all_data[col]

# Separate train and test datasets
x_train = all_data.drop([target, 'data'], axis=1)[all_data['data'] == 'train']
y_train = all_data[target][all_data['data'] == 'train']
x_test = all_data.drop([target, 'data'], axis=1)[all_data['data'] == 'test']

# Check for missing values and drop them if present
if y_train.isnull().sum() > 0:
    print(f"Dropping {y_train.isnull().sum()} rows with missing target values.")
    x_train = x_train[y_train.notna()]
    y_train = y_train[y_train.notna()]
else:
    print("No missing values in y_train.")

# Check if x_train or y_train is empty after filtering
if x_train.empty or y_train.empty:
    print("x_train or y_train is empty after filtering, adjust data preprocessing.")
else:
    # Split the dataset into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

    # Define the model: GradientBoostingClassifier
    pipeline = Pipeline([
        ('classifier', GradientBoostingClassifier(random_state=42))
    ])

    # Hyperparameter grid for RandomizedSearchCV
    params = {
        'classifier__n_estimators': [100, 200, 300],  # Number of boosting stages
        'classifier__learning_rate': [0.01, 0.1, 0.2],  # Step size shrinking
        'classifier__max_depth': [3, 5, 7],  # Depth of each tree
        'classifier__subsample': [0.8, 1.0],  # Fraction of samples used for fitting trees
        'classifier__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'classifier__max_features': ['auto', 'sqrt', 'log2']  # Features to consider for splits
    }

    # RandomizedSearchCV with Stratified K-Folds cross-validation
    rs = RandomizedSearchCV(pipeline, param_distributions=params, n_iter=10, scoring='roc_auc', cv=StratifiedKFold(n_splits=10), n_jobs=-1, verbose=20, random_state=42)

    # Fit the model using RandomizedSearchCV
    rs.fit(X_train, y_train)

    # Validate the model on the validation set
    y_pred_valid = rs.predict_proba(X_valid)[:, 1]
    roc_auc = roc_auc_score(y_valid, y_pred_valid)
    print(f"Validation ROC AUC: {roc_auc}")

    # Predict on the test set and prepare submission
    submissions = pd.DataFrame({'Junk': rs.predict_proba(x_test)[:, 1]})

    # Save the submission to a CSV file
    submissions.to_csv("project_5 submission.csv", index=False)
    print("submission file created:project_5 submission.csv")



No missing values in y_train.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Validation ROC AUC: 0.7656343898070033
submission file created:project_5 submission.csv
