Imports


In [1]:
import os
# The backend must be chosen before keras is imported below, so the original
# import order is preserved.
os.environ['KERAS_BACKEND'] = 'torch'
import optuna
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_regression, VarianceThreshold
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import legacy
from keras.optimizers import Adam
from keras.regularizers import l1, l2, l1_l2
from keras.callbacks import EarlyStopping
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

Data Read

In [16]:
# Load the raw train / test tables from disk
train_data = pd.read_csv(r"/Users/talalkhan/Documents/Data Sets/Second Challange/train.csv")
test_data = pd.read_csv(r"/Users/talalkhan/Documents/Data Sets/Second Challange/test.csv")

# Pull the identifier column out of the test features; it is kept aside in
# row_ids for the submission file.
row_ids = test_data.pop('row ID')

Drop sub_area

In [17]:
# Remove the 'sub_area' feature from both tables so they keep the same feature set
train_data = train_data.drop(columns=['sub_area'])
test_data = test_data.drop(columns=['sub_area'])

Split Numerical & Categorical Columns

In [18]:
# Partition the training columns by dtype: object columns are treated as
# categorical, everything else as numerical.
numerical_columns = train_data.select_dtypes(exclude=['object']).columns
categorical_columns = train_data.select_dtypes(include=['object']).columns


OneHotEncoding

In [None]:
# One-hot encode the categorical columns. drop='first' removes one level per
# feature so the dummy columns are not perfectly collinear.
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit on the training data only. The encoded frame reuses the source index so
# the column-wise concat aligns row-for-row even when the index is not a fresh
# 0..n-1 range, and it carries the encoder's own feature names instead of
# positional labels.
train_data_encoded = pd.DataFrame(
    encoder.fit_transform(train_data[categorical_columns]),
    columns=encoder.get_feature_names_out(),
    index=train_data.index,
)
train_data = pd.concat([train_data, train_data_encoded], axis=1)
train_data = train_data.drop(categorical_columns, axis=1)

# Apply the already-fitted encoder to the test set (no refitting)
test_data_encoded = pd.DataFrame(
    encoder.transform(test_data[categorical_columns]),
    columns=encoder.get_feature_names_out(),
    index=test_data.index,
)
test_data = pd.concat([test_data, test_data_encoded], axis=1)
test_data = test_data.drop(categorical_columns, axis=1)


Label Encode

In [19]:
# Integer-encode every categorical column. The encoder is refitted on each
# training column and the learned mapping is then reused for the matching
# test column.
label_encoder = LabelEncoder()
for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])


Prep the data

In [20]:
# Target vector and feature matrix for training
y = train_data['price_doc']
X = train_data.drop('price_doc', axis=1)

# Make every feature name a string
X.columns = X.columns.astype(str)

# The test table holds features only, so it is used as-is
X_test = test_data


Scaling / Normalization

In [21]:
# Standardise features: statistics are learned from the training matrix only
# and then applied to both the training and the test matrix.
scaler = StandardScaler().fit(X)
X, X_test = scaler.transform(X), scaler.transform(X_test)


In [22]:
# Min-max scale features: the per-feature range is learned from the training
# matrix only and then applied to both the training and the test matrix.
scaler = MinMaxScaler().fit(X)
X, X_test = scaler.transform(X), scaler.transform(X_test)

Feature Select

In [None]:
# Variance filter: drop features whose training variance does not exceed 0.005.
# The mask is learned on the training matrix and reused on the test matrix.
variance_selector = VarianceThreshold(threshold=0.005)
variance_selector.fit(X)
X = variance_selector.transform(X)
X_test = variance_selector.transform(X_test)


In [None]:
# Apply PCA
# A float n_components between 0 and 1 asks scikit-learn to keep as many
# components as are needed to explain that fraction of the variance (here 80%).
# The projection is fitted on the training matrix only; the test matrix is
# mapped with the same fitted components.
pca = PCA(n_components=0.8)
X = pca.fit_transform(X)
X_test = pca.transform(X_test)


In [None]:
# Add an intercept column to X and X_test. has_constant='add' forces the column
# onto both matrices; the default would skip it if a matrix already contained a
# constant column, leaving train and test with different widths.
X = sm.add_constant(X, has_constant='add')
X_test = sm.add_constant(X_test, has_constant='add')

# Fit the ordinary least squares (OLS) model
model = sm.OLS(y, X)
results = model.fit()

# Get the p-values for each column (intercept included)
p_values = results.pvalues

# Keep columns with p-value below 0.05 (or any desired threshold). A positional
# boolean mask is used instead of labels because X is a plain ndarray once any
# scaler / selector cell has run, and an ndarray cannot be indexed by name.
selected_features = np.asarray(p_values) < 0.05

# Filter X and X_test with the same mask so their columns stay aligned
X = X.loc[:, selected_features] if isinstance(X, pd.DataFrame) else X[:, selected_features]
X_test = (
    X_test.loc[:, selected_features]
    if isinstance(X_test, pd.DataFrame)
    else X_test[:, selected_features]
)



In [None]:
# Forward sequential selection: greedily add features, scoring a Lasso model
# with 5-fold cross-validated negative MSE, until 10 features are chosen.
model_lasso = Lasso(alpha=0.1)
sfs = SequentialFeatureSelector(
    model_lasso,
    n_features_to_select=10,
    direction='forward',
    scoring='neg_mean_squared_error',
    cv=5,
)
sfs.fit(X, y)
X_forward = sfs.transform(X)
X_test_forward = sfs.transform(X_test)

# Label the surviving columns by their 1-based position in the input matrix.
# NOTE(review): the 'PC_' prefix presumably assumes the PCA cell ran first - confirm.
selected_features_forward = [f'PC_{idx+1}' for idx in sfs.get_support(indices=True)]
X = pd.DataFrame(X_forward, columns=selected_features_forward)
X_test = pd.DataFrame(X_test_forward, columns=selected_features_forward)

In [None]:
# Univariate filter: keep the 10 features with the highest f_regression score
# against the target. The scores come from the training data only.
selector = SelectKBest(score_func=f_regression, k=10)
selector.fit(X, y)
X_forward = selector.transform(X)
X_test_forward = selector.transform(X_test)

# Label the surviving columns by their 1-based position in the input matrix.
# NOTE(review): the 'PC_' prefix presumably assumes the PCA cell ran first - confirm.
selected_features_forward = [f'PC_{idx+1}' for idx in selector.get_support(indices=True)]

X = pd.DataFrame(X_forward, columns=selected_features_forward)
X_test = pd.DataFrame(X_test_forward, columns=selected_features_forward)

TrainTestSplit

In [23]:
# Hold out 10% of the training rows for validation; the fixed random_state
# keeps the split reproducible.
split = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = split

Optuna for CatBoost

In [None]:
def objective(trial):
    """Optuna objective for CatBoost: train one candidate and return its validation RMSE.

    Reads the notebook-level X_train, y_train, X_val and y_val.

    Args:
        trial: optuna trial used to sample the hyperparameters.

    Returns:
        Root mean squared error of the fitted model on the validation split.
    """
    # Hyperparameters to be tuned
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        # 'Iter' makes od_wait act as the early-stopping patience (iterations
        # allowed without improvement on the eval set).
        'od_type': 'Iter',
        'od_wait': trial.suggest_int('od_wait', 10, 50)
    }

    # Create and fit the model. early_stopping_rounds is deliberately not passed
    # to fit(): it configures the same overfitting detector with a fixed
    # patience, which would make the tuned od_wait meaningless.
    model = CatBoostRegressor(**params, verbose=0)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    # Predict and calculate RMSE
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))

    return rmse


Optuna for XGBoost

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: train one candidate and return its validation RMSE.

    Reads the notebook-level X_train, y_train, X_val and y_val. The early
    stopping patience is sampled as part of the search space.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1500),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0, 5),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 5),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 5),
        early_stopping_rounds=trial.suggest_int('early_stopping_rounds', 10, 100),
    )

    # Fit against the validation split so early stopping has something to monitor
    regressor = xgb.XGBRegressor(**search_space)
    regressor.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

    # Score the candidate on the validation split
    val_mse = mean_squared_error(y_val, regressor.predict(X_val))
    return np.sqrt(val_mse)


Optuna for GradientBoost

In [None]:
def objective(trial):
    """Optuna objective for GradientBoostingRegressor: return the validation RMSE.

    Reads the notebook-level X_train, y_train, X_val and y_val.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 14),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 14),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    )

    # Train the candidate on the training split
    regressor = GradientBoostingRegressor(**search_space)
    regressor.fit(X_train, y_train)

    # Score it on the validation split
    val_mse = mean_squared_error(y_val, regressor.predict(X_val))
    return np.sqrt(val_mse)


Optuna for LGB

In [None]:
def objective(trial):
    """Optuna objective for LightGBM: train one candidate and return its validation RMSE.

    Reads the notebook-level X_train, y_train, X_val and y_val.

    Args:
        trial: optuna trial used to sample the hyperparameters.

    Returns:
        Root mean squared error of the fitted model on the validation split.
    """
    # Defining the hyperparameters to tune
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM only performs row bagging when the bagging frequency is
        # non-zero; without this the tuned 'subsample' has no effect.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 10, 70)
    }

    # Training the model; the eval set is what early stopping monitors
    model_CV = lgb.LGBMRegressor(**params)
    model_CV.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    # Predicting and calculating RMSE
    preds = model_CV.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))

    return rmse


Optuna for RandomForestRegressor

In [None]:
def objective(trial):
    """Optuna objective for RandomForestRegressor: return the validation RMSE.

    Reads the notebook-level X_train, y_train, X_val and y_val.

    Args:
        trial: optuna trial used to sample the hyperparameters.

    Returns:
        Root mean squared error of the fitted forest on the validation split.
    """
    # Define the hyperparameters to be tuned
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
        # 1.0 (use every feature) replaces 'auto', which meant the same thing
        # for regressors but was removed in scikit-learn 1.3 and now raises.
        'max_features': trial.suggest_categorical('max_features', [1.0, 'sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    # Create and train the RandomForestRegressor
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    # Predict and calculate RMSE
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))

    return rmse


Start Optuna Study

In [None]:
# Minimise the value returned by `objective` (a validation RMSE) over 30 trials.
# NOTE(review): several cells above each define a function named `objective`;
# the one whose cell was executed most recently is the one tuned here.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Get best parameters
# best_params holds only the values sampled through `trial.suggest_*`
best_params = study.best_params
print('Best parameters:', best_params)


Models

In [None]:
# Build a fully connected regression network: 256-128-100-80-60-40 hidden
# units and a single linear output. Each hidden layer carries a kernel
# regularizer (L2 on the first layer, L1 on the rest).
model = Sequential()
# input layer
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.01)))
# hidden layers
model.add(Dense(128, activation='relu', kernel_regularizer=l1(0.001)))
model.add(Dense(100, activation='relu', kernel_regularizer=l1(0.001)))
model.add(Dense(80, activation='sigmoid', kernel_regularizer=l1(0.001)))
model.add(Dense(60, activation='relu', kernel_regularizer=l1(0.001)))
model.add(Dense(40, activation='relu', kernel_regularizer=l1(0.001)))
# output layer
model.add(Dense(1, activation='linear'))

# Use the current Adam implementation: the torch backend selected in the import
# cell requires Keras 3, where the keras.optimizers.legacy classes raise on
# construction.
opt = Adam(learning_rate=0.001)

# Keras 3 expects `metrics` as a list / tuple / dict rather than a bare string
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mse'])

# apply early stopping: stop after 5 epochs without val_loss improvement and
# roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

GradientBoost

In [24]:
# Hand-picked GradientBoostingRegressor configuration
gb_settings = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.2,
    min_samples_split=4,
    min_samples_leaf=3,
    max_features='sqrt',
)
model = GradientBoostingRegressor(**gb_settings)

# Fit on the training split
model.fit(X_train, y_train)

XGB

In [None]:
# XGBoost regressor built from the hyperparameters found by the Optuna study
# (`best_params` must come from the XGBoost objective).
# Manual configuration tried earlier, kept for reference:
#   objective='reg:squarederror', n_estimators=300, learning_rate=0.1,
#   max_depth=10, subsample=0.6, colsample_bytree=0.9, early_stopping_rounds=50
model = xgb.XGBRegressor(**best_params)

# Fit on the training split while evaluating on the validation split
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)


CatBoost

In [None]:
# CatBoost regressor with a hand-picked configuration: RMSE is used both as
# the loss and as the evaluation metric, with an early-stopping patience of 50.
# Alternative: CatBoostRegressor(**best_params, verbose=0) after running the
# CatBoost Optuna study.
catboost_settings = dict(
    iterations=2_000,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=3,
    loss_function='RMSE',
    eval_metric='RMSE',
    early_stopping_rounds=50,
)
model = CatBoostRegressor(**catboost_settings)

# verbose=True logs training progress; use_best_model=True keeps the
# iteration that scored best on the eval set.
model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=True, use_best_model=True)

LGB

In [None]:
# LightGBM regressor with library-default hyperparameters.
# Manual configuration tried earlier, kept for reference only:
# model = lgb.LGBMRegressor(
#     num_leaves=31,
#     max_depth=-5,
#     learning_rate=0.1,
#     n_estimators=200,
#     min_data_in_leaf=20,
#     feature_fraction=0.8,
#     bagging_fraction=0.8,
#     bagging_freq=5,
#     lambda_l1=0.3,
#     lambda_l2=0.3
# )
model = lgb.LGBMRegressor()

# Fit on the training split while evaluating on the validation split
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

In [11]:
# Random forest with a hand-picked configuration (max_depth left unset)
rf_settings = dict(
    n_estimators=1500,
    min_samples_split=4,
    min_samples_leaf=3,
    max_leaf_nodes=None,
    max_features='sqrt',
    min_impurity_decrease=0.01,
    ccp_alpha=0,
    oob_score=True,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestRegressor(**rf_settings)

# Fit on the training split
model.fit(X_train, y_train)

Model Fit for NN

In [None]:
# fit for nn
# Validate on the held-out split: EarlyStopping monitors val_loss, so feeding it
# the training data again would make it track training loss and never detect
# overfitting.
model.fit(X_train, y_train, epochs=100, batch_size=512, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=1)

Test on validation set

In [25]:
# Make predictions on the validation set
val_preds = model.predict(X_val)

# Evaluate the model. The RMSE is taken as the square root of the MSE, the same
# way the Optuna objectives compute it; the `squared=False` shortcut is
# deprecated and removed in newer scikit-learn releases.
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f'Validation RMSE: {val_rmse}')

#lowest = 12570594.985033065
#RFlowest = 12552307.318903735 | 1300 iter

Validation RMSE: 12931559.271809414


Final Predict

In [26]:
# Predict on the test features with whichever `model` was fitted most recently
test_preds = model.predict(X_test)


Flatten Values if NN used

In [None]:
# Collapse the predictions to a 1-D array before building the submission.
# NOTE(review): presumably needed because the network's single-unit output
# layer yields shape (n, 1) - confirm; skip this cell for the other models.
test_preds = test_preds.flatten()

Make the .csv file

In [27]:
# Pair each test row identifier with its predicted price
submission_columns = {'row ID': row_ids, 'price_doc': test_preds}
output = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index
output.to_csv('submission143_25253.csv', index=False)
        