In [None]:
pip install catboost


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Read the training set from the CSV file in the working directory
train_csv = 'train.csv'
df = pd.read_csv(train_csv)

In [None]:
# Report how many rows and columns were loaded
shape_message = f"The dataset contains {df.shape[0]} rows and {df.shape[1]} columns"
print(shape_message)


In [None]:
# Get an overview of the dataframe
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# List the columns that contain at least one missing value
null_counts = df.isnull().sum()
missing_cols = df.columns[null_counts > 0]
print(f"Columns with missing values: {missing_cols}")


In [None]:
# Impute missing values: backward fill first, then forward fill to cover any
# gaps left at the bottom of a column (where there is no later value to copy).
# bfill()/ffill() replace fillna(method=...), which pandas has deprecated.
df = df.bfill().ffill()


In [None]:
# Sanity check: the fill passes above must not have left any NaN behind
remaining_nulls = df.isnull().sum().sum()
assert remaining_nulls == 0, "There are still missing values in the dataframe"


In [None]:
# Split the column names by dtype: numeric ones versus everything else
numeric_columns = list(df.select_dtypes(include=np.number).columns)
categorical_columns = set(df.columns) - set(numeric_columns)
print(f"Numeric columns: {numeric_columns}")
print(f"Categorical columns: {categorical_columns}")

In [None]:
# Convert 'edjefe' and 'edjefa' to numeric: 'no' becomes 0, 'yes' becomes 1,
# and the whole column is then cast to float
yes_no_codes = {'no': 0, 'yes': 1}
for head_education_col in ('edjefe', 'edjefa'):
    df[head_education_col] = df[head_education_col].replace(yes_no_codes).astype(float)


In [None]:
# Rebuild 'dependency' as the square root of the 'SQBdependency' column
squared_dependency = df['SQBdependency']
df['dependency'] = np.sqrt(squared_dependency)

In [None]:
# Remove the two identifier columns from the frame
col_drops = ['Id', 'idhogar']
df = df.drop(columns=col_drops)


In [None]:
# Summary statistics for numeric columns
# Left as the cell's last expression so the notebook renders the table itself
# instead of the plain-text dump that print() produces.
df.describe()


In [None]:
# Count rows that are exact duplicates of an earlier row
duplicate_report = f"Number of duplicate rows: {df.duplicated().sum()}"
print(duplicate_report)


In [None]:
# Plotting the target variable to understand its distribution
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='Target', data=df)
plt.title('Target Variable Distribution')
plt.show()

In [None]:
# Number of rows per target class
df['Target'].value_counts()

In [None]:
# Checking correlation between numerical features
# numeric_only=True keeps this working on pandas >= 2.0, where corr() raises
# instead of silently skipping any non-numeric column still in the frame.
cor = df.corr(numeric_only=True)

# NOTE(review): annot=True writes a number in every cell; with a wide frame the
# figure becomes slow to draw and hard to read - consider plotting a subset.
plt.figure(figsize=(12,10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Create additional features

# Small constant added to every denominator so that a zero never divides.
eps = 1e-8


def _add_ratio_columns(specs):
    """Append one ratio column to ``df`` per (new_name, numerator, denominator) triple.

    Each new column is ``df[numerator] / (df[denominator] + eps)``; the triples
    are processed in the order given, so column order follows the list order.
    """
    for new_name, numerator, denominator in specs:
        df[new_name] = df[numerator] / (df[denominator] + eps)


# Calculate ratios and per-person features
_add_ratio_columns([
    ('rent_per_adult', 'v2a1', 'hogar_adul'),
    ('rent_per_person', 'v2a1', 'hhsize'),
])
df['overcrowding_room_and_bedroom'] = df['hacdor'].add(df['hacapo']) / 2
# Despite the name, this is the sum of the three appliance columns.
df['no_appliances'] = df['refrig'].add(df['computer']).add(df['television'])
_add_ratio_columns([
    ('r4h1_percent_in_male', 'r4h1', 'r4h3'),
    ('r4m1_percent_in_female', 'r4m1', 'r4m3'),
    ('r4h1_percent_in_total', 'r4h1', 'hhsize'),
    ('r4m1_percent_in_total', 'r4m1', 'hhsize'),
    ('r4t1_percent_in_total', 'r4t1', 'hhsize'),
])

# Calculate new features by subtracting & adding related features
df['adult'] = df['hogar_adul'].sub(df['hogar_mayor'])
df['dependency_count'] = df['hogar_nin'].add(df['hogar_mayor'])
_add_ratio_columns([
    # Overwrites the 'dependency' column that was derived from SQBdependency earlier.
    ('dependency', 'dependency_count', 'adult'),
    ('child_percent', 'hogar_nin', 'hogar_total'),
    ('elder_percent', 'hogar_mayor', 'hogar_total'),
    ('adult_percent', 'hogar_adul', 'hogar_total'),
])

# Calculate features by comparing different household attributes
_add_ratio_columns([
    ('rent_per_bedroom', 'v2a1', 'bedrooms'),
    ('adults_per_bedroom', 'adult', 'bedrooms'),
    ('child_per_bedroom', 'hogar_nin', 'bedrooms'),
    ('male_per_bedroom', 'r4h3', 'bedrooms'),
    ('female_per_bedroom', 'r4m3', 'bedrooms'),
    # Computed as hhsize / bedrooms, i.e. household members per bedroom.
    ('bedrooms_per_person_household', 'hhsize', 'bedrooms'),
    ('tablet_per_person_household', 'v18q1', 'hhsize'),
    ('phone_per_person_household', 'qmobilephone', 'hhsize'),
])
df['age_12_19'] = df['hogar_nin'].sub(df['r4t1'])
_add_ratio_columns([
    ('rent_per_room', 'v2a1', 'rooms'),
    ('bedroom_per_room', 'bedrooms', 'rooms'),
    ('elder_per_room', 'hogar_mayor', 'rooms'),
    ('adults_per_room', 'adult', 'rooms'),
    ('child_per_room', 'hogar_nin', 'rooms'),
    ('male_per_room', 'r4h3', 'rooms'),
    ('female_per_room', 'r4m3', 'rooms'),
    # Computed as hhsize / rooms, i.e. household members per room.
    ('room_per_person_household', 'hhsize', 'rooms'),
])

# Calculate ratios for years of schooling and schooling relative to age
_add_ratio_columns([
    ('escolari_age', 'escolari', 'age'),
    ('rez_esc_escolari', 'rez_esc', 'escolari'),
    ('rez_esc_r4t1', 'rez_esc', 'r4t1'),
    ('rez_esc_r4t2', 'rez_esc', 'r4t2'),
    ('rez_esc_r4t3', 'rez_esc', 'r4t3'),
    ('rez_esc_age', 'rez_esc', 'age'),
])

# The ID columns ('Id', 'idhogar') are not dropped here; that drop is left disabled.


In [None]:

# In[ ]:


from lightgbm import LGBMClassifier


# In[ ]:


from xgboost import XGBClassifier


from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization





# Separate the label column from the feature matrix
y = df['Target']
x_1 = df.drop(columns='Target')

# Shapes of the full frame and of the feature matrix
df.shape
x_1.shape


# In[ ]:




# In[ ]:


from catboost import CatBoostClassifier


# In[ ]:


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_validate,cross_val_score


# ## Optimizing XGBoost

# In[ ]:


def xgb_cv(n_estimators, max_depth, gamma, subsample, data, targets):
    """Return the mean 5-fold cross-validated accuracy of an XGBoost classifier.

    Parameters
    ----------
    n_estimators, max_depth : int or float
        Number of boosting rounds and maximum tree depth. Truncated to int
        here so the function can be fed raw optimiser proposals directly.
    gamma, subsample : float
        Forwarded unchanged to ``XGBClassifier``.
    data : array-like
        Feature matrix handed to ``cross_val_score``.
    targets : iterable
        One class label per row of ``data``.

    Returns
    -------
    float
        Mean of the five per-fold accuracy scores.
    """
    # Recent XGBoost releases reject class labels that are not exactly
    # 0..k-1, so labels are re-indexed in sorted order first. Accuracy does
    # not depend on how the classes are named, so the score is unaffected.
    label_index = {label: position for position, label in enumerate(sorted(set(targets)))}
    encoded_targets = [label_index[label] for label in targets]

    estimator = XGBClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        gamma=gamma,
        subsample=subsample,
        random_state=2,
    )
    # Metric named explicitly, as lgb_cv does, rather than relying on the
    # estimator's default score method.
    fold_scores = cross_val_score(estimator, data, encoded_targets, cv=5, scoring='accuracy')
    return fold_scores.mean()


# In[ ]:


def optimize_xgb(data, targets, init_points=10, n_iter=25):
    """Tune XGBoost hyper-parameters with Bayesian optimisation.

    Parameters
    ----------
    data, targets
        Feature matrix and labels, passed through to ``xgb_cv``.
    init_points : int, default 10
        Number of random exploration steps before the guided search.
    n_iter : int, default 25
        Number of guided optimisation steps.

    Returns
    -------
    dict
        ``optimizer.max``, the best result found. It is also printed, as
        before, so existing callers that ignore the return value are unaffected.
    """
    def xgb_crossval(n_estimators, max_depth, gamma, subsample):
        # The optimiser proposes floats; tree counts and depths must be whole numbers.
        return xgb_cv(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            gamma=gamma,
            subsample=subsample,
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=xgb_crossval,
        pbounds={
            "n_estimators": (100, 500),
            "max_depth": (6, 15),
            "gamma": (0, 10),
            "subsample": (0.8, 1.0),
        },
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    best = optimizer.max
    print("Final result:", best)
    return best


# In[ ]:


# Run the XGBoost search on the full feature matrix and labels
print("--- Optimizing XGBoost ---")
optimize_xgb(data=x_1, targets=y)


# ## Optimizing CatBoost

# In[ ]:


def cb_cv(n_estimators, depth, data, targets):
    """Return the mean 5-fold cross-validated accuracy of a CatBoost classifier.

    Parameters
    ----------
    n_estimators, depth : int or float
        Number of trees and tree depth. Truncated to int here, as lgb_cv
        does, so raw optimiser proposals can be passed in directly.
    data : array-like
        Feature matrix handed to ``cross_val_score``.
    targets : array-like
        One class label per row of ``data``.

    Returns
    -------
    float
        Mean of the five per-fold accuracy scores.
    """
    estimator = CatBoostClassifier(
        n_estimators=int(n_estimators),
        depth=int(depth),
        random_state=2,
        verbose=0,  # keep the per-iteration training log out of the output
    )
    # Metric named explicitly, as lgb_cv does, rather than relying on the
    # estimator's default score method.
    fold_scores = cross_val_score(estimator, data, targets, cv=5, scoring='accuracy')
    return fold_scores.mean()


# In[ ]:


def optimize_cb(data, targets, init_points=20, n_iter=25):
    """Tune CatBoost hyper-parameters with Bayesian optimisation.

    Parameters
    ----------
    data, targets
        Feature matrix and labels, passed through to ``cb_cv``.
    init_points : int, default 20
        Number of random exploration steps before the guided search.
    n_iter : int, default 25
        Number of guided optimisation steps.

    Returns
    -------
    dict
        ``optimizer.max``, the best result found. It is also printed, as
        before, so existing callers that ignore the return value are unaffected.
    """
    def cb_crossval(n_estimators, depth):
        # The optimiser proposes floats; tree counts and depths must be whole numbers.
        return cb_cv(
            n_estimators=int(n_estimators),
            depth=int(depth),
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=cb_crossval,
        pbounds={
            "n_estimators": (200, 600),
            "depth": (4, 16),
        },
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    best = optimizer.max
    print("Final result:", best)
    return best


# In[ ]:


# Run the CatBoost search on the full feature matrix and labels
print("--- Optimizing Catboost ---")
optimize_cb(data=x_1, targets=y)


In [None]:
def lgb_cv(n_estimators, num_leaves, min_child_samples, subsample, data, targets):
    """Return the mean 5-fold cross-validated accuracy of a LightGBM classifier.

    Parameters
    ----------
    n_estimators, num_leaves, min_child_samples : int or float
        Truncated to int before being passed to ``LGBMClassifier``.
    subsample : float
        Fraction of rows sampled for each tree.
    data : array-like
        Feature matrix handed to ``cross_val_score``.
    targets : array-like
        One class label per row of ``data``.

    Returns
    -------
    float
        Mean of the five per-fold accuracy scores.
    """
    estimator = LGBMClassifier(
        n_estimators=int(n_estimators),
        num_leaves=int(num_leaves),
        min_child_samples=int(min_child_samples),
        subsample=subsample,
        # LightGBM only applies row subsampling when subsample_freq > 0; with
        # the default of 0 the tuned `subsample` value would have no effect.
        subsample_freq=1,
        random_state=2,
    )
    fold_scores = cross_val_score(estimator, data, targets, cv=5, scoring='accuracy')
    return fold_scores.mean()

def optimize_lgb(data, targets, init_points=20, n_iter=25):
    """Tune LightGBM hyper-parameters with Bayesian optimisation.

    Parameters
    ----------
    data, targets
        Feature matrix and labels, passed through to ``lgb_cv``.
    init_points : int, default 20
        Number of random exploration steps before the guided search.
    n_iter : int, default 25
        Number of guided optimisation steps.

    Returns
    -------
    dict
        ``optimizer.max``, the best result found. It is also printed, as
        before, so existing callers that ignore the return value are unaffected.
    """
    def lgb_crossval(n_estimators, num_leaves, min_child_samples, subsample):
        # Values are forwarded as proposed; lgb_cv does the integer casts itself.
        return lgb_cv(
            n_estimators=n_estimators,
            num_leaves=num_leaves,
            min_child_samples=min_child_samples,
            subsample=subsample,
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=lgb_crossval,
        pbounds={
            "n_estimators": (100, 500),
            "num_leaves": (30, 80),
            "min_child_samples": (5, 30),
            "subsample": (0.6, 1.0),
        },
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    best = optimizer.max
    print("Final result:", best)
    return best

# Run the LightGBM search on the full feature matrix and labels
print("--- Optimizing LightGBM ---")
optimize_lgb(data=x_1, targets=y)

# Model Fitting
# Three gradient-boosting classifiers are stacked. No final_estimator is given,
# so StackingClassifier falls back to its own default meta-learner.
# NOTE(review): the hyper-parameters below are hard-coded; they are not taken
# from the results printed by the optimisation runs above.
Xg = XGBClassifier()
Lgbm = LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, num_leaves=200)
# verbose=0 matches cb_cv and keeps CatBoost's per-iteration log out of the
# cell output while the stack refits it.
Cataboost = CatBoostClassifier(depth=9, n_estimators=514, verbose=0)

estimators = [('Cataboost', Cataboost), ('Xg', Xg), ('Lgbm', Lgbm)]
clf = StackingClassifier(estimators=estimators)

# Hold out part of the data for evaluation. train_test_split and accuracy_score
# must be imported in the notebook's import cell for this section to run.
x_train, x_test, y_train, y_test = train_test_split(x_1, y, random_state=42)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
