In [15]:
import dask.dataframe as dd
import pandas as pd

# Remove pandas' display limits so every column and every row is shown
# whenever a pandas object is rendered (None means "no limit").
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Open the label file and the feature file as dask DataFrames.
df_labels = dd.read_csv('train_labels.csv')
df = dd.read_csv('train_data.csv')

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

* The function below preprocesses the data, splits it into training and test sets, and finally trains a Random Forest ensemble classifier.

In [17]:
def preprocess(df, df_labels, missing_threshold=50, test_size=0.2,
               n_estimators=10, random_state=42):
    """Clean the features, attach the labels, then train and evaluate a random forest.

    Pipeline:
      1. Keep only the columns of ``df`` whose percentage of missing values
         is strictly below ``missing_threshold``.
      2. Merge the result with ``df_labels`` on ``customer_ID`` and drop the
         ``customer_ID`` and ``S_2`` columns.
      3. Impute missing values: mean for ``int64`` columns, approximate
         median for ``float64`` columns, most frequent value for
         ``string[pyarrow]`` columns.
      4. Replace every string column by the relative frequency of each of
         its values (frequency encoding).
      5. Materialise the data in memory, split it into train/test sets, fit a
         ``RandomForestClassifier`` and print a classification report for the
         test set.

    Parameters
    ----------
    df : dask DataFrame
        Feature table; must contain ``customer_ID`` and ``S_2``.
    df_labels : dask DataFrame
        Label table; must contain ``customer_ID`` and ``target``.
    missing_threshold : float, default 50
        Columns with this percentage of missing values or more are dropped.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    n_estimators : int, default 10
        Number of trees in the forest.
    random_state : int, default 42
        Seed used for both the train/test split and the forest, so that
        repeated runs give the same report.

    Returns
    -------
    RandomForestClassifier
        The fitted model (earlier versions of this function returned None).

    Notes
    -----
    NOTE(review): the imputation values and the category frequencies are
    computed on the full table before the split, so test rows influence
    them. The split is also row-wise; if ``customer_ID`` repeats across rows,
    the same customer can land in both sets -- confirm this is acceptable.
    """
    # dask is already a dependency of this notebook (via dask.dataframe);
    # dask.compute evaluates several lazy results in one pass over the data.
    import dask

    # Null counts and row count share a single read of the input.
    missing_values, n_rows = dask.compute(df.isnull().sum(), df.shape[0])
    percentage_missing = (missing_values / n_rows) * 100
    columns_to_keep = percentage_missing[percentage_missing < missing_threshold].index

    df = df[columns_to_keep]

    df = dd.merge(df, df_labels, on='customer_ID')
    df = df.drop(columns=['customer_ID', 'S_2'])

    int_frame = df.select_dtypes(include=['int64'])
    float_frame = df.select_dtypes(include=['float64'])
    # assumes dask exposes text columns as 'string[pyarrow]' -- TODO confirm
    # for the dask configuration in use.
    string_frame = df.select_dtypes(include=['string[pyarrow]'])
    string_columns = list(string_frame.columns)

    # Evaluate all imputation statistics together so the merge above is
    # executed once instead of once per statistic.
    lazy_stats = [int_frame.mean(), float_frame.median_approximate()]
    if string_columns:
        lazy_stats.append(string_frame.mode())
    computed_stats = dask.compute(*lazy_stats)

    fill_values = {}
    fill_values.update(computed_stats[0].to_dict())  # int64   -> mean
    fill_values.update(computed_stats[1].to_dict())  # float64 -> approx. median
    if string_columns:
        # mode() can return several rows on ties; the first row is used.
        fill_values.update(computed_stats[2].iloc[0].to_dict())

    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

    # Frequency-encode every string column rather than a hard-coded pair, so
    # the step neither fails when a column was filtered out above nor leaves
    # text columns behind for the classifier to choke on.
    for col in string_columns:
        df[col] = df[col].map(df[col].value_counts(normalize=True))

    # Materialise once: features and target come from the same in-memory
    # frame, which guarantees they are row-aligned and avoids running the
    # whole graph a second time just to fetch the target column.
    data = df.compute()
    y = data.pop('target')
    X = data

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1,
                                   random_state=random_state)

    # Train the model with the initial number of trees
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

    return model

In [18]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if the system
# time is adjusted during this long-running call.
start_time = time.perf_counter()
preprocess(df, df_labels)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

              precision    recall  f1-score   support

           0       0.90      0.94      0.92    831388
           1       0.79      0.69      0.74    274903

    accuracy                           0.88   1106291
   macro avg       0.84      0.82      0.83   1106291
weighted avg       0.87      0.88      0.87   1106291

Elapsed time: 2223.3998806476593 seconds
