In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

* The function below preprocesses the data, splits it into train and test sets, and finally trains a Random Forest ensemble classifier.

In [12]:
import pickle

In [18]:
def preprocess(df, df_labels, categorical_cols=('D_63', 'D_64'), random_state=42):
    """Clean one chunk of data, fit a random forest on it and print test metrics.

    Steps, in order:

    1. Drop every column of ``df`` in which 50% or more of the values are missing.
    2. Inner-join the result with ``df_labels`` on ``customer_ID`` and drop the
       ``customer_ID`` and ``S_2`` columns.
    3. Impute missing values: mean for ``int64`` columns, median for
       ``float64`` columns, most frequent value for ``object`` columns.
    4. Replace each column named in ``categorical_cols`` by the relative
       frequency of its values (frequency encoding).
    5. Split 80/20 into train and test, fit a ``RandomForestClassifier``
       (10 trees, then 5 more via ``warm_start``) and print the
       ``classification_report`` for the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows. Must contain ``customer_ID`` and ``S_2``.
    df_labels : pandas.DataFrame
        Must contain ``customer_ID`` and ``target``.
    categorical_cols : iterable of str, optional
        Columns to frequency-encode. A listed column that is no longer present
        (for example because it was dropped in step 1) is skipped instead of
        raising ``KeyError``.
    random_state : int, optional
        Seed used for both the train/test split and the forest, so repeated
        runs on the same data give the same model and report.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        The fitted model. Callers that ignore the return value behave as before.
    """
    # Fraction of missing values per column, as a percentage; keep < 50%.
    percentage_missing = df.isnull().mean() * 100
    columns_to_keep = percentage_missing[percentage_missing < 50].index
    df = df[columns_to_keep]

    df = pd.merge(df, df_labels, on='customer_ID')
    df = df.drop(columns=['customer_ID', 'S_2'])

    # NOTE(review): the imputation statistics and the frequency encoding below
    # are computed on the whole chunk before the train/test split, so test rows
    # influence the features the model is trained on. Kept as-is to preserve
    # the reported metrics; consider fitting these on the training split only.
    fill_values = {}
    fill_values.update(df.select_dtypes(include=['int64']).mean().to_dict())
    fill_values.update(df.select_dtypes(include=['float64']).median().to_dict())
    object_part = df.select_dtypes(include=['object'])
    # mode() of a frame with no columns/rows is empty, so .iloc[0] would raise.
    if not object_part.empty:
        fill_values.update(object_part.mode().iloc[0].to_dict())
    df = df.fillna(value=fill_values)

    # Frequency-encode the categorical columns so the forest sees numbers.
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].map(df[col].value_counts(normalize=True))

    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=['target']),
        df['target'],
        test_size=0.2,
        random_state=random_state,
    )

    model = RandomForestClassifier(
        n_estimators=10,
        warm_start=True,
        n_jobs=-1,
        random_state=random_state,
    )

    # Train the model with the initial number of trees
    model.fit(X_train, y_train)

    # Increase the number of trees and continue training: with warm_start=True
    # the second fit keeps the existing trees and only adds the 5 new ones.
    model.n_estimators += 5
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

    return model

In [19]:
import time

# Labels are read once and joined to every chunk inside preprocess().
df_labels = pd.read_csv('train_labels.csv')

# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted during the run.
start_time = time.perf_counter()
# Stream the training data in 100,000-row chunks so the whole file never has
# to be held in memory. Each preprocess() call fits and evaluates a separate
# model on that chunk alone and prints its classification report.
for chunk in pd.read_csv('train_data.csv', chunksize=100000):
    # Preprocess the chunk
    preprocess(chunk, df_labels)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

              precision    recall  f1-score   support

           0       0.94      0.94      0.94     14950
           1       0.83      0.83      0.83      5050

    accuracy                           0.91     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.91      0.91      0.91     20000

              precision    recall  f1-score   support

           0       0.94      0.95      0.94     14985
           1       0.84      0.82      0.83      5015

    accuracy                           0.91     20000
   macro avg       0.89      0.88      0.88     20000
weighted avg       0.91      0.91      0.91     20000

              precision    recall  f1-score   support

           0       0.94      0.96      0.95     15005
           1       0.86      0.81      0.84      4995

    accuracy                           0.92     20000
   macro avg       0.90      0.88      0.89     20000
weighted avg       0.92      0.92      0.92     20000

              preci