In [None]:


import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries; the
# download loop splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it is presumably time-limited and will stop working once it
# expires — confirm and regenerate before re-running.
DATA_SOURCE_MAPPING = 'playground-series-s4e10:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F84894%2F9709193%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241005%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241005T212300Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9755e8731818cf47b3a548758fa39cbf399df2e61493a088115c6640f18d683ef080ebf8a1153f9a42378787d7348c1ee894c3a962da97c7444b56e1faa34fe528fe9390807d7d615d80b6e1155edb4c815583689024f801039e8f9b063319f9a40e5c423064443e9a1474f317ba39af48e739038cf06edddfd5e6d2165d177fe993bb198f3b6d42a76b9fdabb3d2f6ea18d86a4c0fa1f4bdb95be1cc9f73cd70473fffd84955a775194a0629e502521d4edbff1ca0cecbbd563992daf3e0d7a155c079cd3dcc0b800f4c85a7bc96add1dffc676995a24dda1ad02bc3d47ee4a81fe3322dd89b0135839e8dd6140bf69c048662ad09e7f7c1639a7b13bdcff7c'

# Directory that each downloaded data source is extracted under.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible cells — confirm whether it is still needed.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file and unpack it
# under KAGGLE_INPUT_PATH/<directory>. A source that fails to download or
# extract is reported and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>"; split on the first colon
    # only so an extra ':' later in the entry cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length is optional; without a usable value the progress
            # bar is skipped and only the running byte count is shown.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    # Clamp so a body longer than advertised cannot overflow the bar.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch re-opens the file by
            # name and would otherwise risk reading a truncated archive.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the imported `tarfile`
                # module is not overwritten.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}: {e}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}: {e}')
        continue

print('Data source import complete.')


Downloading playground-series-s4e10, 1515882 bytes compressed
Downloaded and uncompressed: playground-series-s4e10
Data source import complete.


In [None]:


# Load the datasets
train_data = pd.read_csv("/kaggle/input/playground-series-s4e10/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e10/test.csv")

# Simple missing value handling: drop columns that are less than 80% populated.
# The decision is made on the training data only and then mirrored onto the test
# data, so both frames always keep the same feature columns.
min_non_null = len(train_data) * 0.8
sparse_columns = train_data.columns[train_data.count() < min_non_null]
train_data = train_data.drop(columns=sparse_columns)
test_data = test_data.drop(columns=sparse_columns, errors='ignore')

# Convert categorical variables to numeric using Label Encoding
label_encoders = {}
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

for column in categorical_columns:
    le = LabelEncoder()
    # Cast to str so missing values and mixed types are encoded as plain labels
    train_values = train_data[column].astype(str)
    test_values = test_data[column].astype(str)
    # Fit on the labels of both frames: a category that only occurs in the test
    # set would otherwise make transform() raise ValueError.
    le.fit(pd.concat([train_values, test_values], ignore_index=True))
    train_data[column] = le.transform(train_values)
    test_data[column] = le.transform(test_values)
    label_encoders[column] = le  # Save the encoder if we need to inverse transform later

# Define features (X) and target (y) for training
X = train_data.drop(columns=['id', 'loan_status'])  # Assuming 'loan_status' is the target variable
y = train_data['loan_status']

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Classifier with fewer trees and a max depth limit
model = RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
print("Accuracy on validation set:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

# Make predictions on the test dataset. Select the features by the training
# column list so the column set and order match what the model was fitted on;
# a missing column fails loudly with a KeyError.
test_predictions = model.predict(test_data[X.columns])


Accuracy on validation set: 0.9457754284252707

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     10087
           1       0.91      0.68      0.78      1642

    accuracy                           0.95     11729
   macro avg       0.93      0.83      0.87     11729
weighted avg       0.94      0.95      0.94     11729



In [None]:

# Pair every prediction with the id of the test row it belongs to; without the
# id column the saved file cannot be matched back to the test set.
assert len(test_predictions) == len(test_data), "one prediction per test row expected"
test_predictions_df = pd.DataFrame({
    'id': test_data['id'].to_numpy(),
    'predictions': test_predictions,
})

# Save the DataFrame to a CSV file
test_predictions_df.to_csv('test_predictions.csv', index=False)

print("Predictions saved to 'test_predictions.csv'.")


Predictions saved to 'test_predictions.csv'.
