In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('train_cleaned.csv')

# Identify the categorical columns
categorical_columns = ['job', 'marital', 'education', 'default', 
                       'housing', 'loan', 'contact', 'month', 'poutcome', 'y']

# Apply LabelEncoder automatically to each categorical column
label_encoder = LabelEncoder()
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Identify the feature columns (X) and the target column (y)
X = df.drop('y', axis=1)  # Feature columns
y = df['y']  # Target variable

# Split the data into training and testing sets (80% - 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print detailed evaluation metrics (precision, recall, f1-score)
print(classification_report(y_test, y_pred))

# If you want to check the feature importance
feature_importances = rf_model.feature_importances_
print("\nFeature Importances:")
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance:.4f}")

Accuracy: 90.05%
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      7952
           1       0.64      0.41      0.50      1091

    accuracy                           0.90      9043
   macro avg       0.78      0.69      0.72      9043
weighted avg       0.89      0.90      0.89      9043


Feature Importances:
age: 0.1034
job: 0.0492
marital: 0.0226
education: 0.0279
default: 0.0020
balance: 0.1093
housing: 0.0260
loan: 0.0097
contact: 0.0201
day: 0.0897
month: 0.0877
duration: 0.2900
campaign: 0.0384
pdays: 0.0490
previous: 0.0228
poutcome: 0.0522


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Read the data
df = pd.read_csv('train_cleaned.csv')

# Define the categorical columns
categorical_columns = ['job', 'marital', 'education', 'default', 
                       'housing', 'loan', 'contact', 'month', 'poutcome', 'y']

# Apply LabelEncoder automatically to each categorical column
label_encoder = LabelEncoder()
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Define the feature columns (X) and target column (y)
X = df.drop('y', axis=1)  # Feature columns
y = df['y']  # Target column

# Split the data into training and testing sets (80% - 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a pipeline for preprocessing and the model
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), X.select_dtypes(include=['int64']).columns)
    ])

# Create a pipeline with preprocessing and Random Forest model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define parameters for RandomizedSearchCV
param_dist = {
    'classifier__n_estimators': [50, 100],  # Decrease the number of trees
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__bootstrap': [True, False]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=5, cv=3, random_state=42, n_jobs=-1)

# Train the model with RandomizedSearchCV
random_search.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_search.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print detailed classification metrics (precision, recall, f1-score)
print(classification_report(y_test, y_pred))

# Cross-validation (Reduce the number of folds to 3)
cv_scores = cross_val_score(random_search.best_estimator_, X, y, cv=3, scoring='accuracy', n_jobs=-1)
print(f'Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%')

# Print the best parameters from RandomizedSearchCV
print(f"Best Parameters from RandomizedSearchCV: {random_search.best_params_}")

# If you want to check the feature importances
feature_importances = random_search.best_estimator_.named_steps['classifier'].feature_importances_
print("\nFeature Importances:")
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance:.4f}")


Accuracy: 88.41%
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      7952
           1       0.58      0.14      0.23      1091

    accuracy                           0.88      9043
   macro avg       0.74      0.56      0.58      9043
weighted avg       0.86      0.88      0.85      9043

Cross-Validation Accuracy: 77.57%
Best Parameters from RandomizedSearchCV: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None, 'classifier__bootstrap': False}

Feature Importances:
age: 0.0324
job: 0.0278
marital: 0.0187
education: 0.1447
default: 0.0081
balance: 0.0219
housing: 0.0093
loan: 0.0001
contact: 0.0006
day: 0.0005
month: 0.0004
duration: 0.0013
campaign: 0.0007
pdays: 0.0010
previous: 0.0017
poutcome: 0.0014
