In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV

In [2]:
# Load the data
train_data_path = '../data/clean/train_data.csv'
test_data_path = '../data/clean/test_data.csv'
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

In [3]:
# Assuming 'reference' is the label column and all other columns are features
label_col = 'reference'

In [4]:
# Define features and labels
X_train = train_df.drop(columns=[label_col])
y_train = train_df[label_col]
X_test = test_df.drop(columns=[label_col])
y_test = test_df[label_col]

In [5]:
X_train, X_validate, y_train, y_validate = train_test_split(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]), test_size=0.2, random_state=42)

In [6]:
# Standardize the feature columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
param_grid = {
    'n_estimators': [5, 20,50],
    'max_depth': [1,2, None],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [15]:
# Initialize RandomForestClassifier
rf = RandomForestClassifier()

In [10]:
# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', n_jobs=-1, cv=3)
grid_search.fit(X_train, y_train)



In [11]:
# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Best parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}
Best cross-validation score: 0.1023


In [12]:
# Train the RandomForestClassifier with the best parameters
best_rf = grid_search.best_estimator_
best_rf.fit(X_train, y_train)

In [13]:
# Make predictions on the test set
y_pred = best_rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.9664


In [16]:
# Initialize variables to store the best parameters and best score
best_params = None
best_score = 0

In [19]:
# Iterate over the parameter grid manually
for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            for min_samples_leaf in param_grid['min_samples_leaf']:
                for bootstrap in param_grid['bootstrap']:
                    # Initialize RandomForestClassifier with the current set of parameters
                    rf = RandomForestClassifier(
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        bootstrap=bootstrap,
                        random_state=42
                    )
                    
                    # Train the model on the full training set
                    rf.fit(X_train, y_train)
                    
                    # Make predictions on the test set
                    y_pred = rf.predict(X_test)
                    
                    # Evaluate the model
                    accuracy = accuracy_score(y_test, y_pred)
                    
                    # Update the best parameters and best score if the current model is better
                    if accuracy > best_score:
                        best_score = accuracy
                        best_params = {
                            'n_estimators': n_estimators,
                            'max_depth': max_depth,
                            'min_samples_split': min_samples_split,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap
                        }


In [20]:
# Print the best parameters and the best score
print(f"Best parameters: {best_params}")
print(f"Best test set accuracy: {best_score:.4f}")

Best parameters: {'n_estimators': 5, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}
Best test set accuracy: 0.9664


In [21]:
# Train the RandomForestClassifier with the best parameters on the full training set
best_rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    bootstrap=best_params['bootstrap'],
    random_state=42
)
best_rf.fit(X_train, y_train)

In [22]:
# Make predictions on the test set
y_pred = best_rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Final test set accuracy: {accuracy:.4f}")

Final test set accuracy: 0.9664
