Import libraries and download necessary packages + load dataset for training model

In [1]:
# Packages to work with datasets for training classification
import pandas as pd
import numpy as np

# Packages to pre-process data
import re
import string
import nltk  #Natural Language Tool Kit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Package to embed the datapoints
from gensim.models import KeyedVectors

# Loading packages and models for classification task
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, Embedding, GlobalMaxPooling1D
from keras.models import clone_model

# Load the data
df = pd.read_excel('privacy2_modified.xlsx')

Preprocessing dataset

In [2]:
# Download necessary NLTK datasets for pre-processing
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Loading necessary tools for pre-processing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Pre-proccessing
def preprocess_policy(policy):
    policy = policy.lower() # Lower case the datapoint
    policy = re.sub('[%s]' % re.escape(string.punctuation), '', policy) # Remove special characters
    policy = re.sub('\w*\d\w*', '', policy) # Remove unmeaning words such as 123, a1b
    tokens = word_tokenize(policy)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    # tokens = [stemmer.stem(token) for token in tokens] #No need Stemming due to policy's nature
    # Stemming in this case can lead to over stemming
    return tokens

# Apply "preprocess_policy()" for the training datasets
df['tokens'] = df['text'].apply(preprocess_policy)

[nltk_data] Downloading package punkt to /Users/macos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/macos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/macos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Feature Extraction

In [3]:
# Load Google News Word2Vec Model - Word embedding method
embedding_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Create Vector Representations for Policies
def get_average_word2vec(preprocessed_datapoint, w2v_model, generate_missing=False, k=300):
    if len(preprocessed_datapoint)<1:
        return np.zeros(k)
    
    # Assign vector value if token is not in model: depends on generate_missing, = 0 in this case
    if generate_missing:
        vectorized = [w2v_model[token] if token in w2v_model else np.random.rand(k) for token in preprocessed_datapoint]
    else:
        vectorized = [w2v_model[token] if token in w2v_model else np.zeros(k) for token in preprocessed_datapoint]
    
    # Calculate the average vector of the datapoint 
    # by dividing sum of values in same axis to the number of token in a datapoint
    length_datapoint = len(vectorized)
    summed_vector = np.sum(vectorized, axis=0)
    averaged_vector = np.divide(summed_vector, length_datapoint)

    return averaged_vector

def get_word2vec_embeddings(model, data, generate_missing=False):
    embeddings = data['tokens'].apply(lambda x: get_average_word2vec(x, model, generate_missing=generate_missing))
    return list(embeddings)


w2v_data = get_word2vec_embeddings(embedding_model, df)


Training model to classify - split train,test dataset

In [4]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(w2v_data, df['type'], test_size=0.2, random_state=38)


Random Forest

In [5]:
# Train Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate the Model
predictions = clf.predict(X_test)
train_predictions = clf.predict(X_train)
print("Train Accuracy: ", accuracy_score(y_train, train_predictions))
print("Accuracy: ", accuracy_score(y_test, predictions))

print("\nClassification Report:\n", classification_report(y_test, predictions))

Train Accuracy:  1.0
Accuracy:  0.9124087591240876

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.82      0.88       110
           1       0.89      0.98      0.93       164

    accuracy                           0.91       274
   macro avg       0.92      0.90      0.91       274
weighted avg       0.92      0.91      0.91       274



SVM


In [6]:
# SVM Classifier
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

svm_predictions = svm_clf.predict(X_test)
train_predictions = svm_clf.predict(X_train)
print("SVM Accuracy: ", accuracy_score(y_test, svm_predictions))
print("Train Accuracy: ", accuracy_score(y_train, train_predictions))
print("\nSVM Classification Report:\n", classification_report(y_test, svm_predictions))

SVM Accuracy:  0.9014598540145985
Train Accuracy:  0.9095063985374772

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.77      0.86       110
           1       0.87      0.99      0.92       164

    accuracy                           0.90       274
   macro avg       0.92      0.88      0.89       274
weighted avg       0.91      0.90      0.90       274



Logistic Regression


In [7]:
# Logistic Regression Classifier
logistic_clf = LogisticRegression() 
logistic_clf.fit(X_train, y_train)

logistic_predictions = logistic_clf.predict(X_test)
train_predictions = logistic_clf.predict(X_train)
print("Train Accuracy: ", accuracy_score(y_train, train_predictions))
print("Logistic Regression Accuracy: ", accuracy_score(y_test, logistic_predictions))
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, logistic_predictions))


Train Accuracy:  0.8857404021937842
Logistic Regression Accuracy:  0.9014598540145985

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.78      0.86       110
           1       0.87      0.98      0.92       164

    accuracy                           0.90       274
   macro avg       0.92      0.88      0.89       274
weighted avg       0.91      0.90      0.90       274



In [8]:
# Train Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

dt_predictions = dt.predict(X_test)
train_predictions = dt.predict(X_train)
print("Train Accuracy: ", accuracy_score(y_train, train_predictions))
print("Decision Tree Accuracy: ", accuracy_score(y_test, dt_predictions))
print("\nClassification Report:\n", classification_report(y_test, dt_predictions))


Train Accuracy:  1.0
Decision Tree Accuracy:  0.843065693430657

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.75      0.79       110
           1       0.85      0.90      0.87       164

    accuracy                           0.84       274
   macro avg       0.84      0.83      0.83       274
weighted avg       0.84      0.84      0.84       274



CNN


In [9]:
# Reshape data for CNN
X_train_cnn = np.array(X_train).reshape(len(X_train), 300, 1)
X_test_cnn = np.array(X_test).reshape(len(X_test), 300, 1)

# Build the CNN model
cnn_model = Sequential()
cnn_model.add(Conv1D(128, 5, activation='relu', input_shape=(300, 1)))
cnn_model.add(MaxPooling1D(5))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid')) # Assuming binary classification

cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_cnn, y_train, validation_data=(X_test_cnn, y_test), epochs=5, batch_size=32)

# Evaluate the CNN model
cnn_predictions = (cnn_model.predict(X_test_cnn) > 0.5).astype("int32").flatten()
train_predictions = (cnn_model.predict(X_train_cnn) > 0.5).astype("int32").flatten()
print("Train Accuracy: ", accuracy_score(y_train, train_predictions))
print("CNN Accuracy: ", accuracy_score(y_test, cnn_predictions))
print("\nCNN Classification Report:\n", classification_report(y_test, cnn_predictions))

Epoch 1/5


2023-10-25 00:25:38.225834: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train Accuracy:  0.8180987202925045
CNN Accuracy:  0.8065693430656934

CNN Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.79      0.77       110
           1       0.85      0.82      0.83       164

    accuracy                           0.81       274
   macro avg       0.80      0.80      0.80       274
weighted avg       0.81      0.81      0.81       274



Cross validation: 1st time

In [10]:
def train_and_evaluate_cnn(X_train, y_train, X_val, y_val):
    model = clone_model(cnn_model)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0)
    _, accuracy = model.evaluate(X_val, y_val, verbose=0)
    return accuracy

# Simulate k-fold cross-validation using 5 splits
cnn_accuracies = []
for _ in range(5):
    X_train_cnn, X_val_cnn, y_train_cnn, y_val_cnn = train_test_split(np.array(w2v_data).reshape(len(w2v_data), 300, 1), df['type'], test_size=0.2)
    acc = train_and_evaluate_cnn(X_train_cnn, y_train_cnn, X_val_cnn, y_val_cnn)
    cnn_accuracies.append(acc)

# For Decision Tree
dt_scores = cross_val_score(DecisionTreeClassifier(), w2v_data, df['type'], cv=5, scoring="accuracy")
print(f"Decision Tree Cross-Validation Accuracy: {dt_scores.mean():.2f} (+/- {dt_scores.std() * 2:.2f})\n")

# For Random Forest
rf_scores = cross_val_score(RandomForestClassifier(), w2v_data, df['type'], cv=5, scoring="accuracy")

print(f"Random Forest Cross-Validation Accuracy: {rf_scores.mean():.2f} (+/- {rf_scores.std() * 2:.2f})\n")

# For SVM
svm_scores = cross_val_score(SVC(), w2v_data, df['type'], cv=5, scoring="accuracy")

print(f"SVM Cross-Validation Accuracy: {svm_scores.mean():.2f} (+/- {svm_scores.std() * 2:.2f})\n")

# For Logistic Regression
logistic_scores = cross_val_score(LogisticRegression(), w2v_data, df['type'], cv=5, scoring="accuracy")
print(f"Logistic Regression Cross-Validation Accuracy: {logistic_scores.mean():.2f} (+/- {logistic_scores.std() * 2:.2f})\n")

# For CNN
print(f"CNN Cross-Validation Accuracy: {np.mean(cnn_accuracies):.2f} (+/- {np.std(cnn_accuracies) * 2:.2f})")



Decision Tree Cross-Validation Accuracy: 0.81 (+/- 0.01)

Random Forest Cross-Validation Accuracy: 0.92 (+/- 0.03)

SVM Cross-Validation Accuracy: 0.88 (+/- 0.03)

Logistic Regression Cross-Validation Accuracy: 0.87 (+/- 0.05)

CNN Cross-Validation Accuracy: 0.82 (+/- 0.03)


Fine-tuning - Random Forest

In [11]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

best_rf = grid_search.best_estimator_
rf_predictions = best_rf.predict(X_test)
print("Random Forest Accuracy after Grid Search: ", accuracy_score(y_test, rf_predictions))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest Accuracy after Grid Search:  0.8941605839416058


Fine-tuning  - SVM

In [12]:
# Define a grid of hyperparameters
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']  # You can also add 'linear', 'poly', 'sigmoid' to test other kernels
}

# Use grid search with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_svm = grid_search.best_estimator_

# Check the best hyperparameters
print(grid_search.best_params_)

# Predict and check accuracy
svm_predictions = best_svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print(f"SVM after Grid Search: {svm_accuracy}")

Fitting 5 folds for each of 64 candidates, totalling 320 fits
{'C': 100, 'gamma': 1, 'kernel': 'rbf'}
SVM after Grid Search: 0.8905109489051095


Fine-tuning - Logistic


In [14]:
param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['newton-cg','lbfgs', 'liblinear', 'sag', 'saga']
}
grid_search_logistic = GridSearchCV(LogisticRegression(max_iter=10000), param_grid_logistic, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search_logistic.fit(X_train, y_train)

# Get the best estimator
best_logistic = grid_search_logistic.best_estimator_

# Check the best hyperparameters
print(grid_search_logistic.best_params_)

# Predict and check accuracy
logistic_predictions = best_logistic.predict(X_test)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
print(f"Logistic Regression Accuracy after Grid Search: {logistic_accuracy}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Accuracy after Grid Search: 0.8905109489051095


Fine - tuning Decision Tree

In [15]:
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best Decision Tree Parameters: {best_params}")

best_dt = grid_search.best_estimator_
dt_predictions = best_dt.predict(X_test)
print("Decision Tree Accuracy after Grid Search: ", accuracy_score(y_test, dt_predictions))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Decision Tree Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Decision Tree Accuracy after Grid Search:  0.8357664233576643


In [16]:
# Train Decision Tree Classifier - {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=5,random_state=42)
dt.fit(X_train, y_train)

dt_predictions = dt.predict(X_test)
print("Decision Tree Accuracy: ", accuracy_score(y_test, dt_predictions))

# Random Forest - {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
clf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=1, min_samples_split=5, random_state=40)
clf.fit(X_train, y_train)

# Evaluate the Model
predictions = clf.predict(X_test)
print("Random Forest Accuracy: ", accuracy_score(y_test, predictions))

# Logistic Regression Classifier -  {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
logistic_clf = LogisticRegression(solver='newton-cg', max_iter=10000, C=10, penalty='l2') 
logistic_clf.fit(X_train, y_train)

logistic_predictions = logistic_clf.predict(X_test)
print("Logistic Regression Accuracy: ", accuracy_score(y_test, logistic_predictions))

# SVM Classifier - {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
svm_clf = SVC(kernel='rbf',gamma=1,  probability=True, C=10)
svm_clf.fit(X_train, y_train)

svm_predictions = svm_clf.predict(X_test)
print("SVM Accuracy: ", accuracy_score(y_test, svm_predictions))

# Reshape data for CNN
X_train_cnn = np.array(X_train).reshape(len(X_train), 300, 1)
X_test_cnn = np.array(X_test).reshape(len(X_test), 300, 1)

# Build the CNN model
cnn_model = Sequential()
cnn_model.add(Conv1D(128, 5, activation='relu', input_shape=(300, 1)))
cnn_model.add(MaxPooling1D(5))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid')) # Assuming binary classification

cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_cnn, y_train, validation_data=(X_test_cnn, y_test), epochs=5, batch_size=32)

# Evaluate the CNN model
cnn_predictions = (cnn_model.predict(X_test_cnn) > 0.5).astype("int32").flatten()
print("CNN Accuracy: ", accuracy_score(y_test, cnn_predictions))

Decision Tree Accuracy:  0.8175182481751825
Random Forest Accuracy:  0.916058394160584
Logistic Regression Accuracy:  0.8905109489051095
SVM Accuracy:  0.8941605839416058
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CNN Accuracy:  0.8686131386861314


Main Event - Classification

In [17]:
document_content = """Within the Smithsonian's Websites there are embedded applications, plug-ins, widgets or links to non-Smithsonian Websites (collectively "sites"). These sites operate independently of the Smithsonian and have their own privacy policies. When you visit these sites, you leave our Website and no longer will be subject to our privacy and security policies. The Smithsonian is not responsible for the privacy or security practices or the content of other sites, and such sites are not intended to be an endorsement of those sites or their content.

In addition to information collected automatically, we use cookies to support the internal functionality of the Websites. Cookies, which are small pieces of information sent to your browser by a website that you visit, are used to track usage patterns, traffic trends and customer behavior, as well as to record other information from the Website. When you register on the Websites, cookies also allow us to save information so that you will not have to re-enter it the next time you visit. Many content adjustments and customer service improvements are made based on the data derived from cookies. Some Smithsonian Websites use third-party vendors, such as Google Analytics, to place cookies and analyze the information collected by cookies in order to make the Smithsonian Websites more interesting and useful to you. No personally identifiable information is collected. Information that we collect from cookies will not be used to create profiles of individual users and only will be used in aggregate form. The data is retained for as long as necessary to support the mission of the Smithsonian Websites. You may set your browser to refuse cookies from any website that you visit. If you so choose, you may still gain access to most of the Smithsonian's Websites, but you may not be able to conduct certain types of transactions (such as shopping) or take advantage of some of the interactive elements offered. You may opt-out of Google Analytics using the Google Analytics Opt-out Browser Add-on.

The security of personally-identifiable information is important to us. We maintain administrative, technical and physical safeguards to protect against unauthorized use, disclosure, alteration or destruction of the personally-identifiable information we collect on this website. While we make every effort to help ensure the integrity and security of our network and systems, we cannot guarantee our security measures."""
problematic = []


def classify_policy(policy_text):
    # Preprocess the policy
    tokens = preprocess_policy(policy_text)
    embedding = get_average_word2vec(tokens, embedding_model)
    
    # # Reshape for CNN
    # cnn_input = np.array(embedding).reshape(1, 300, 1)
    # rf_prediction = clf.predict([embedding])[0]
    # # Predict using SVM
    svm_prediction = svm_clf.predict([embedding])[0]

    # # Predict using Logistic Regression
    # logistic_prediction = logistic_clf.predict([embedding])[0]

    # dt_predictions = dt.predict([embedding])[0]

    # Predict using CNN
    # cnn_prediction = (cnn_model.predict(cnn_input) > 0.5).astype("int32").flatten()[0]
    
    return svm_prediction
    
problematic = []

def split_into_paragraphs(document_content):
    # Normalize the line breaks
    normalized_content = document_content.replace('\r\n', '\n')
    # Split the document by double line breaks
    chunks = [p.strip() for p in normalized_content.split('\n\n') if p.strip()]
    
    paragraphs = []
    current_para = ""
    for chunk in chunks:
        # If the chunk starts with any list indicator, append it to the current paragraph
        if chunk.startswith(('•', '+', '-')):
            current_para += '\n' + chunk
        else:
            # If we have content in the current paragraph, store it and start a new one
            if current_para:
                paragraphs.append(current_para)
                current_para = ""
            current_para = chunk
    # Add any remaining content to the paragraphs list
    if current_para:
        paragraphs.append(current_para)
    
    return paragraphs

paragraphs = split_into_paragraphs(document_content)
for i, paragraph in enumerate(paragraphs, 1):
    print(paragraph)
    print("-----------")
    
for paragraph in paragraphs:
    predictions = classify_policy(paragraph)
    print(predictions)
    if predictions == 0:
        problematic.append(paragraph)

Within the Smithsonian's Websites there are embedded applications, plug-ins, widgets or links to non-Smithsonian Websites (collectively "sites"). These sites operate independently of the Smithsonian and have their own privacy policies. When you visit these sites, you leave our Website and no longer will be subject to our privacy and security policies. The Smithsonian is not responsible for the privacy or security practices or the content of other sites, and such sites are not intended to be an endorsement of those sites or their content.
-----------
In addition to information collected automatically, we use cookies to support the internal functionality of the Websites. Cookies, which are small pieces of information sent to your browser by a website that you visit, are used to track usage patterns, traffic trends and customer behavior, as well as to record other information from the Website. When you register on the Websites, cookies also allow us to save information so that you will no

Main Event - Highlight

In [18]:
from IPython.core.display import display, HTML

def highlight_problematic_sentences(term, problematic_phrases):
    if len(problematic_phrases) >= 1:
        for phrase in problematic_phrases:
            highlighted_phrase = f'<span style="background-color: #ff0000">{phrase}</span>'
            term = term.replace(phrase, highlighted_phrase)
        display(HTML(term))

highlight_problematic_sentences(document_content, problematic)

  from IPython.core.display import display, HTML


Main Event - Summarization

In [21]:
from transformers import pipeline
bart = pipeline("summarization", model="facebook/bart-large-cnn")
articles = problematic
summ = []

for article in articles:
  result = bart(article)
  summ.append(result)

summ

[]

In [22]:
from joblib import dump

# Assuming your model is stored in the variable 'dt'
dump(svm_clf, 'svm_model.pkl')


['svm_model.pkl']

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estim

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_esti

[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimator