In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from preprocess import preprocess_text

In [5]:
df = pd.read_csv("../../data/jdCleanData/pst2_df.csv")
# Load a SpaCy model
nlp = spacy.load('en_core_web_lg')


Unnamed: 0,dsm,text,previous_column_number,str_len,is_ps,entity_encode,lag_text,lag_text2,lead_text,lead_text2
0,10000400,Position Summary:,3,17,1,10,line one,line two,The VP/Chief Financial Officer for Johns Hopki...,"Development, oversight and review of the budge..."
1,10000400,The VP/Chief Financial Officer for Johns Hopki...,4,557,1,10,Position Summary:,line one,"Development, oversight and review of the budge...",Internal and external financial analysis and r...
2,10000400,"Development, oversight and review of the budge...",5,118,1,10,The VP/Chief Financial Officer for Johns Hopki...,Position Summary:,Internal and external financial analysis and r...,Production and maintenance of the monthly fina...
3,10000400,Internal and external financial analysis and r...,6,55,1,10,"Development, oversight and review of the budge...",The VP/Chief Financial Officer for Johns Hopki...,Production and maintenance of the monthly fina...,Support the financial operations of JHI’s fore...
4,10000400,Production and maintenance of the monthly fina...,7,94,1,10,Internal and external financial analysis and r...,"Development, oversight and review of the budge...",Support the financial operations of JHI’s fore...,New and existing project / program / product d...


In [11]:
# Apply the function to the 'previous_line' column
columns_to_preprocess = ['text', 'lag_text','lag_text2', 'lead_text', 'lead_text2']

for column in columns_to_preprocess:
    df[column] = df[column].apply(lambda x:preprocess_text(x, nlp))

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Define preprocessor
preprocessor_ps2 = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(), 'text'),
        ('lag_text', CountVectorizer(), 'lag_text'),
        ('lag_text2', CountVectorizer(), 'lag_text2'),
        ('lead_text', CountVectorizer(), 'lead_text'),
        ('lead_text2', CountVectorizer(), 'lead_text2'),
        ('num', StandardScaler(with_mean=False), ['str_len']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['entity_encode'])
    ]
)

X = df[["text", 'previous_column_number', 'str_len',
        "lag_text", 'lag_text2',
        'lead_text','lead_text2',
        'entity_encode']]

y = df['is_ps']

# Fit and transform the data
X = preprocessor_ps2.fit_transform(X)
from joblib import dump
dump(preprocessor_ps2, 'preprocessors/ps_processor.joblib')
print(df.dtypes)

dsm                        int64
text                      object
previous_column_number     int64
str_len                    int64
is_ps                      int64
entity_encode              int64
lag_text                  object
lag_text2                 object
lead_text                 object
lead_text2                object
dtype: object


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import GridSearchCV

# Defining base classifier
base_classifier = RandomForestClassifier()

# Wrap it with SelfTrainingClassifier
self_training_clf = SelfTrainingClassifier(base_classifier)

# Parameters for GridSearch. Notice the prefix to each parameter which corresponds to the base_classifier.
param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 5, 10, 12],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4],
    'base_estimator__bootstrap': [True, False]
}

# Setting up GridSearch with Cross-Validation
grid_search = GridSearchCV(estimator=self_training_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time= 8.5min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time= 4.1min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time= 8.3min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time= 4.1min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_spli



[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time= 4.3min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=12.7min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=150; total time=12.5min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time=12.3min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=11.1min
[CV] 

[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time=   4.8s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time=   9.7s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time=  18.9s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time=   5.0s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time=   9.9s
[CV] END base_estim

[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time= 8.6min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time= 4.1min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time= 8.2min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time= 4.2min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time= 8.3min
[CV] 

[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time= 8.4min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time= 4.3min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time= 8.3min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time= 4.1min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time= 8.3min
[CV] 

[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time= 4.2min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=12.8min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=150; total time=12.4min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time=11.2min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=11.1min
[CV] 

[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time= 8.5min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time= 4.2min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time= 8.4min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time= 4.1min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time= 8.3min
[CV] 

[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=10, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time=  54.8s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=10, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time=  28.5s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=10, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time=  55.5s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=10, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time=  55.6s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=10, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time=  27.0s
[CV] END base_e

[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time= 5.7min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time=11.2min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time= 5.7min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time=11.3min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time=   7.1s
[CV]

[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time= 7.2min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time=14.5min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time= 7.1min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time=14.4min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time= 6.4min
[

[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=150; total time=21.6min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time=21.4min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=19.3min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=5, base_estimator__n_estimators=150; total time=19.2min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time=19.1mi

[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time=21.2min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=19.6min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=5, base_estimator__n_estimators=150; total time=19.5min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time=19.2min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=16.9mi

[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time=  10.7s
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time=  13.9s
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time=  22.0s
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time=   9.2s
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=5, base_estimator__n_estimators=50; total time=  11.1s
[CV] END base_esti

[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time=14.6min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=50; total time= 7.2min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=100; total time=14.3min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time= 6.4min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=None, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time=12.9min


In [27]:
# Extracting the best parameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

Best hyperparameters: {'base_estimator__bootstrap': False, 'base_estimator__max_depth': None, 'base_estimator__min_samples_leaf': 1, 'base_estimator__min_samples_split': 2, 'base_estimator__n_estimators': 150}


In [33]:
# Training the model with the best hyperparameters
model_for_eval = SelfTrainingClassifier(
    RandomForestClassifier(
        n_estimators=150,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        bootstrap=False
    )
)

model_for_eval.fit(X_train, y_train)


In [37]:
# Predicting with the model
y_pred = model_for_eval.predict(X_test)


from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred), "\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8259025724010937 

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      2902
           0       0.82      1.00      0.90     13807
           1       0.84      0.83      0.84      1212

    accuracy                           0.83     17921
   macro avg       0.56      0.61      0.58     17921
weighted avg       0.69      0.83      0.75     17921
 

Confusion Matrix:
[[    0  2727   175]
 [    0 13797    10]
 [    0   208  1004]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
#Use all data to train model

# Training the model with the best hyperparameters


best_model_ps2 = model_for_eval.fit(X, y)

In [39]:
from joblib import dump

# Save the model using joblib
dump(best_model_ps2, '../../joblib/best_model_ps2.joblib')

['../../joblib/best_model_ps2.joblib']