In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
nlp = spacy.load("en_core_web_lg")
df = pd.read_csv("../../data/jdCleanData/edut_df.csv")
df.head()

Unnamed: 0,dsm,text,previous_column_number,str_len,is_education,ed_count,max_ed_count,entity_encode,lag_text,lag_text2,lag_text3
0,11432511,Education:Minimum of a high school diploma or ...,2,59,True,1,1,11,first line,second line,third line
1,11432511,"Knowledge, Skills, and Abilities: Applicant mu...",3,1474,False,1,1,11,Education:Minimum of a high school diploma or ...,first line,second line
2,11432511,"Required Licensure, Certification, On-going Tr...",4,237,False,1,1,11,"Knowledge, Skills, and Abilities: Applicant mu...",Education:Minimum of a high school diploma or ...,first line
3,11432511,Work Experience: Minimum of one year’s experie...,5,78,False,1,1,11,"Required Licensure, Certification, On-going Tr...","Knowledge, Skills, and Abilities: Applicant mu...",Education:Minimum of a high school diploma or ...
4,11432511,"Machines, Tools, Equipment: Requires use of mu...",6,126,False,1,1,11,Work Experience: Minimum of one year’s experie...,"Required Licensure, Certification, On-going Tr...","Knowledge, Skills, and Abilities: Applicant mu..."


In [3]:
# Define a function to preprocess text

def preprocess_text(text):
    # Create a Doc object
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic characters
    a_lemmas = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in STOP_WORDS]
    
    return ' '.join(a_lemmas)

In [4]:
columns_to_preprocess = ['text', "lag_text",'lag_text2', 'lag_text3']

for column in columns_to_preprocess:
    df[column] = df[column].apply(preprocess_text)


In [70]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(), 'text'),
        ('lag_text', CountVectorizer(), 'lag_text'),
        ('lag_text2', CountVectorizer(), 'lag_text2'),
        ('lag_text3', CountVectorizer(), 'lag_text3'),
        ('num', StandardScaler(with_mean=False), ['str_len']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['entity_encode'])
    ]
)



X = df[["text", 'str_len', "lag_text", 'lag_text2', 'lag_text3',  'entity_encode']]
y = df['is_education']

# Fit and transform the data
X = preprocessor.fit_transform(X)

print(df.dtypes)


dsm                       object
text                      object
previous_column_number     int64
str_len                    int64
is_education                bool
ed_count                   int64
max_ed_count               int64
entity_encode             object
lag_text                  object
lag_text2                 object
lag_text3                 object
dtype: object


In [71]:
from joblib import dump
dump(preprocessor, 'path_to_save_preprocessor.joblib')


['path_to_save_preprocessor.joblib']

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
clf = RandomForestClassifier()

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Setup GridSearch with Cross-Validation then fit
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [63]:

#Get the best parameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)


print(best_params['bootstrap'])

Best hyperparameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
False


In [64]:
# Creating a new instance with the best hyperparameters
best_rf_clf = RandomForestClassifier(**best_params)


In [65]:
model_for_eval = best_rf_clf.fit(X_train, y_train)

In [66]:
# Evaluate on test data
# best_clf = grid_search.best_estimator_
# y_pred = best_clf.predict(X_test)
y_pred = model_for_eval.predict(X_test)


from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred), "\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9948024948024948 

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      1858
        True       0.94      0.91      0.92        66

    accuracy                           0.99      1924
   macro avg       0.97      0.95      0.96      1924
weighted avg       0.99      0.99      0.99      1924
 

Confusion Matrix:
[[1854    4]
 [   6   60]]


In [53]:
#After evaluation, fit to all the data
final_model =best_rf_clf.fit(X, y)

In [54]:
from joblib import dump

# Save the trained model
dump(final_model, '../../joblib/education_rfm.joblib')




['../../joblib/education_rfm.joblib']