In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from preprocess import preprocess_text

In [2]:
df= pd.read_csv("../../data/jdCleanData/pst_df.csv")
# Load a SpaCy model
nlp = spacy.load('en_core_web_lg')

Unnamed: 0,dsm,text,previous_column_number,str_len,is_ps,entity_encode,lag_text,lag_text2,lag_text3,lag_text4
0,11432511,SUMMARY,2,7,-1,11,first line,second line,third line,fourth line
1,11432511,This is a Bilingual Receptionist position with...,3,415,1,11,SUMMARY,first line,second line,third line
2,11432511,ESSENTIAL FUNCTIONS,4,19,0,11,This is a Bilingual Receptionist position with...,SUMMARY,first line,second line
3,11432511,Reasonable accommodations may be made to enabl...,5,103,0,11,ESSENTIAL FUNCTIONS,This is a Bilingual Receptionist position with...,SUMMARY,first line
4,11432511,Responsible for communication with non-English...,6,159,0,11,Reasonable accommodations may be made to enabl...,ESSENTIAL FUNCTIONS,This is a Bilingual Receptionist position with...,SUMMARY


In [6]:
# Apply the function to the 'previous_line' column
columns_to_preprocess = ['text', 'lag_text', 'lag_text2', 'lag_text3']

for column in columns_to_preprocess:
    df[column] = df[column].apply(lambda x:preprocess_text(x, nlp))


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(), 'text'),
        ('lag_text', CountVectorizer(), 'lag_text'),
        ('lag_text2', CountVectorizer(), 'lag_text2'),
        ('lag_text3', CountVectorizer(), 'lag_text3'),
        ('num', StandardScaler(with_mean=False), ['str_len']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['entity_encode'])
    ]
)

X = df[["text", 'previous_column_number', 'str_len', "lag_text", 'lag_text2', 'lag_text3', 'entity_encode']]
y = df['is_ps']

# Fit and transform the data
X = preprocessor.fit_transform(X)

In [9]:
from joblib import dump
dump(preprocessor, 'preprocessors/ps_processor.joblib')
print(df.dtypes)

dsm                       object
text                      object
previous_column_number     int64
str_len                    int64
is_ps                      int64
entity_encode             object
lag_text                  object
lag_text2                 object
lag_text3                 object
lag_text4                 object
dtype: object


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

In [68]:
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier
class_weight = {0: 1, 1: 4}

In [69]:
from sklearn.model_selection import GridSearchCV


# define the parameter values that should be searched
param_grid = {'base_estimator__C': [0.1, 1, 10, 100, 1000], 
              'base_estimator__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'base_estimator__kernel': ['rbf'], 
              'base_estimator__class_weight': [class_weight]} 

base_classifier = SVC(probability=True)
self_training_model = SelfTrainingClassifier(base_classifier)

grid = GridSearchCV(self_training_model, param_grid, refit=True, verbose=3)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


KeyboardInterrupt: 

In [12]:
#Also runed
best_C = grid.best_params_['base_estimator__C']
best_gamma = grid.best_params_['base_estimator__gamma']
best_kernel = grid.best_params_['base_estimator__kernel']
print(best_C,best_gamma, best_kernel, class_weight)

NameError: name 'grid' is not defined

In [13]:
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier

# Set the best hyperparameters and set instance for base classifier
best_params = {'base_estimator__C': 100,
               'base_estimator__gamma': 0.001,
               'base_estimator__kernel': 'rbf',
               'base_estimator__class_weight': {0: 1, 1: 4}}

base_classifier = SVC(probability=True,
                      C=best_params['base_estimator__C'],
                      gamma=best_params['base_estimator__gamma'],
                      kernel=best_params['base_estimator__kernel'],
                      class_weight=best_params['base_estimator__class_weight'])

# Create a new instance of your self_training_model using your base_classifier
self_training_model_hp = SelfTrainingClassifier(base_classifier)


# Fit the model using your all labeled data to increase robustness of the model now that it has been tested
self_training_model_hp.fit(X_train, y_train)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


predictions = self_training_model_hp.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, predictions))

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Accuracy Score:")
print(accuracy_score(y_test, predictions))

Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       462
           0       0.83      0.99      0.91      2275
           1       0.78      0.85      0.81       140

    accuracy                           0.83      2877
   macro avg       0.54      0.61      0.57      2877
weighted avg       0.69      0.83      0.76      2877

Confusion Matrix:
[[   0  440   22]
 [   0 2263   12]
 [   0   21  119]]
Accuracy Score:
0.8279457768508863


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
#After testing, use all data to generate a model


best_params = {'base_estimator__C': 100,
               'base_estimator__gamma': 0.001,
               'base_estimator__kernel': 'rbf',
               'base_estimator__class_weight': {0: 1, 1: 4}}

base_classifier = SVC(probability=True,
                      C=best_params['base_estimator__C'],
                      gamma=best_params['base_estimator__gamma'],
                      kernel=best_params['base_estimator__kernel'],
                      class_weight=best_params['base_estimator__class_weight'])



# Create a new instance of your self_training_model using your base_classifier
self_training_model_robust = SelfTrainingClassifier(base_classifier)


# Fit the model using your all labeled data to increase robustness of the model now that it has been tested
self_training_model_robust.fit(X, y)


In [16]:
from joblib import dump, load

# Save the model using joblib
dump(self_training_model_robust, '../../joblib/self_training_model_robust_ps.joblib')




['../../joblib/self_training_model_robust_ps.joblib']

In [90]:
import joblib
# 2. Prepare your new data
# This should be in the same format (e.g., same features, same preprocessing steps, etc.) as the training data
new_data = [...]  # Replace with your new data

# 3. Use the loaded model to make predictions
predictions = loaded_model_from_joblib.predict(X)

print(predictions)

prob_df = pd.DataFrame(probabilities, columns=self_training_model_hp.classes_)

prob_df

df_with_probs = pd.concat([df, prob_df], axis=1)

df_with_probs.head(n=20)


[0 1 0 ... 0 0 0]


Unnamed: 0,dsm,text,previous_column_number,str_len,is_ps,entity_encode,lag_text,lag_text2,lag_text3,lag_text4,0,1
0,11432511,SUMMARY,2,7,-1,11,first line,second line,third line,fourth line,0.997412,0.002588
1,11432511,This is a Bilingual Receptionist position with...,3,415,1,11,SUMMARY,first line,second line,third line,2e-06,0.999998
2,11432511,ESSENTIAL FUNCTIONS,4,19,0,11,This is a Bilingual Receptionist position with...,SUMMARY,first line,second line,0.988159,0.011841
3,11432511,Reasonable accommodations may be made to enabl...,5,103,0,11,ESSENTIAL FUNCTIONS,This is a Bilingual Receptionist position with...,SUMMARY,first line,0.996246,0.003754
4,11432511,Responsible for communication with non-English...,6,159,0,11,Reasonable accommodations may be made to enabl...,ESSENTIAL FUNCTIONS,This is a Bilingual Receptionist position with...,SUMMARY,0.988165,0.011835
5,11432511,Follows current protocol in place for opening ...,7,68,0,11,Responsible for communication with non-English...,Reasonable accommodations may be made to enabl...,ESSENTIAL FUNCTIONS,This is a Bilingual Receptionist position with...,0.997119,0.002881
6,11432511,"Greets patients in polite, prompt and helpful ...",8,53,0,11,Follows current protocol in place for opening ...,Responsible for communication with non-English...,Reasonable accommodations may be made to enabl...,ESSENTIAL FUNCTIONS,0.99823,0.00177
7,11432511,Provides necessary instructions/directions as ...,9,65,0,11,"Greets patients in polite, prompt and helpful ...",Follows current protocol in place for opening ...,Responsible for communication with non-English...,Reasonable accommodations may be made to enabl...,0.998656,0.001344
8,11432511,Informs appropriate department of patient’s ar...,10,52,0,11,Provides necessary instructions/directions as ...,"Greets patients in polite, prompt and helpful ...",Follows current protocol in place for opening ...,Responsible for communication with non-English...,0.996264,0.003736
9,11432511,Completes necessary paperwork such as encounte...,11,111,0,11,Informs appropriate department of patient’s ar...,Provides necessary instructions/directions as ...,"Greets patients in polite, prompt and helpful ...",Follows current protocol in place for opening ...,0.996896,0.003104


In [44]:
#This code exports 

# Get the class probabilities
probabilities = self_training_model_hp.predict_proba(X)

# Convert the probabilities to a dataframe
prob_df = pd.DataFrame(probabilities, columns=self_training_model_hp.classes_)



df_with_probs = pd.concat([df, prob_df], axis=1)

df_with_probs.head(n=20)

Unnamed: 0,dsm,text,previous_column_number,str_len,is_ps,entity_encode,lag_text,lag_text2,lag_text3,lag_text4,0,1
0,11432511,summary,2,7,-1,11,first line,second line,line,fourth line,0.997412,0.002588
1,11432511,bilingual Receptionist position add responsibi...,3,415,1,11,SUMMARY,line,second line,third line,2e-06,0.999998
2,11432511,ESSENTIAL FUNCTIONS,4,19,0,11,This is a Bilingual Receptionist position with...,summary,line,second line,0.988159,0.011841
3,11432511,reasonable accommodation enable individual dis...,5,103,0,11,ESSENTIAL FUNCTIONS,bilingual Receptionist position add responsibi...,summary,first line,0.996246,0.003754
4,11432511,responsible communication non english speak pa...,6,159,0,11,Reasonable accommodations may be made to enabl...,ESSENTIAL FUNCTIONS,bilingual Receptionist position add responsibi...,SUMMARY,0.988165,0.011835
5,11432511,follow current protocol place open close office,7,68,0,11,Responsible for communication with non-English...,reasonable accommodation enable individual dis...,ESSENTIAL FUNCTIONS,This is a Bilingual Receptionist position with...,0.997119,0.002881
6,11432511,greets patient polite prompt helpful manner,8,53,0,11,Follows current protocol in place for opening ...,responsible communication non english speak pa...,reasonable accommodation enable individual dis...,ESSENTIAL FUNCTIONS,0.99823,0.00177
7,11432511,provide necessary instruction direction need p...,9,65,0,11,"Greets patients in polite, prompt and helpful ...",follow current protocol place open close office,responsible communication non english speak pa...,Reasonable accommodations may be made to enabl...,0.998656,0.001344
8,11432511,inform appropriate department patient arrival,10,52,0,11,Provides necessary instructions/directions as ...,greets patient polite prompt helpful manner,follow current protocol place open close office,Responsible for communication with non-English...,0.996264,0.003736
9,11432511,complete necessary paperwork encounter form re...,11,111,0,11,Informs appropriate department of patient’s ar...,provide necessary instruction direction need p...,greets patient polite prompt helpful manner,Follows current protocol in place for opening ...,0.996896,0.003104
