In [2]:
import numpy as np
import pandas as pd 
import spacy
from preprocess import preprocess_text
from spacy.lang.en.stop_words import STOP_WORDS

# Load a SpaCy model
nlp = spacy.load('en_core_web_lg')

df = pd.read_csv("../../data/jdCleanData/isq_train_final.csv")
df['is_qualification'] = df['is_qualification'].replace([1, 2], 5)
df.head()

Unnamed: 0,dsm,text,previous_column_number,str_len,ksa_identifier,count,is_qualification,double,entity_encode
0,10000900,Knowledge: Maintains familiarity with various ...,3,203,other,1,5,False,10
1,10000900,Skills: Medical terminology and basic computer...,4,460,Skills,2,5,False,10
2,10000900,"Required Licensure Certification, etc.None.",5,43,Licensure,3,3,False,10
3,10000900,Work Experience: Requires a minimum of six mon...,6,161,Experience,4,4,False,10
4,10000900,"Machines, Tools, Equipment: Must be able to op...",7,101,mte,5,0,False,10


In [3]:
df["text"] = df["text"].apply(lambda x:preprocess_text(x, nlp))
df["text2"] = df["text"].apply(lambda x:preprocess_text(x, nlp))

In [4]:
df.dtypes

dsm                        int64
text                      object
previous_column_number     int64
str_len                    int64
ksa_identifier            object
count                      int64
is_qualification           int64
double                      bool
entity_encode              int64
text2                     object
dtype: object

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from joblib import dump

count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Pipeline for CountVectorizer
count_vect_pipeline = Pipeline([
    ('countvectorizer', count_vectorizer)
])

# Pipeline for TFIDF
tfidf_pipeline = Pipeline([
    ('countvectorizer', count_vectorizer),
    ('tfidf', tfidf_transformer)
])

preprocessor = ColumnTransformer(
    transformers=[
        ('countvectorizer', count_vect_pipeline, 'text'),
        ('tfidf', tfidf_pipeline, 'text2'),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['entity_encode'])
    ],
    remainder='passthrough'  # The 'str_len' column will be passed through without transformation
)

X = df[["text", "text2", 'str_len', 'entity_encode']]
y = df['is_qualification']

# Fit the preprocessor on your training data and then transform it
X_processed = preprocessor.fit_transform(X)

# Now, dump the fitted preprocessor
dump(preprocessor, 'preprocessors/quali_processor2.joblib')



['preprocessors/quali_processor2.joblib']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=10)

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.semi_supervised import SelfTrainingClassifier

# Given class weights
class_weight = {0: 1, 3: 5, 4:5, 5:2}

base_classifier = RandomForestClassifier(class_weight=class_weight, random_state=42)

self_training_model = SelfTrainingClassifier(base_classifier, criterion='k_best', k_best=10, max_iter=10)

# Hyperparameter tuning using grid search with cross-validation


param_grid = {
    'base_estimator__n_estimators': [50, 100, 200],
    'base_estimator__max_depth': [200, 300, None],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4]
}


grid_search = GridSearchCV(self_training_model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)


In [35]:
best_params = grid_search.best_params_
print("Best hyperparameters:")
for param_name, param_value in best_params.items():
    print(f"{param_name}: {param_value}")


Best hyperparameters:
base_estimator__max_depth: 300
base_estimator__min_samples_leaf: 1
base_estimator__min_samples_split: 2
base_estimator__n_estimators: 50


In [10]:
# Training the model with the best hyperparameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.semi_supervised import SelfTrainingClassifier
eval_base_classifier =RandomForestClassifier(
        n_estimators=50,
        max_depth=300,
        min_samples_split=2,
        min_samples_leaf=1
    )
model_for_eval = SelfTrainingClassifier(eval_base_classifier, criterion='k_best', k_best=10, max_iter=10
    
)

# model_for_eval.fit(X_train, y_train)

In [37]:
# Predicting with the model

y_pred = model_for_eval.predict(X_test)


from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred), "\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.689659039928219 

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      2552
           0       0.72      0.99      0.83      5129
           3       0.94      0.97      0.95       129
           4       0.54      0.97      0.70        91
           5       0.55      0.83      0.66      1015

    accuracy                           0.69      8916
   macro avg       0.55      0.75      0.63      8916
weighted avg       0.49      0.69      0.58      8916
 

Confusion Matrix:
[[   0 1847    2   54  649]
 [   0 5097    3    0   29]
 [   0    2  125    0    2]
 [   0    1    0   88    2]
 [   0  153    3   20  839]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
#Use all data to train model
best_model_quali = model_for_eval.fit(X_processed, y)

In [14]:
from joblib import dump

# Save the model using joblib
dump(best_model_quali, '../../joblib/best_model_quali.joblib')

['../../joblib/best_model_quali.joblib']