In [6]:
import numpy as np
import pandas as pd 
import spacy

# Load a SpaCy model
nlp = spacy.load('en_core_web_lg')

df = pd.read_csv("../../data/jdCleanData/isq_train.csv")
df['is_qualification'] = df['is_qualification'].replace([1, 2], 5)


df.head()

Unnamed: 0,dsm,text,previous_column_number,str_len,ksa_identifier,count,is_qualification,double,previous_line,line_behind2,entity_encode,lag_text,lag_text2,lag_text3
0,30000420,Knowledge:,2,10,other,1,0,False,none,none,30,first line,second line,third line
1,30000420,"Requires basic computer knowledge (ie. Word, E...",3,51,Knowledge,2,5,False,Knowledge:,none,30,Knowledge:,first line,second line
2,30000420,Requires basic math skills,4,26,Knowledge,3,5,False,"Requires basic computer knowledge (ie. Word, E...",Knowledge:,30,"Requires basic computer knowledge (ie. Word, E...",Knowledge:,first line
3,30000420,Literate in basic medical terminology,5,37,Knowledge,4,5,False,Requires basic math skills,"Requires basic computer knowledge (ie. Word, E...",30,Requires basic math skills,"Requires basic computer knowledge (ie. Word, E...",Knowledge:
4,30000420,Knowledge of Johns Hopkins Health System and/o...,6,96,Knowledge,5,5,False,Literate in basic medical terminology,Requires basic math skills,30,Literate in basic medical terminology,Requires basic math skills,"Requires basic computer knowledge (ie. Word, E..."


In [7]:
from spacy.lang.en.stop_words import STOP_WORDS



# Define a function to preprocess text
def preprocess_text(text):
    # Create a Doc object
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic characters
    a_lemmas = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in STOP_WORDS]
    
    return ' '.join(a_lemmas)



In [9]:
columns_to_preprocess = ['text', 'lag_text','lag_text2', 'lag_text3']

for column in columns_to_preprocess:
    df[column] = df[column].apply(preprocess_text)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(), 'text'),
        ('lag_text', CountVectorizer(), 'lag_text'),
        ('lag_text2', CountVectorizer(), 'lag_text2'),
        ('lag_text3', CountVectorizer(), 'lag_text3'),
        ('num', StandardScaler(with_mean=False), ['str_len']),
        ('seq', OneHotEncoder(), ['previous_column_number']),
        ('cat', OneHotEncoder(), ['entity_encode'])
    ]
)

X = df[["text", 'previous_column_number', 'str_len', "lag_text", 'lag_text2', 'lag_text3',  'entity_encode']]
y = df['is_qualification']

# Fit and transform the data
X = preprocessor.fit_transform(X)


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.28, random_state=10)


In [248]:
# from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier

# base_classifier = MultinomialNB() #Supervised model; Naive Bayes


# #Self training classifier
# self_training_model = SelfTrainingClassifier(base_classifier)
# self_training_model.fit(X_train, y_train)


# from sklearn.metrics import classification_report

# predictions = self_training_model.predict(X_test)
# print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        86
           0       0.88      0.95      0.91       592
           3       0.72      0.92      0.81        25
           4       0.66      1.00      0.79        25
           5       0.37      0.56      0.44        39

    accuracy                           0.82       767
   macro avg       0.52      0.69      0.59       767
weighted avg       0.74      0.82      0.78       767



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier

In [18]:
from sklearn.model_selection import GridSearchCV
# define your class weight dictionary
class_weight = {0: 1, 3: 5, 4:5, 5:2}

# define the parameter values that should be searched
param_grid = {'base_estimator__C': [0.1, 1, 10, 100, 1000], 
              'base_estimator__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'base_estimator__kernel': ['rbf'], 
              'base_estimator__class_weight': [class_weight]} 

base_classifier = SVC(probability=True)
self_training_model = SelfTrainingClassifier(base_classifier)

grid = GridSearchCV(self_training_model, param_grid, refit=True, verbose=3)

# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)



Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END base_estimator__C=0.1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=1, base_estimator__kernel=rbf;, score=0.818 total time=   8.9s
[CV 2/5] END base_estimator__C=0.1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=1, base_estimator__kernel=rbf;, score=0.815 total time=   8.7s
[CV 3/5] END base_estimator__C=0.1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=1, base_estimator__kernel=rbf;, score=0.815 total time=   8.8s
[CV 4/5] END base_estimator__C=0.1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=1, base_estimator__kernel=rbf;, score=0.815 total time=   8.6s
[CV 5/5] END base_estimator__C=0.1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=1, base_estimator__kernel=rbf;, score=0.815 total time=   8.7s
[CV 1/5] END base_estimator__C=0.1, base_estimator__class_

[CV 2/5] END base_estimator__C=1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.0001, base_estimator__kernel=rbf;, score=0.848 total time=   8.1s
[CV 3/5] END base_estimator__C=1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.0001, base_estimator__kernel=rbf;, score=0.851 total time=   4.0s
[CV 4/5] END base_estimator__C=1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.0001, base_estimator__kernel=rbf;, score=0.853 total time=   6.7s
[CV 5/5] END base_estimator__C=1, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.0001, base_estimator__kernel=rbf;, score=0.842 total time=   6.7s
[CV 1/5] END base_estimator__C=10, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=1, base_estimator__kernel=rbf;, score=0.829 total time=   8.7s
[CV 2/5] END base_estimator__C=10, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gam

[CV 4/5] END base_estimator__C=100, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.001, base_estimator__kernel=rbf;, score=0.886 total time=   3.7s
[CV 5/5] END base_estimator__C=100, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.001, base_estimator__kernel=rbf;, score=0.867 total time=   3.6s
[CV 1/5] END base_estimator__C=100, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.0001, base_estimator__kernel=rbf;, score=0.883 total time=   2.8s
[CV 2/5] END base_estimator__C=100, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.0001, base_estimator__kernel=rbf;, score=0.878 total time=   4.1s
[CV 3/5] END base_estimator__C=100, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_estimator__gamma=0.0001, base_estimator__kernel=rbf;, score=0.883 total time=   2.8s
[CV 4/5] END base_estimator__C=100, base_estimator__class_weight={0: 1, 3: 5, 4: 5, 5: 2}, base_e

In [19]:
 # Getting the best parameters for SVC
best_C = grid.best_params_['base_estimator__C']
best_gamma = grid.best_params_['base_estimator__gamma']
best_kernel = grid.best_params_['base_estimator__kernel']

print(best_C,best_gamma, best_kernel, class_weight)


10 0.001 rbf {0: 1, 3: 5, 4: 5, 5: 2}


In [20]:
# Get best parameters
best_params = grid.best_params_

# Define a new classifier with the best parameters
base_classifier = SVC(C=best_C, 
                      gamma= best_gamma, 
                      kernel=best_kernel, 
                      class_weight=class_weight,
                      probability=True)

# Self training classifier
self_training_model = SelfTrainingClassifier(base_classifier)

# Fit the model
self_training_model.fit(X_train, y_train)


In [22]:
# Make predictions
predictions = self_training_model.predict(X_test)

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Output classification report
print("Classification Report:")
print(classification_report(y_test, predictions))

# Output confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

# Output accuracy score
print("Accuracy Score:")
print(accuracy_score(y_test, predictions))

Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        82
           0       0.86      0.99      0.92       550
           3       0.85      1.00      0.92        23
           4       0.85      0.92      0.88        24
           5       0.79      0.62      0.70        37

    accuracy                           0.86       716
   macro avg       0.67      0.71      0.68       716
weighted avg       0.76      0.86      0.80       716

Confusion Matrix:
[[  0  74   2   2   4]
 [  0 546   1   1   2]
 [  0   0  23   0   0]
 [  0   2   0  22   0]
 [  0  12   1   1  23]]
Accuracy Score:
0.8575418994413407


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
#After evaluation, fit to all the data
final_model =self_training_model.fit(X, y)

In [30]:
from joblib import dump

# Save the trained model
dump(final_model, '../../joblib/qualifications_rfm.joblib')

['../../joblib/qualifications_rfm.joblib']