In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from preprocess import preprocess_text

In [2]:
df = pd.read_csv("../../data/jdCleanData/pst2_df.csv")
# Load a SpaCy model
nlp = spacy.load('en_core_web_lg')

In [3]:
# Apply the function to the 'previous_line' column
columns_to_preprocess = ['text', 'lag_text','lag_text2', 'lead_text', 'lead_text2']

for column in columns_to_preprocess:
    df[column] = df[column].apply(lambda x:preprocess_text(x, nlp))

In [4]:
print(df.dtypes)

dsm                        int64
text                      object
previous_column_number     int64
str_len                    int64
is_ps                      int64
entity_encode              int64
lag_text                  object
lag_text2                 object
lead_text                 object
lead_text2                object
dtype: object


In [14]:
from sklearn.decomposition import TruncatedSVD  # <-- Import TruncatedSVDfrom sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier


# Define preprocessor
preprocessor_ps2 = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(), 'text'),
        ('lag_text', CountVectorizer(), 'lag_text'),
        ('lag_text2', CountVectorizer(), 'lag_text2'),
        ('lead_text', CountVectorizer(), 'lead_text'),
        ('lead_text2', CountVectorizer(), 'lead_text2'),
        ('num', StandardScaler(with_mean=False), ['str_len']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['entity_encode'])
    ]
)

# Create a complete pipeline which includes preprocessing followed by PCA
pipeline_ps3 = Pipeline([
    ('preprocess', preprocessor_ps2),
    ('reduce_dim', TruncatedSVD(n_components=500))  # Retain 95% of the variance
])

X = df[["text", 'previous_column_number', 'str_len',
        "lag_text", 'lag_text2',
        'lead_text','lead_text2',
        'entity_encode']]

y = df['is_ps']

# Fit and transform the data using the pipeline
X = pipeline_ps3.fit_transform(X)
dump(pipeline_ps3, 'preprocessors/ps3_pipeline_with_tsvd.joblib')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2)

# If you want to use a subset of the data for hyperparameter tuning:
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.2, random_state=42)  # Adjust 0.1 according to the desired subset size

# Defining base classifier
base_classifier = RandomForestClassifier()

# Wrap it with SelfTrainingClassifier
self_training_clf = SelfTrainingClassifier(base_classifier)

# Parameters for RandomizedSearch
param_distributions = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [5, 10],
    'base_estimator__min_samples_split': [2, 5, 10],
    'base_estimator__min_samples_leaf': [1, 2, 4],
    'base_estimator__bootstrap': [True, False]
}

# Setting up RandomizedSearch with Cross-Validation
random_search = RandomizedSearchCV(estimator=self_training_clf, param_distributions=param_distributions, 
                                   n_iter=50,  # Number of parameter settings that are sampled. Adjust this value.
                                   cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_rf =random_search.fit(X_sample, y_sample)




Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=100; total time= 1.0min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time=  50.1s
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time= 1.7min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time=  51.3s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=10, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=5, base_es

[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=5, base_estimator__n_estimators=150; total time= 1.8min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=1, base_estimator__min_samples_split=10, base_estimator__n_estimators=150; total time= 2.3min
[CV] END base_estimator__bootstrap=False, base_estimator__max_depth=5, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=5, base_estimator__n_estimators=100; total time= 1.3min
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=10, base_estimator__min_samples_leaf=4, base_estimator__min_samples_split=2, base_estimator__n_estimators=50; total time=  54.6s
[CV] END base_estimator__bootstrap=True, base_estimator__max_depth=5, base_estimator__min_samples_leaf=2, base_estimator__min_samples_split=2, base_estimator__n_estimators=150; total time= 1.2min
[CV] END base_es

In [15]:
best_parameters_dict1 = random_search_rf.best_params_
print(best_parameters_dict1)

{'base_estimator__n_estimators': 150, 'base_estimator__min_samples_split': 2, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_depth': 10, 'base_estimator__bootstrap': False}


{'base_estimator__n_estimators': 150, 'base_estimator__min_samples_split': 2, 'base_estimator__min_samples_leaf': 1, 'base_estimator__max_depth': 10, 'base_estimator__bootstrap': False}

In [21]:
# Training the model with the best hyperparameters
model_for_eval2 = SelfTrainingClassifier(
    RandomForestClassifier(
        n_estimators=150,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        bootstrap=False,
    )
)

model_for_eval2.fit(X_train, y_train)

In [22]:
# Predicting with the model
y_pred = model_for_eval2.predict(X_test)


from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred), "\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8034799370973469 

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      3253
           0       0.80      1.00      0.89     15106
           1       0.89      0.55      0.68      1354

    accuracy                           0.80     19713
   macro avg       0.56      0.52      0.52     19713
weighted avg       0.67      0.80      0.73     19713
 

Confusion Matrix:
[[    0  3168    85]
 [    0 15095    11]
 [    0   610   744]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.3, random_state=42)  # Adjust 0.1 according to the desired subset size


In [29]:
from sklearn.svm import SVC



# Defining base classifiers
svc_linear = SVC(kernel="linear", probability=True)  # SVC requires 'probability=True' for self-training classifier




# Wrap it with SelfTrainingClassifier
self_training_clf_linear = SelfTrainingClassifier(svc_linear)

param_distribution_linear = {
    'base_estimator__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'base_estimator__tol': [1e-4, 1e-3, 1e-2, 1e-1],
    'base_estimator__max_iter': [100, 500, 1000, -1]  # -1 means no limit.
}


random_search_linear = RandomizedSearchCV(estimator=self_training_clf_linear, param_distributions=param_distribution_linear, 
                                   n_iter=50,  # Number of parameter settings that are sampled. Adjust this value.
                                   cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_rf_linear =random_search_linear.fit(X_sample, y_sample)



Fitting 5 folds for each of 50 candidates, totalling 250 fits








[CV] END base_estimator__C=100, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.1min
[CV] END base_estimator__C=10, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 4.9min




[CV] END base_estimator__C=100, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.5min
[CV] END base_estimator__C=10, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 4.9min




[CV] END base_estimator__C=100, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.5min
[CV] END base_estimator__C=10, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 4.9min




[CV] END base_estimator__C=100, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.5min
[CV] END base_estimator__C=10, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 5.0min






[CV] END base_estimator__C=100, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.4min
[CV] END base_estimator__C=10, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 4.9min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time= 1.2min




[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 5.9min
[CV] END base_estimator__C=100, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 5.1min




[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time= 1.6min
[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 5.1min




[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time= 1.3min
[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 5.7min




[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time= 1.1min
[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 5.9min




[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 6.7min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time=  51.6s
[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 6.4min




[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 3.7min
[CV] END base_estimator__C=100, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 6.5min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.1min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time=  27.5s




[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 3.9min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 5.6min




[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 5.2min
[CV] END base_estimator__C=100, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 3.9min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.0min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 5.7min




[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 4.7min
[CV] END base_estimator__C=100, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 4.6min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.0min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 5.8min




[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.3min
[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.5min




[CV] END base_estimator__C=100, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 3.9min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.0min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time=  37.1s
[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 6.7min




[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.2min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 4.5min




[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.1min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 9.5min




[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.4min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 5.4min










































[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 5.2min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 5.1min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time= 8.4min
[CV] END base_estimator__C=100, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.0min
[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.2min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 6.5min
[CV] END base_estimator__C=100, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.0min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=500, base_estimator__tol=0.1; total time= 4.7min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time= 9.4min
[CV] END base_estimator__C=0.



[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.3min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time= 6.9min
[CV] END base_estimator__C=1, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.2min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.8min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 4.2min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 6.7min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time= 9.2min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time= 5.1min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 4.0min
[CV] END base_estimato



[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 4.3min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time= 8.2min
[CV] END base_estimator__C=1, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.2min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.7min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 4.2min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time=10.7min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 5.2min
[CV] END base_estimator__C=1, base_estimator__max_iter=500, base_estimator__tol=0.1; total time= 4.2min
[CV] END base_estimator__C=10, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.0min
[CV] END base_estimator__C=0.









[CV] END base_estimator__C=1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time= 1.5min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.0min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time=  45.0s




[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 5.8min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 5.2min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 5.0min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time= 8.4min
[CV] END base_estimator__C=100, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.0min
[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.4min
[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.0min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 8.0min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 6.6min
[CV] END base_estimator__C=0.001,



[CV] END base_estimator__C=1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time= 1.5min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.3min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 5.2min








[CV] END base_estimator__C=1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time= 1.5min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.3min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time=  36.9s
[CV] END base_estimator__C=0.1, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 4.6min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time= 5.0min






[CV] END base_estimator__C=0.1, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 4.1min
[CV] END base_estimator__C=100, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time= 5.0min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  46.7s
[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 6.5min






[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 5.2min
[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.5min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 5.2min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 6.5min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 5.5min
[CV] END base_estimator__C=1, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.2min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.7min
[CV] END base_estimator__C=10, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 4.1min
[CV] END base_estimator__C=100, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.1min
[CV] END base_estimator__C



[CV] END base_estimator__C=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 9.5min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 2.5min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 5.2min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 6.8min
[CV] END base_estimator__C=100, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.1min
[CV] END base_estimator__C=10, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.3min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 6.6min
[CV] END base_estimator__C=100, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.0min
[CV] END base_estimator__C=0.1, base_estimator__max_iter=500, base_estimator__tol=0.1; total time= 4.7min
[CV] END base_estimator__C



[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 4.5min
[CV] END base_estimator__C=100, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 5.0min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 6.3min




[CV] END base_estimator__C=0.001, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  37.8s
[CV] END base_estimator__C=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 6.8min
[CV] END base_estimator__C=0.001, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  26.1s
[CV] END base_estimator__C=0.001, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  37.7s
[CV] END base_estimator__C=0.01, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  42.6s
[CV] END base_estimator__C=0.01, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  17.7s
[CV] END base_estimator__C=0.001, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 2.5min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 5.9min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 5.5min
[CV] END base_estimator__C

[CV] END base_estimator__C=0.001, base_estimator__max_iter=500, base_estimator__tol=0.1; total time= 1.6min
[CV] END base_estimator__C=100, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time= 5.0min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  35.7s
[CV] END base_estimator__C=0.01, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  27.5s
[CV] END base_estimator__C=10, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 5.9min
[CV] END base_estimator__C=100, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 5.0min
[CV] END base_estimator__C=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.0001; total time= 5.8min


In [33]:
best_parameters_dict2 = random_search_rf_linear.best_params_
print(best_parameters_dict2)

{'base_estimator__tol': 0.1, 'base_estimator__max_iter': -1, 'base_estimator__C': 0.1}


In [40]:
from sklearn.svm import SVC

# Extract parameters from the dictionary
C = best_parameters_dict2['base_estimator__C']
tol = best_parameters_dict2['base_estimator__tol']
max_iter = best_parameters_dict2['base_estimator__max_iter']

# Create and fit the SVC model
svc = SVC(kernel='linear', C=C, tol=tol, max_iter=max_iter, probability=True)
svc_linear_self = SelfTrainingClassifier(svc)
svc_linear_self.fit(X_train, y_train)  # Assuming X_train and y_train are your training data

In [41]:
# Predict on test data
y_pred = svc_linear_self.predict(X_test)

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred), "\n")
print(classification_report(y_test, y_pred), "\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8224014609648456 

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      3253
           0       0.83      1.00      0.90     15106
           1       0.76      0.87      0.81      1354

    accuracy                           0.82     19713
   macro avg       0.53      0.62      0.57     19713
weighted avg       0.69      0.82      0.75     19713
 

Confusion Matrix:
[[    0  2948   305]
 [    0 15037    69]
 [    0   179  1175]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
ps_svc_linear_final =svc_linear_self.fit(X, y)
from joblib import dump

# Save the model using joblib
dump(ps_svc_linear_final, '../../joblib/ps_svc_linear_final.joblib')

['../../joblib/ps_svc_linear_final.joblib']

In [31]:
svc_rbf = SVC(kernel="rbf", probability=True)



# Wrap it with SelfTrainingClassifier
self_training_clf_rbf = SelfTrainingClassifier(svc_rbf)

param_distributions_rbf = {
    'base_estimator__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'base_estimator__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'base_estimator__tol': [1e-4, 1e-3, 1e-2, 1e-1],
    'base_estimator__max_iter': [100, 500, 1000, -1]  # -1 means no limit.
}



random_search_rbf = RandomizedSearchCV(estimator=self_training_clf_rbf, param_distributions=param_distributions_rbf, 
                                   n_iter=50,  # Number of parameter settings that are sampled. Adjust this value.
                                   cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search_rf_rbf =random_search_rbf.fit(X_sample, y_sample)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


















[CV] END base_estimator__C=0.01, base_estimator__gamma=auto, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 3.1min
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 8.3min
[CV] END base_estimator__C=0.001, base_estimator__gamma=1, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  18.8s




[CV] END base_estimator__C=0.01, base_estimator__gamma=auto, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 3.9min
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 9.7min




[CV] END base_estimator__C=0.01, base_estimator__gamma=auto, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 4.7min
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 9.6min




[CV] END base_estimator__C=0.01, base_estimator__gamma=auto, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 3.1min
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time=13.1min




[CV] END base_estimator__C=0.01, base_estimator__gamma=auto, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 3.1min
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 7.0min
[CV] END base_estimator__C=0.001, base_estimator__gamma=auto, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 9.6min
[CV] END base_estimator__C=10, base_estimator__gamma=1, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 2.6min




























[CV] END base_estimator__C=0.001, base_estimator__gamma=1, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  19.0s
[CV] END base_estimator__C=1, base_estimator__gamma=1, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time=103.8min




[CV] END base_estimator__C=0.1, base_estimator__gamma=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.9min
[CV] END base_estimator__C=100, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  55.2s
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.5min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.8min
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  58.5s
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  48.8s
[CV] END base_estimator__C=0.001, base_estimator__gamma=1, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  18.9s
[CV] END base_estimator__C=0.001, base_estimator__gamma=1,







[CV] END base_estimator__C=10, base_estimator__gamma=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time= 9.6min
[CV] END base_estimator__C=0.1, base_estimator__gamma=1, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 3.1min
[CV] END base_estimator__C=0.1, base_estimator__gamma=1, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 3.1min
[CV] END base_estimator__C=1, base_estimator__gamma=0.1, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 6.3min
[CV] END base_estimator__C=100, base_estimator__gamma=0.01, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 7.1min
[CV] END base_estimator__C=0.01, base_estimator__gamma=0.1, base_estimator__max_iter=500, base_estimator__tol=0.1; total time= 4.6min
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.1, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 6.6min
[CV] END base_estimator__C=0.01, base_estimator__gamma=0.01, 



[CV] END base_estimator__C=0.1, base_estimator__gamma=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.9min
[CV] END base_estimator__C=100, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.2min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.8min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.3min
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.1min
[CV] END base_estimator__C=0.001, base_estimator__gamma=auto, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 9.6min
[CV] END base_estimator__C=10, base_estimator__gamma=1, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 2.6min
[CV] END base_estimator__C=10, base_estimator__gamma=0.01





[CV] END base_estimator__C=0.1, base_estimator__gamma=0.001, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  39.2s
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.001, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  49.6s
[CV] END base_estimator__C=0.001, base_estimator__gamma=1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time=  19.0s
[CV] END base_estimator__C=0.001, base_estimator__gamma=1, base_estimator__max_iter=100, base_estimator__tol=0.001; total time=  18.8s
[CV] END base_estimator__C=0.001, base_estimator__gamma=scale, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.8min




[CV] END base_estimator__C=0.1, base_estimator__gamma=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.9min
[CV] END base_estimator__C=100, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.4min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.6min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.4min
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  58.4s
[CV] END base_estimator__C=0.001, base_estimator__gamma=auto, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time=11.5min
[CV] END base_estimator__C=10, base_estimator__gamma=0.01, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time=12.3min
[CV] END base_estimator__C=0.1, base_estimator__gamma=1,







[CV] END base_estimator__C=0.1, base_estimator__gamma=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.9min
[CV] END base_estimator__C=100, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.1min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.8min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.0001; total time= 1.7min
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.1min
[CV] END base_estimator__C=0.001, base_estimator__gamma=auto, base_estimator__max_iter=-1, base_estimator__tol=0.001; total time= 9.5min
[CV] END base_estimator__C=10, base_estimator__gamma=1, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 1.7min
[CV] END base_estimator__C=10, base_estimator__gamma=0.01



[CV] END base_estimator__C=100, base_estimator__gamma=0.001, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 5.3min
[CV] END base_estimator__C=10, base_estimator__gamma=0.01, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 7.5min
[CV] END base_estimator__C=100, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time=10.3min
[CV] END base_estimator__C=0.001, base_estimator__gamma=0.1, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time= 4.2min
[CV] END base_estimator__C=0.001, base_estimator__gamma=scale, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 7.2min
[CV] END base_estimator__C=100, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time= 1.1min
[CV] END base_estimator__C=0.01, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  28.6s
[CV] END base_estimator__C=0.01, base_estimator__ga





[CV] END base_estimator__C=0.01, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  29.2s
[CV] END base_estimator__C=1, base_estimator__gamma=1, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time=38.5min
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.1min
[CV] END base_estimator__C=0.01, base_estimator__gamma=0.1, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 5.3min
[CV] END base_estimator__C=0.01, base_estimator__gamma=scale, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time=11.5min




[CV] END base_estimator__C=0.1, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  35.5s
[CV] END base_estimator__C=0.1, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  26.8s
[CV] END base_estimator__C=0.1, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  44.2s
[CV] END base_estimator__C=0.1, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.0min
[CV] END base_estimator__C=0.1, base_estimator__gamma=auto, base_estimator__max_iter=100, base_estimator__tol=0.01; total time= 1.3min
[CV] END base_estimator__C=0.1, base_estimator__gamma=scale, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  50.0s




[CV] END base_estimator__C=0.001, base_estimator__gamma=scale, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.8min
[CV] END base_estimator__C=0.001, base_estimator__gamma=scale, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 6.3min
[CV] END base_estimator__C=100, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  59.1s
[CV] END base_estimator__C=100, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  58.9s
[CV] END base_estimator__C=1, base_estimator__gamma=1, base_estimator__max_iter=-1, base_estimator__tol=0.1; total time=27.8min
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  37.5s
[CV] END base_estimator__C=1, base_estimator__gamma=0.01, base_estimator__max_iter=100, base_estimator__tol=0.01; total time=  46.0s
[CV] END base_estimator__C=0.01, base_estimator__gamma=0.1, bas



[CV] END base_estimator__C=0.01, base_estimator__gamma=scale, base_estimator__max_iter=-1, base_estimator__tol=0.01; total time=19.7min
[CV] END base_estimator__C=100, base_estimator__gamma=1, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time= 5.0min




[CV] END base_estimator__C=1, base_estimator__gamma=1, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time=44.2min
[CV] END base_estimator__C=0.001, base_estimator__gamma=0.01, base_estimator__max_iter=500, base_estimator__tol=0.1; total time= 1.6min
[CV] END base_estimator__C=100, base_estimator__gamma=0.001, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time= 5.2min
[CV] END base_estimator__C=0.01, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 5.0min
[CV] END base_estimator__C=0.001, base_estimator__gamma=1, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 2.6min
[CV] END base_estimator__C=10, base_estimator__gamma=auto, base_estimator__max_iter=500, base_estimator__tol=0.001; total time= 4.5min
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.001, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 2.6min
[CV] END base_estimator__C=100, base_estimator__ga



[CV] END base_estimator__C=1, base_estimator__gamma=1, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time=102.0min
[CV] END base_estimator__C=100, base_estimator__gamma=0.001, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 5.3min
[CV] END base_estimator__C=10, base_estimator__gamma=0.01, base_estimator__max_iter=500, base_estimator__tol=0.0001; total time= 7.5min
[CV] END base_estimator__C=100, base_estimator__gamma=0.01, base_estimator__max_iter=1000, base_estimator__tol=0.1; total time=10.5min
[CV] END base_estimator__C=0.001, base_estimator__gamma=scale, base_estimator__max_iter=1000, base_estimator__tol=0.01; total time= 4.8min
[CV] END base_estimator__C=0.001, base_estimator__gamma=scale, base_estimator__max_iter=500, base_estimator__tol=0.01; total time= 8.0min
[CV] END base_estimator__C=0.01, base_estimator__gamma=0.1, base_estimator__max_iter=100, base_estimator__tol=0.1; total time=  48.8s
[CV] END base_estimator__C=1, base_estimator__ga

[CV] END base_estimator__C=1, base_estimator__gamma=1, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time=75.3min
[CV] END base_estimator__C=10, base_estimator__gamma=0.001, base_estimator__max_iter=1000, base_estimator__tol=0.001; total time= 6.5min
[CV] END base_estimator__C=0.1, base_estimator__gamma=0.1, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time=49.7min
[CV] END base_estimator__C=100, base_estimator__gamma=1, base_estimator__max_iter=-1, base_estimator__tol=0.0001; total time=125.7min


In [32]:
best_parameters_dict3 = random_search_rf_rbf.best_params_
print(best_parameters_dict3)

{'base_estimator__tol': 0.1, 'base_estimator__max_iter': -1, 'base_estimator__gamma': 0.01, 'base_estimator__C': 10}


In [48]:
from sklearn.svm import SVC

# Extract parameters from the dictionary
C = best_parameters_dict3['base_estimator__C']
tol = best_parameters_dict3['base_estimator__tol']
max_iter = best_parameters_dict3['base_estimator__max_iter']
gamma = best_parameters_dict3['base_estimator__gamma']

# Create and fit the SVC model with rbf kernel
svc_rbf = SVC(kernel='rbf', C=C, tol=tol, max_iter=max_iter, gamma=gamma, probability=True)
svc_rbf_self_learning = SelfTrainingClassifier(svc_rbf)
svc_rbf_self_learning.fit(X_train, y_train)  # Assuming X_train and y_train are your training data


In [49]:
# Predict on test data
y_pred2 = svc_rbf_self_learning.predict(X_test)

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred2), "\n")
print(classification_report(y_test, y_pred2), "\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred2))

Accuracy: 0.8242783949677878 

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      3253
           0       0.83      1.00      0.91     15106
           1       0.75      0.88      0.81      1354

    accuracy                           0.82     19713
   macro avg       0.53      0.63      0.57     19713
weighted avg       0.69      0.82      0.75     19713
 

Confusion Matrix:
[[    0  2905   348]
 [    0 15057    49]
 [    0   162  1192]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
svc_rbf_self_learning =svc_rbf_self_learning.fit(X, y)
from joblib import dump

# Save the model using joblib
dump(svc_rbf_self_learning, '../../joblib/ps_svc_rbf_final.joblib')

KeyboardInterrupt: 