In [26]:
import json
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import model_selection, preprocessing, linear_model, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from scipy.stats import loguniform,uniform,randint
import warnings
warnings.filterwarnings("ignore")


ml_pipeline_config = open('../../config/ml_pipeline.json')
ml_pipeline_config = json.load(ml_pipeline_config)

In [27]:
data = pd.read_csv(ml_pipeline_config["processed_data_path"])
data = data[['product','consumer_complaint_narrative']]
data = data[data['consumer_complaint_narrative'].notnull()]

In [28]:
data["product"].value_counts()

product
Credit card or prepaid card                                                     2109
Debt collection                                                                 2109
Credit reporting, credit repair services, or other personal consumer reports    2109
Checking or savings account                                                     2109
Money transfer, virtual currency, or money service                              2109
Student loan                                                                    2109
Payday loan, title loan, personal loan, or advance loan                         2109
Vehicle loan or lease                                                           2109
Mortgage                                                                        2109
Name: count, dtype: int64

In [29]:
x = data['consumer_complaint_narrative']
y = data['product']
trainx,testx,trainy,testy = train_test_split(x,y,train_size=ml_pipeline_config["train_size"],random_state=30)

In [30]:
trainy.shape, trainx.shape

((15184,), (15184,))

In [31]:
testy.shape, testx.shape

((3797,), (3797,))

In [32]:
encoder = preprocessing.LabelEncoder()
trainy = encoder.fit_transform(trainy)
joblib.dump(encoder, '../../models/label_encoder.joblib')
testy = encoder.transform(testy)

tfidf_vect = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=5000)
tfidf_vect.fit(data['consumer_complaint_narrative'])
xtrain_tfidf = tfidf_vect.transform(trainx) 
xtest_tfidf = tfidf_vect.transform(testx)
joblib.dump(tfidf_vect, '../../models/vectorizer.joblib')

['../../models/vectorizer.joblib']

In [33]:
trainy

array([4, 2, 3, ..., 8, 2, 2])

In [34]:
testy

array([7, 2, 7, ..., 0, 3, 1])

In [35]:
encoder.inverse_transform([0])[0]

'Checking or savings account'

In [37]:
logistic_model = linear_model.LogisticRegression()
logistic_model.fit(xtrain_tfidf,trainy)
logistic_accuracy = metrics.accuracy_score(logistic_model.predict(xtest_tfidf),testy)
logistic_accuracy

0.7974716881748749

In [38]:
## with RandomSearchCV
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': loguniform(1e-4, 1e4),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 400, 500]
}

logistic_model = linear_model.LogisticRegression()
random_search = RandomizedSearchCV(logistic_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(xtrain_tfidf,trainy)
best_logistic_model = random_search.best_estimator_
logistic_accuracy = metrics.accuracy_score(best_logistic_model.predict(xtest_tfidf),testy)
logistic_accuracy

0.7898340795364762

In [39]:
naive_model = MultinomialNB()
naive_model.fit(xtrain_tfidf,trainy)
naive_accuracy = metrics.accuracy_score(naive_model.predict(xtest_tfidf),testy)
naive_accuracy

0.7582301817224124

In [40]:
param_dist = {
    'alpha': uniform(0.1, 2.0),  # Smoothing parameter (Laplace smoothing)
}
naive_model = MultinomialNB()
random_search = RandomizedSearchCV(naive_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(xtrain_tfidf,trainy)
best_naive_model = random_search.best_estimator_
naive_accuracy = metrics.accuracy_score(best_naive_model.predict(xtest_tfidf),testy)
naive_accuracy

0.7640242296549907

In [41]:
random_search.best_params_

{'alpha': 0.21616722433639893}

In [46]:
svm_model = SVC(kernel='linear', C=1.0, random_state=42,probability=True)
svm_model.fit(xtrain_tfidf,trainy)
svm_accuracy = metrics.accuracy_score(svm_model.predict(xtest_tfidf),testy)
svm_accuracy

0.790887542796945

In [91]:
param_dist = {
    'C': uniform(0.1, 2.0),  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Choice of kernel
    'gamma': ['scale', 'auto'] + list(uniform(0.1, 2.0).rvs(5)),  # For 'rbf', 'poly', 'sigmoid'
    'probability':[True]
}

svm_model = SVC()
random_search = RandomizedSearchCV(svm_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(xtrain_tfidf,trainy)
best_svm_model = random_search.best_estimator_
svm_accuracy = metrics.accuracy_score(best_svm_model.predict(xtest_tfidf),testy)
svm_accuracy

0.8698390482855144

In [43]:
xgboost_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgboost_model.fit(xtrain_tfidf,trainy)
xgboost_accuracy = metrics.accuracy_score(xgboost_model.predict(xtest_tfidf),testy)
xgboost_accuracy

0.781669739267843

In [27]:
param_dist = {
    'learning_rate': uniform(0.01, 0.2),
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
}

xgboost_model = XGBClassifier()
random_search = RandomizedSearchCV(xgboost_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(xtrain_tfidf,trainy)
best_xgboost_model = random_search.best_estimator_
xgboost_accuracy = metrics.accuracy_score(best_xgboost_model.predict(xtest_tfidf),testy)
xgboost_accuracy

0.8509447165850245

In [44]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(xtrain_tfidf,trainy)
rf_accuracy = metrics.accuracy_score(rf_model.predict(xtest_tfidf),testy)
rf_accuracy

0.7671846194363972

In [36]:
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}
print(param_dist['n_estimators'])
rf_model = RandomForestClassifier()
random_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(xtrain_tfidf,trainy)
best_rf_model = random_search.best_estimator_
rf_accuracy = metrics.accuracy_score(best_rf_model.predict(xtest_tfidf),testy)
rf_accuracy

<scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x0000026FC4E835D0>


0.6969909027291813

In [92]:
best_accuracy = {}
temp = [best_logistic_model,best_naive_model,best_svm_model,best_xgboost_model,best_rf_model]
best_accuracy["best_logistic_model"]=[logistic_accuracy,best_logistic_model]
best_accuracy["best_naive_model"]=[naive_accuracy,best_naive_model]
best_accuracy["best_svm_model"]=[svm_accuracy,best_svm_model]
best_accuracy["best_xgboost_model"]=[xgboost_accuracy,best_xgboost_model]
best_accuracy["best_rf_model"]=[rf_accuracy,best_rf_model]
best_accuracy

{'best_logistic_model': [0.8656403079076277,
  LogisticRegression(C=6.4405075539937195, max_iter=300, solver='saga')],
 'best_naive_model': [0.8376487053883834,
  MultinomialNB(alpha=0.21616722433639893)],
 'best_svm_model': [0.8698390482855144,
  SVC(C=1.5639878836228103, gamma=1.6827500337940122, kernel='linear',
      probability=True)],
 'best_xgboost_model': [0.8509447165850245,
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=0.9828160165372797, device=None,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric=None, feature_types=None, gamma=None,
                grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.17167946962329225,
                max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=3, max_leaves=None,
   

In [93]:
best_model_name = max(best_accuracy, key=lambda key: best_accuracy[key][0])
best_model_name

'best_svm_model'

In [94]:
best_model_accuracy = best_accuracy[best_model_name][0]
best_model = best_accuracy[best_model_name][1]

In [47]:
joblib.dump(svm_model, '../../models/best_model.joblib')

['../../models/best_model.joblib']

In [97]:
temp = [best_logistic_model,best_naive_model,best_svm_model,best_xgboost_model,best_rf_model]

In [100]:
best_logistic_model.predict_proba(xtest_tfidf[0])[0]

array([0.90923325, 0.00599117, 0.01552748, 0.00466943, 0.00142807,
       0.05075722, 0.00493174, 0.00122459, 0.00143169, 0.00111141,
       0.00369397])

In [105]:
text= "I received my credit report and noticed Elevate Recoveries , LLC placed an open collection on my report in the amount {$2200.00}.  First, I do not have a legal contract with this company. Secondly, Elevate Recoveries is in violation of consumer law code 15 USC 1681a Section 3. Per 15 USC 1681a Section 3 Restriction on sharing of medical information.Except for information or any communication of information disclosed as provided in section 1681b ( g ) ( 3 ) of this title, the exclusions in paragraph ( 2 ) shall not apply with respect to information disclosed to any person related by common ownership or affiliated by corporate control, if the information is ( A ) medical information ; ( B ) an individualized list or description based on the payment transactions of the consumer for medical products or services ; or ( C ) an aggregate list of identified consumers based on payment transactions for medical products or services.  This company has also violated H.R.2537 - Consumer Protection for Medical Debt Collecti"
text_vectorized = encoder.transform([text])
# Make predictions
prediction = best_model.predict(text_vectorized)[0]
score = max(best_model.predict_proba(text_vectorized)[0])
new = encoder.inverse_transform([prediction])[0]
score

ValueError: y contains previously unseen labels: 'I received my credit report and noticed Elevate Recoveries , LLC placed an open collection on my report in the amount {$2200.00}.  First, I do not have a legal contract with this company. Secondly, Elevate Recoveries is in violation of consumer law code 15 USC 1681a Section 3. Per 15 USC 1681a Section 3 Restriction on sharing of medical information.Except for information or any communication of information disclosed as provided in section 1681b ( g ) ( 3 ) of this title, the exclusions in paragraph ( 2 ) shall not apply with respect to information disclosed to any person related by common ownership or affiliated by corporate control, if the information is ( A ) medical information ; ( B ) an individualized list or description based on the payment transactions of the consumer for medical products or services ; or ( C ) an aggregate list of identified consumers based on payment transactions for medical products or services.  This company has also violated H.R.2537 - Consumer Protection for Medical Debt Collecti'

In [114]:
round(float(2.841446621579624e-05),3)

0.0

In [109]:
float("8.99284722486562e-02")

0.0899284722486562