In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Regression Models considered
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# Pipeline to ensure no mising of train and test data
from sklearn.pipeline import Pipeline

# # Classification Metrics
from sklearn.metrics import confusion_matrix,classification_report

# # saving trained model with pickle
import pickle

# selecting common working idrectory to allow use of pre_processing.py
import os
os.chdir('/Users/DE7/Desktop/LearningPython/XYZ-Corp')

# return current time to track runs for fitting models
from datetime import datetime
def current_time():
    now = datetime.now()
    return now.strftime("%H:%M:%S")

In [2]:
current_time()

'16:34:14'

In [3]:
Data = pd.read_csv('data/XYZCorp_LendingData.txt',sep='\t', low_memory = False)

In [4]:
def loan_intention_group(row):
    if row['purpose'] == 'debt_consolidation':
        return 'debt_consolidation'
    elif row['purpose'] =='credit_card':
        return 'debt_consolidation' 
    elif row['purpose'] =='home_improvement':
        return 'house' 
    elif row['purpose'] =='other':
        return 'other' 
    elif row['purpose'] =='major_purchase':
        return 'debt_consolidation' 
    elif row['purpose'] =='small_business':
        return 'small_business' 
    elif row['purpose'] =='car':
        return 'car' 
    elif row['purpose'] =='house':
        return 'house' 
    elif row['purpose'] =='moving':
        return 'moving' 
    elif row['purpose'] =='vacation':
        return 'vacation' 
    elif row['purpose'] =='educational':
        return 'educational' 
    elif row['purpose'] =='medical':
        return 'medical' 
    elif row['purpose'] =='wedding':
        return 'wedding'
    elif row['purpose'] =='renewable_energy':
        return 'renewable_energy' 
    else:
        return 'other'

Data['loan_intention_group'] = Data.apply(loan_intention_group, axis=1)

In [5]:

Data = Data[~Data['desc'].isna()]
Data = Data[Data['desc']!='']
Data = Data[Data['purpose']!='other']
Data = Data[['desc','purpose','loan_intention_group']]
Data.desc = Data.desc.str.replace('Borrower added on \d\d/\d\d/\d\d > ','',regex=True)
Data.desc = Data.desc.str.replace('<(.*?)>','',regex=True)


X = Data[['desc']]
y = Data[['loan_intention_group']]

Data.groupby(['loan_intention_group','purpose']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,desc
loan_intention_group,purpose,Unnamed: 2_level_1
car,car,1955
debt_consolidation,credit_card,25595
debt_consolidation,debt_consolidation,69638
debt_consolidation,major_purchase,3151
educational,educational,291
house,home_improvement,7276
house,house,833
medical,medical,1007
moving,moving,791
renewable_energy,renewable_energy,130


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify = y)

In [7]:
pipeline2 = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', stop_words= 'english', ngram_range =(1,1))),
        ('multiclass',OneVsRestClassifier(RandomForestClassifier(bootstrap = True, n_estimators = 75, n_jobs=-1)))])

In [8]:
pipeline2.fit(list(X_train.desc), list(y_train.loan_intention_group))

In [9]:
pipeline2.score(list(X_test.desc), list(y_test.loan_intention_group))

0.9164688169552377

In [10]:
y_pred = pipeline2.predict(list(y_test.loan_intention_group)) 

print(classification_report(y_test,y_pred, zero_division=0))

                    precision    recall  f1-score   support

               car       1.00      1.00      1.00       587
debt_consolidation       0.97      1.00      0.98     29516
       educational       0.00      0.00      0.00        87
             house       1.00      1.00      1.00      2433
           medical       1.00      1.00      1.00       302
            moving       1.00      1.00      1.00       237
  renewable_energy       0.00      0.00      0.00        39
    small_business       0.00      0.00      0.00       794
          vacation       1.00      1.00      1.00       172
           wedding       1.00      1.00      1.00       371

          accuracy                           0.97     34538
         macro avg       0.70      0.70      0.70     34538
      weighted avg       0.95      0.97      0.96     34538



In [11]:
# save the NLP model
filename = 'nlp_problem/saved_models/one_v_rest_random_forest.sav'
pickle.dump(pipeline2, open(filename, 'wb'))

In [12]:
X_test.to_csv('nlp_problem/nlp_test_data/X_test.csv', sep = '|')
y_test.to_csv('nlp_problem/nlp_test_data/y_test.csv', sep = '|')

In [13]:
current_time()

'16:37:29'