In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

pd.set_option('display.max_colwidth', 200)

# Read Data and Clean it

In [2]:
train_path = 'data/converted_data_train.csv'
test_path = 'data/converted_data_test.csv'

train_df = pd.read_csv(train_path)[['converted_text', 'Complaint-Status']]
test_df = pd.read_csv(test_path)[['converted_text']]
train_df.head()

Unnamed: 0,converted_text,Complaint-Status
0,"Seterus, Inc. filed a false report with the major XXXX credit bureaus, stating that my XXXX 2015 mortgage was late when they were receiving full repayment of the loan on XXXX XXXX, 2015. It should...",Closed with explanation
1,XX / XX / XXXX Bankruptcy Claim XXXX of Chapter XXXX was filed in Hawaii and included the creditor XXXX XXXX for an unknown amount (Assumed $ 540.00 XXXX XX / XX / XXXX Chapter XXXX Bankruptcy Pai...,Closed with non-monetary relief
2,"XXXX / XXXX / 15, I was preparing the flight back to XXXX (see XXXX) for the funeral of a close relative and I stayed in a hotel at the XXXX airport. Before leaving, I hired a dog sitter and her f...",Closed with explanation
3,"The loan was paid in XXXX XXXX. In XXXX, 4 years after I moved from VA (I moved in XXXX XX / XX / XXXX-6 months after paying the debt), a representative XXXX XXXX XXXX contacted me suddenly, indic...",Closed with explanation
4,"I got a care credit account for XXXX. Immediately after I was wrongly charged by another doctor, I contacted him to let me know, when he told me that I was responsible for this charge and that I h...",Closed with explanation


In [3]:
train_df.dtypes

converted_text      object
Complaint-Status    object
dtype: object

In [4]:
train_df['clean_text'] = train_df['converted_text'].astype('str')
train_df['clean_text'] = train_df['clean_text'].apply(lambda x: re.sub(r'http\S+', '', x))
train_df['clean_text'] = train_df['clean_text'].apply(lambda x: re.sub(r'X+', '', x))

test_df['clean_text'] = test_df['converted_text'].astype('str')
test_df['clean_text'] = test_df['clean_text'].apply(lambda x: re.sub(r'http\S+', '', x))
test_df['clean_text'] = test_df['clean_text'].apply(lambda x: re.sub(r'X+', '', x))

In [5]:
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~,'

train_df['clean_text'] = train_df['clean_text'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
test_df['clean_text'] = test_df['clean_text'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

train_df['clean_text'] = train_df['clean_text'].str.lower()
test_df['clean_text'] = test_df['clean_text'].str.lower()

train_df['clean_text'] = train_df['clean_text'].str.replace("[0-9]", " ")
test_df['clean_text'] = test_df['clean_text'].str.replace("[0-9]", " ")

train_df['clean_text'] = train_df['clean_text'].apply(lambda x:' '.join(x.split()))
test_df['clean_text'] = test_df['clean_text'].apply(lambda x: ' '.join(x.split()))

In [6]:
del train_df['converted_text']
del test_df['converted_text']

train_df.head()

Unnamed: 0,Complaint-Status,clean_text
0,Closed with explanation,seterus inc. filed a false report with the major credit bureaus stating that my mortgage was late when they were receiving full repayment of the loan on . it should be noted that delay are not ass...
1,Closed with non-monetary relief,bankruptcy claim of chapter was filed in hawaii and included the creditor for an unknown amount assumed . chapter bankruptcy paid including
2,Closed with explanation,i was preparing the flight back to see for the funeral of a close relative and i stayed in a hotel at the airport. before leaving i hired a dog sitter and her friend who had referred me to people ...
3,Closed with explanation,the loan was paid in . in years after i moved from va i moved in months after paying the debt a representative contacted me suddenly indicating that my debt had not been paid and that i now owe th...
4,Closed with explanation,i got a care credit account for . immediately after i was wrongly charged by another doctor i contacted him to let me know when he told me that i was responsible for this charge and that i had to ...


In [25]:
train_df.to_csv('cleaned_data/train_data.csv', index=False)
test_df.to_csv('cleaned_data/test_data.csv', index=False)

# Get ELMO Embeddings for TRAIN data

In [None]:
import tensorflow_hub as hub
import tensorflow as tf

elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

def elmo_vectors(x):
  embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    return sess.run(tf.reduce_mean(embeddings,1))

In [None]:
X = np.zeros(1024)

for i in range(0, train_df.shape[0], 100):
    #np.save('embeddings/'+str(i), elmo_vectors(train_df['clean_text'].iloc[i:i+100]))
    X = np.vstack([X, elmo_vectors(train_df['clean_text'].iloc[i:i+100])])
    print (i)
    
# path = os.path.join(os.path.join(os.getcwd(), 'embeddings'), '*.npy')
# files=glob.glob(path)   
# for file in files: 
#     X = np.vstack([X, np.load(file)])

In [None]:
print (X.shape)
X = X[1:]

from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()
Y = lbl.fit_transform(train_df['Complaint-Status'])

# Performing Cross-Validation With 3 Folds

In [3]:
import xgboost as XGB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

model = XGB.XGBClassifier()

n_splits = 3
score = 0
skfolds = StratifiedKFold(n_splits=n_splits, random_state=42)

for train_index, test_index in skfolds.split(X, Y):
    X_train_folds = X[train_index]
    y_train_folds = Y[train_index]
    X_test_fold = X[test_index]
    y_test_fold = Y[test_index]
    
    model.fit(X_train_folds, y_train_folds)
    y_pred = model.predict(X_test_fold)
    local_score = f1_score(y_pred, y_test_fold, average='weighted')
    print (local_score)
    score += local_score

print (score/n_splits)

0.8841595656029829
0.8843937342873719
0.8844621513944223
0.8843384837615923
