In [0]:
# Import relevant packages

import numpy as np
import pandas as pd
import re

In [2]:
# Retrieve csv from drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Store dataset in pandas dataframe

master_df = pd.read_csv('/content/drive/My Drive/Consumer_Complaints.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
# Only keep rows with complaint narratives

proc_df = master_df.dropna(subset=['Consumer complaint narrative'])

In [0]:
# Take sample of even amounts of timely and un-timely responses

new_df = proc_df[proc_df['Timely response?']=='Yes'].sample(5000)

new_df2 = proc_df[proc_df['Timely response?']=='No'].sample(5000)

df = new_df.append(new_df2)

In [0]:
# Clean the data with regex, so that there are no numbers, special characters, or Xs

clean = [re.sub('[^A-WY-Za-z]', ' ', nar) for nar in df['Consumer complaint narrative']]

df['Cleaned narratives'] = [nar for nar in clean]

In [7]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

split_nars = [nar.split() for nar in df['Cleaned narratives']] # split up words in narratives

lem_nars = [[lemmatizer.lemmatize(word) for word in nar] for nar in split_nars] # lemmatize each word in each narrative

df['Cleaned narratives'] = [' '.join(nar) for nar in lem_nars] # adjoin list of words by spaces for each narrative

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
# Split the data into train and test data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Cleaned narratives'], 
                                                       df['Timely response?'], random_state=30)

In [0]:
# Import relevant packages, such as TfidfVectorizer, StratifiedKFold, etc. And then vectorize the text with Tfidf.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), min_df=3)

X_train = vectorizer.fit_transform(X_train)

X_test = vectorizer.transform(X_test)

In [0]:
# Define an f1 scorer to score the results

def f1_scorer(tp, fp, fn):

  precision = tp / (tp + fp)
  recall = tp / (tp + fn)

  f1 = (2 * precision * recall) / (precision + recall)

  return f1

In [11]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')

param_grid = {'C': [0.45, 0.5, 0.55, 0.6]}

cv_input = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid = GridSearchCV(svc, param_grid, cv=cv_input).fit(X_train, y_train)

print(f'Best param: {grid.best_params_}')

print(f'Train score: {grid.score(X_train, y_train)}')

print(f'Test score: {grid.score(X_test, y_test)}')

tn, fp, fn, tp = confusion_matrix(y_test, grid.predict(X_test)).ravel()

print(f'The F1 score is: {f1_scorer(tp, fp, fn)}')

Best param: {'C': 0.6}
Train score: 0.8713333333333333
Test score: 0.7004
The F1 score is: 0.6944104447164422
