In [15]:
import pandas as pd

import numpy as np

from fuzzywuzzy import fuzz

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix

import re

import string

import warnings

warnings.filterwarnings("ignore")

Summary: Achieved high accuracy levels (>0.99) through pre-processing,
         feature creation, and a Random Forest model with minimal tuning.

Future work: Minimize classification error rate using machine learning
             techniques and existing feature set.


## Pre-processing

In [2]:
# Read in the data

train = pd.DataFrame.from_csv('train.csv')

test = pd.DataFrame.from_csv('test.csv')

# Reset index 

train = train.reset_index(drop=True)

test = test.reset_index(drop=True)

0.39554185439886175

In [7]:
# Balance check: ~ 0.40 of train cases match, similar proportion in test df

sum(train['x_id'] == train['y_id'])/len(train)

0.39582592197320055

In [8]:
sum(test['x_id'] == test['y_id'])/len(test)

0.39554185439886175

In [9]:
# Check for null values: none found

train.isnull().sum()

x_description    0
x_id             0
y_description    0
y_id             0
dtype: int64

In [10]:
test.isnull().sum()

x_description    0
x_id             0
y_description    0
y_id             0
dtype: int64

In [16]:
# Split into feature set and response

x_train = train[['x_description', 'y_description']]

x_test = test[['x_description', 'y_description']]

y_train = train['x_id'] == train['y_id']

y_test = test['x_id'] == test['y_id']

# Convert training sets to lowercase and remove punctuation

def tidy(text):
    text = text.lower()
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')
    return(text)

x_train = x_train.applymap(tidy)

x_test = x_test.applymap(tidy)

# Remove commonly used terms (with regex), extra spaces
word_list = ['(^| )inc($| )','(^| )incorporated($| )', '(^| )fund($| )', '(^| )fd($| )', 
             '(^| )ltd($| )', '(^| )limited($| )', '(^| )corporation($| )', '(^| )corp($| )', \
             '(^| )holdings($| )', '(^| )trust($| )', '(^| )company($| )', '(^| )co($| )', \
             '(^| )international($| )', '(^| )the($| )',  '(^| )class($| )',  '(^| )cl($| )', \
             '(^| )international($| )']

def remove_words(text, word_list):
    words = '|'.join(word_list)
    text = re.sub(words, ' ', text)
    text = re.sub('  ', ' ', text)
    text = re.sub('(^ | $)', '', text)
    return(text)
    
x_train = x_train.applymap(lambda x: remove_words(x, word_list))

x_test = x_test.applymap(lambda x: remove_words(x, word_list))

## Feature Creation

In [13]:
# Absolute value of the difference in the number of space-delimited terms of x,y

x_train['num_words'] = x_train.apply(lambda row: abs(len(str.split(row['x_description'])) - \
      len(str.split(row['y_description']))), axis = 1)

x_test['num_words'] = x_test.apply(lambda row: abs(len(str.split(row['x_description'])) - \
      len(str.split(row['y_description']))), axis = 1)

# Dummy variable for: does the first term match perfectly?

x_train['first_match'] = np.array(x_train.apply(lambda row: re.search('^[^\s]+', \
       row['x_description']).group() == re.search('^[^\s]+', row['y_description']).group(), \
       axis = 1), dtype = 'int')

x_test['first_match'] = np.array(x_test.apply(lambda row: re.search('^[^\s]+', \
      row['x_description']).group() == re.search('^[^\s]+', row['y_description']).group(), \
      axis = 1), dtype = 'int')

### Feature Creation Using fuzzywuzzy package

In [6]:
# Partial ratio

x_train['partial'] = x_train.apply(lambda row: fuzz.partial_ratio(row['x_description'], \
       row['y_description']), axis = 1)

x_test['partial'] = x_test.apply(lambda row: fuzz.partial_ratio(row['x_description'], \
      row['y_description']), axis = 1)

# Partial token set ratio

x_train['ptsr'] = x_train.apply(lambda row: fuzz.partial_token_set_ratio(row['x_description'], \
       row['y_description']), axis = 1)

x_test['ptsr'] = x_test.apply(lambda row: fuzz.partial_token_set_ratio(row['x_description'], \
      row['y_description']), axis = 1)

# Token sort

x_train['token_sort'] = x_train.apply(lambda row: fuzz.token_sort_ratio(row['x_description'], \
       row['y_description']), axis = 1)

x_test['token_sort'] = x_test.apply(lambda row: fuzz.token_sort_ratio(row['x_description'], \
      row['y_description']), axis = 1)

# Token set

x_train['token_set'] = x_train.apply(lambda row: fuzz.token_set_ratio(row['x_description'], \
       row['y_description']), axis = 1)

x_test['token_set'] = x_test.apply(lambda row: fuzz.token_set_ratio(row['x_description'], \
      row['y_description']), axis = 1)

# Ratio

x_train['ratio'] = x_train.apply(lambda row: fuzz.ratio(row['x_description'], \ 
      row['y_description']), axis = 1)

x_test['ratio'] = x_test.apply(lambda row: fuzz.ratio(row['x_description'], \
      row['y_description']), axis = 1)

# Partial ratio for series of first letters in each term

def firstletter(x):
    L = str.split(x)
    newstring = ' '.join([item[:1] for item in L])
    return(newstring)
    
x_train['first_letter'] = x_train.apply(lambda row: \
       fuzz.partial_ratio(firstletter(row['x_description']), firstletter(row['y_description'])), \
       axis = 1)

x_test['first_letter'] = x_test.apply(lambda row: \
      fuzz.partial_ratio(firstletter(row['x_description']), firstletter(row['y_description'])), \
      axis = 1)

# Token set ratio for string with no vowels

def novowels(x):
    return(''.join([l for l in x if l not in ['a','e','i','o','u']]))

x_train['no_vowels'] = x_train.apply(lambda row: fuzz.token_set_ratio(novowels(row['x_description']), \
       novowels(row['y_description'])), axis = 1)

x_test['no_vowels'] = x_test.apply(lambda row: fuzz.token_set_ratio(novowels(row['x_description']), \
      novowels(row['y_description'])), axis = 1)

# Token set ratio for first term

x_train['first_ratio'] = x_train.apply(lambda row: fuzz.token_set_ratio(re.search('^[^\s]+', \
       row['x_description']).group(), re.search('^[^\s]+', row['y_description']).group()), axis = 1)

x_test['first_ratio'] = x_test.apply(lambda row: fuzz.token_set_ratio(re.search('^[^\s]+', \
       row['x_description']).group(), re.search('^[^\s]+', row['y_description']).group()), axis = 1)

# Partial ratio for series of first + last letters in each term

def fl_letter(x):
    L = str.split(x)
    newstring = ' '.join([item[:1]+item[-1:] if len(item) != 1 else item[-1:] for item in L])
    return(newstring)
    
x_train['first_last'] = x_train.apply(lambda row: fuzz.partial_ratio(fl_letter(row['x_description']), \
       fl_letter(row['y_description'])), axis = 1)

x_test['first_last'] = x_test.apply(lambda row: fuzz.partial_ratio(fl_letter(row['x_description']), \
      fl_letter(row['y_description'])), axis = 1)

## Modeling

In [7]:
# Omitting the string descriptions / only using created features

x_train = x_train.iloc[:,2:]

x_test = x_test.iloc[:,2:]

# Fit the model

model = RandomForestClassifier(n_estimators = 400, random_state = 0)

model.fit(x_train, y_train)

# Train score

model.score(x_train, y_train)

0.9999407091189375

## Results

In [9]:
# Train confusion matrix

confusion_matrix(y_train,model.predict(x_train))

array([[10189,     1],
       [    0,  6676]], dtype=int64)

In [10]:
# Test confusion matrix: 43 / 4217 misclassified

confusion_matrix(y_test,model.predict(x_test))

array([[2531,   18],
       [  25, 1643]], dtype=int64)

In [11]:
# Shown as a proportion of total cases

confusion_matrix(y_test,model.predict(x_test))/len(y_test)

array([[0.60018971, 0.00426844],
       [0.00592839, 0.38961347]])

In [16]:
# Output exceptions for further review

miss = np.ravel(np.where(np.array(y_test != model.predict(x_test))))

miss_feat = test.loc[miss,:]

miss_y = y_test.loc[miss]

miss_df = pd.concat([miss_y, miss_feat], axis = 1)

In [27]:
# Cases that were misclassified shown below

miss_df=miss_df.rename(columns = {0:'True Class'})

miss_df

Unnamed: 0,True Class,x_description,x_id,y_description,y_id
75,True,Htfd Intl Sm Co R5,49861,The Hartford International Small Company Fund5,49861
219,True,Cleveland-Cliffs Inc.,42461,CLIFFS NAT RES INC COM,42461
250,True,"A-Power Energy Generation Systems, Ltd.",7537,Astropower Inc No Stockholder Equity 12/27/2004,7537
263,False,Pimco Municipal Income Fund II of Beneficial I...,32482,First Trust MLP and Energy Income Fund of Bene...,20075
285,True,State Street Institutional International Equit...,52230,GE INT'L FUND,52230
359,True,WF Fund Admin Class,23394,Wells Fargo Common Stk Adm CL,23394
396,True,Wells Fargo Target 2050 Fund Admin Class,40076,Vaneck Merk Gold Trust,40076
453,False,Neuberger Berman Global Allocation Fd Cl C,16255,Neuberger Berman Large Cap Value Fd Inst Cl,13320
472,False,Wells Fargo California Limited Term Tax-Free Fund,16674,Wells Fargo Advantage Enterprise Fund Investor...,52342
789,True,Westinghouse Air Brake Technologies Corporation,35369,WABTEC,35369
