In [1]:
__author__      = 'Yu Cao'

import pandas as pd
import numpy as np
import collections
from string import punctuation as Punct
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
def loadCorpus(corpus_path):
    df = pd.read_csv(corpus_path)
    df = df.fillna('')
    df['count'] = pd.Series(
        [ rawToCount(x) for x in df['pairs_diff'] ], 
        index = df.index
    )
    return shuffle(df)

In [3]:
def isPunct(s):
    return s in Punct or s in ["``", "''"]

def rawToCount(pairs_diff):
    pairs_diff = pairs_diff.replace('@', '')
    tripleList = pairs_diff.split('\t')
    if len(tripleList) < 3: return None

    count = {}
    for triple in tripleList[:-1]:
        triple = triple.split(',')
        if len(triple) != 3 or isPunct(triple[0]) or isPunct(triple[1]): continue
        
        key = '_'.join(triple[:2])
        if (not key in count) or (count[key] < float(triple[2])):
            count[key] = float(triple[2])
        
    return count

In [4]:
def vocabBuild(countList, freqThreshold=20):
    vocab = collections.Counter()
    for item in countList: 
        if item is not None:
            vocab.update(item.keys())

    vocab = vocab.most_common()
    for c, item in enumerate(vocab):
        if item[1] < freqThreshold: break
    
    return [ x[0] for x in vocab[:c] ]

In [5]:
def embed(vocab, df):
    def process(count):
        if count is None: return None
        vector = np.empty(lenVocab)
        vector.fill(-1.)
        for key in count.keys(): 
            if key in vocab:
                 vector[vocab.index(key)] = count[key]
        
        return vector

    lenVocab = len(vocab)
    df['vector'] = pd.Series(
        [ process(x) for x in df['count'] ],
        index=df.index
    )

In [6]:
def removeNone(vectorDF, labelDF):
    tlist = [(v, l) for (v, l) in zip(vectorDF, labelDF) 
    if v is not None]
    return [v for (v, l) in tlist], [l for (v, l) in tlist]

def getBowVectorizer(df, smallerDim=100):
    vectorizer = CountVectorizer()
    dimReducer = PCA(n_components=smallerDim)
    _, corpus = removeNone(df['vector'], df['text'])
    vectorizer.fit(corpus)
    dimReducer.fit(vectorizer.transform(corpus).toarray())
    return vectorizer, dimReducer

def getDimReducer(df, smallerDim=100):
    dimReducer = PCA(n_components=smallerDim)
    X, _ = removeNone(df['vector'], df['label'])
    return dimReducer.fit(X)

def bowToArray(df, bowVectorizer, dimReducer):
    _, corpus = removeNone(df['vector'], df['text'])
    X_bow = bowVectorizer.transform(corpus)
    if dimReducer is not None:
        return dimReducer.transform(X_bow.toarray())
    else:
        return X_bow.toarray()

def getPredictorTarget(df, bowVectorizer=None, dimReducer=None):
    if bowVectorizer is None:
        X, y = removeNone(df['vector'], df['label'])
        if dimReducer is not None:
            X = dimReducer.transform(X)
    else:
        X = bowToArray(df, bowVectorizer, dimReducer)
        _, y = removeNone(df['vector'], df['label'])
    
    return X, y

In [7]:
def modelTrain(df, bowVectorizer=None, dimReducer=None):
    modelList = [
        GaussianNB(),
        SVC(),
        DecisionTreeClassifier(),
        RandomForestClassifier()
    ]
    X, y = getPredictorTarget(df, bowVectorizer, dimReducer)
    for model in modelList:
        model.fit(X, y)
    return modelList

In [8]:
def evalModels(df, modelList, bowVectorizer=None, dimReducer=None):
    X, y = getPredictorTarget(df, bowVectorizer, dimReducer)
    for model in modelList:
        y_pred = model.predict(X)
        print(
            '\n*** %s ***\n' % str(model)[:str(model).find('(')],
            'Prec ' + str(precision_score(y, y_pred)),
            'Reca ' + str(recall_score(y, y_pred)),
            'F1 ' + str(f1_score(y, y_pred)),
        )

In [9]:
'''
main
'''
trainSet = loadCorpus('train.csv')
testSet = loadCorpus('test.csv')
vocab = vocabBuild(trainSet['count'])

embed(vocab, trainSet)
embed(vocab, testSet)
modelList = modelTrain(trainSet, dimReducer=None)
evalModels(testSet, modelList, dimReducer=None)


*** GaussianNB ***
 Prec 0.5315656565656566 Reca 0.7581032412965186 F1 0.6249381494309749

*** SVC ***
 Prec 0.6138461538461538 Reca 0.4789915966386555 F1 0.5380984490896831

*** DecisionTreeClassifier ***
 Prec 0.5482695810564663 Reca 0.542016806722689 F1 0.5451252641110775

*** RandomForestClassifier ***
 Prec 0.5601131541725601 Reca 0.47539015606242496 F1 0.5142857142857143


In [10]:
dim_reducer = getDimReducer(trainSet)
modelList = modelTrain(trainSet, bowVectorizer=None, dimReducer=dim_reducer)
evalModels(testSet, modelList, bowVectorizer=None, dimReducer=dim_reducer)


*** GaussianNB ***
 Prec 0.5784708249496981 Reca 0.34513805522208885 F1 0.4323308270676691

*** SVC ***
 Prec 0.5946488294314382 Reca 0.5336134453781513 F1 0.5624802277760202

*** DecisionTreeClassifier ***
 Prec 0.5214541120381406 Reca 0.5252100840336135 F1 0.5233253588516746

*** RandomForestClassifier ***
 Prec 0.5652173913043478 Reca 0.4447779111644658 F1 0.4978165938864629


In [11]:
bowVectorizer, dimReducer = getBowVectorizer(trainSet)
modelList = modelTrain(trainSet, bowVectorizer, dimReducer)
evalModels(testSet, modelList, bowVectorizer, dimReducer)


*** GaussianNB ***
 Prec 0.5592689295039165 Reca 0.6428571428571429 F1 0.5981569394024016

*** SVC ***
 Prec 0.6736474694589878 Reca 0.46338535414165666 F1 0.5490753911806544

*** DecisionTreeClassifier ***
 Prec 0.5532811559301626 Reca 0.5516206482593037 F1 0.5524496543432522

*** RandomForestClassifier ***
 Prec 0.5823442136498517 Reca 0.47118847539015607 F1 0.5209024552090246
