In [1]:
__author__      = 'Yu Cao'

import pandas as pd
import numpy as np
import collections
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score


In [2]:
def loadCorpus(corpus_path, random_state=None):
    """Read a corpus CSV, attach parsed pair counts, and return it shuffled.

    Args:
        corpus_path: Path (or anything ``pd.read_csv`` accepts) of the CSV.
            The file must contain a ``pairs_diff`` column.
        random_state: Optional seed forwarded to ``sklearn.utils.shuffle``.
            The default ``None`` keeps the original unseeded behaviour; pass
            an int to get the same row order on every run.

    Returns:
        A row-shuffled DataFrame with missing values replaced by ``''`` and
        an extra ``count`` column holding ``rawToCount(pairs_diff)`` per row
        (a dict, or ``None`` when the row has too few fields to parse).
    """
    df = pd.read_csv(corpus_path).fillna('')
    # Build the column against df.index explicitly so values stay aligned
    # with their rows whatever index read_csv produced.
    df['count'] = pd.Series(
        [rawToCount(raw) for raw in df['pairs_diff']],
        index=df.index,
    )
    return shuffle(df, random_state=random_state)

In [3]:
def rawToCount(pairs_diff):
    """Parse a raw ``pairs_diff`` string into a ``{key: max value}`` dict.

    The input is stripped of ``'@'`` characters and split on tabs. The last
    tab-separated field is always discarded; every remaining field is read
    as a comma-separated ``first,second,value`` triple. The dict key is
    ``first + '_' + second`` and, when a key repeats, the largest value is
    kept.

    Args:
        pairs_diff: Raw string to parse.

    Returns:
        The dict described above, or ``None`` when the string splits into
        fewer than three tab-separated fields.
    """
    fields = pairs_diff.replace('@', '').split('\t')
    if len(fields) < 3:
        return None

    best = {}
    for field in fields[:-1]:
        # Use 'я' as a temporary delimiter; a doubled delimiter is turned
        # back into a literal comma followed by one delimiter, so a comma
        # that is itself a token survives the split.
        parts = field.replace(',', 'я').replace('яя', ',я').split('я')
        key = '_'.join(parts[:2])
        value = float(parts[2])
        if key not in best or best[key] < value:
            best[key] = value

    return best

In [4]:
def vocabBuild(countList, freqThreshold=20):
    """Build the vocabulary of keys that occur in enough rows.

    Args:
        countList: Iterable of per-row count dicts; ``None`` entries are
            skipped.
        freqThreshold: Minimum number of rows a key must appear in to be
            kept (inclusive).

    Returns:
        List of keys whose row frequency is ``>= freqThreshold``, ordered
        from most to least frequent. Empty when nothing qualifies.
    """
    vocab = collections.Counter()
    for item in countList:
        if item is not None:
            # Each dict contributes each of its keys once, so the counter
            # holds the number of rows containing a key.
            vocab.update(item.keys())

    # Filter directly instead of slicing at a loop index: the index-based
    # version dropped the last key when every key met the threshold and
    # failed outright on an empty vocabulary.
    return [key for key, freq in vocab.most_common() if freq >= freqThreshold]

In [5]:
def embed(vocab, df):
    """Add a ``vector`` column to ``df`` by projecting ``count`` onto ``vocab``.

    Each non-``None`` count dict becomes a float array of ``len(vocab)``
    entries: position ``i`` holds the dict's value for ``vocab[i]``, and
    positions whose key is absent from the dict hold ``-100.0``. Keys that
    are not in ``vocab`` are ignored. Rows whose count is ``None`` get a
    ``None`` vector.

    Args:
        vocab: Sequence of hashable keys defining the vector layout.
        df: DataFrame with a ``count`` column; modified in place.

    Returns:
        None. The result is written to ``df['vector']``.
    """
    MISSING = -100.  # value stored for vocabulary keys the row does not have
    lenVocab = len(vocab)

    # Precompute key -> position once, replacing two linear list scans per
    # key. setdefault keeps the first position of a duplicated key, which
    # is what list.index would return.
    position = {}
    for slot, key in enumerate(vocab):
        position.setdefault(key, slot)

    def process(count):
        if count is None:
            return None
        vector = np.full(lenVocab, MISSING)
        for key, value in count.items():
            slot = position.get(key)
            if slot is not None:
                vector[slot] = value
        return vector

    df['vector'] = pd.Series(
        [process(count) for count in df['count']],
        index=df.index,
    )

In [6]:
def modelTrain(df, random_state=None):
    """Fit the four baseline classifiers on the embedded rows of ``df``.

    Rows whose ``vector`` is ``None`` are left out of the training data.

    Args:
        df: DataFrame with a ``vector`` column (feature arrays or ``None``)
            and a ``label`` column.
        random_state: Optional seed passed to the decision tree and the
            random forest, the two estimators here that are randomised.
            The default ``None`` keeps the original unseeded behaviour.

    Returns:
        List of fitted models, in the order GaussianNB, SVC,
        DecisionTreeClassifier, RandomForestClassifier.
    """
    modelList = [
        GaussianNB(),
        SVC(),
        tree.DecisionTreeClassifier(random_state=random_state),
        RandomForestClassifier(random_state=random_state),
    ]

    # Collect features and labels together so they stay aligned after the
    # rows without a vector are skipped.
    X, y = [], []
    for vector, label in zip(df['vector'], df['label']):
        if vector is not None:
            X.append(vector)
            y.append(label)

    for model in modelList:
        model.fit(X, y)

    return modelList

In [7]:
def evalModels(df, modelList):
    """Print precision, recall and F1 of every model on the rows of ``df``.

    Rows whose ``vector`` is ``None`` are left out of the evaluation.

    Args:
        df: DataFrame with a ``vector`` column (feature arrays or ``None``)
            and a ``label`` column.
        modelList: Iterable of fitted models exposing ``predict``.

    Returns:
        None. One report per model is written to standard output.
    """
    X, y = [], []
    for vector, label in zip(df['vector'], df['label']):
        if vector is not None:
            X.append(vector)
            y.append(label)

    for model in modelList:
        y_pred = model.predict(X)
        print(
            # Take the class name directly: slicing str(model) at the first
            # '(' chops the last character when the string has no '('.
            '\n*** %s ***\n' % type(model).__name__,
            'Prec ' + str(precision_score(y, y_pred)),
            'Reca ' + str(recall_score(y, y_pred)),
            'F1 ' + str(f1_score(y, y_pred)),
        )

In [8]:
'''
main
'''
# End-to-end run: load both splits, build the vocabulary, embed, train, evaluate.
# Paths are relative to the notebook's working directory.
trainSet = loadCorpus('train.csv')
testSet = loadCorpus('test.csv')
# The vocabulary comes from the training split only; the same vocabulary is
# then used to embed both splits so their vectors share one layout.
vocab = vocabBuild(trainSet['count'])

# embed() writes a 'vector' column into each frame in place.
embed(vocab, trainSet)
embed(vocab, testSet)
modelList = modelTrain(trainSet)
# Prints precision / recall / F1 per model for the test split.
evalModels(testSet, modelList)


*** GaussianNB ***
 Prec 0.5374094931617055 Reca 0.801920768307323 F1 0.6435452793834295

*** SVC ***
 Prec 0.5108418367346939 Reca 0.9615846338535414 F1 0.6672219908371512

*** DecisionTreeClassifier ***
 Prec 0.5423623995052567 Reca 0.5264105642256903 F1 0.534267438318611

*** RandomForestClassifier ***
 Prec 0.5625909752547307 Reca 0.4639855942376951 F1 0.5085526315789474
