In [1]:
from nltk.classify import NaiveBayesClassifier
# Toy training set: each item is (binary feature dict, label).
train = [
    ({"a": 1, "b": 1, "c": 1}, 'y'),
    ({"a": 1, "b": 1, "c": 1}, 'x'),
    ({"a": 1, "b": 1, "c": 0}, 'y'),
    ({"a": 0, "b": 1, "c": 1}, 'x'),
    ({"a": 0, "b": 1, "c": 1}, 'y'),
    ({"a": 0, "b": 0, "c": 1}, 'y'),
    ({"a": 0, "b": 1, "c": 0}, 'x'),
    ({"a": 0, "b": 0, "c": 0}, 'x'),
    ({"a": 0, "b": 1, "c": 1}, 'y'),
]

# Unlabelled feature dicts to be classified.
test = [
    {"a": 1, "b": 0, "c": 1},
    {"a": 1, "b": 0, "c": 0},
    {"a": 0, "b": 1, "c": 1},
    {"a": 0, "b": 1, "c": 0},
]

classifier = NaiveBayesClassifier.train(train)

# Predicted label for every test item.
labels = classifier.classify_many(test)
print(labels)

# Per-item probability of label 'x' followed by label 'y'.
probs = classifier.prob_classify_many(test)
for pdist in probs:
    print("%.4f %.4f" % (pdist.prob('x'), pdist.prob('y')))


classifier.show_most_informative_features()

['y', 'x', 'y', 'x']
0.3203 0.6797
0.5857 0.4143
0.3792 0.6208
0.6470 0.3530
Most Informative Features
                       c = 0                   x : y      =      2.0 : 1.0
                       c = 1                   y : x      =      1.5 : 1.0
                       a = 1                   y : x      =      1.4 : 1.0
                       b = 0                   x : y      =      1.2 : 1.0
                       a = 0                   x : y      =      1.2 : 1.0
                       b = 1                   y : x      =      1.1 : 1.0


In [2]:
def build_voc(data):
    """Build a vocabulary from a labelled corpus.

    Each sentence is lower-cased and split on whitespace; every resulting
    token becomes a key of the returned dict.

    Args:
        data: iterable of ``(sentence, label)`` pairs. The label is ignored.

    Returns:
        dict mapping each distinct token to ``True``, in first-seen order.
    """
    return {
        token: True
        for sentence, _label in data
        for token in sentence.lower().split()
    }


def feature(data,v):
    """Turn labelled sentences into (feature dict, label) pairs.

    For every sentence a dict is created with one entry per key of ``v``,
    initialised to 0. Each lower-cased, whitespace-separated token of the
    sentence is then set to 1; tokens that are not keys of ``v`` are added
    to the dict as well.

    Args:
        data: iterable of ``(sentence, label)`` pairs.
        v: vocabulary dict; only its keys are used.

    Returns:
        list of ``(feature_dict, label)`` tuples, in input order.
    """
    labelled = []
    for sentence, label in data:
        # Start from an all-zero indicator for every vocabulary word.
        indicators = dict.fromkeys(v.keys(), 0)
        # Flip on every token that occurs in this sentence.
        indicators.update((token, 1) for token in sentence.lower().split())
        labelled.append((indicators, label))
    return labelled

def classify_eval(truth,pred):
    """Compute precision, recall, F-score and accuracy for binary labels.

    Labels are compared against 1 (positive) and 0 (negative), so ``True`` /
    ``False`` work as well. Pairs in which either label is neither 1 nor 0
    are not counted. ``pred`` is indexed by the position of each item in
    ``truth``.

    Args:
        truth: iterable of ground-truth labels.
        pred: indexable sequence of predicted labels.

    Returns:
        tuple ``(precision, recall, f_score, accuracy)``; a metric whose
        numerator is zero is returned as 0.
    """
    tp = fp = tn = fn = 0
    for position, actual in enumerate(truth):
        guess = pred[position]
        if actual == 1:
            if guess == 1:
                tp += 1
            elif guess == 0:
                fn += 1
        elif actual == 0:
            if guess == 0:
                tn += 1
            elif guess == 1:
                fp += 1

    # Without true positives both precision and recall are defined as 0.
    if tp == 0:
        precision = 0
        recall = 0
    else:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

    if precision == 0 or recall == 0:
        f_score = 0
    else:
        f_score = 2 * precision * recall / (precision + recall)

    correct = tp + tn
    if correct == 0:
        accuracy = 0
    else:
        accuracy = correct / (tp + tn + fp + fn)
    return (precision, recall, f_score, accuracy)



In [3]:




# Labelled training sentences as (sentence, label) pairs.
# NOTE(review): "dominiated" looks like a typo for "dominated"; it is kept
# as-is because it becomes a vocabulary token and the recorded outputs below
# were produced with it.
train_corpus = [("The team dominiated the game", True),
                ("The game was intense", True),
                ("The ball went off the court", True),
                ("They had the ball for the whole game", True),
                ("The President did not comment", False),
                ("The show is over", False),
               ]
# Vocabulary is built from the training sentences only.
v = build_voc(train_corpus)

train_feature_label = feature(train_corpus, v)
NBC = NaiveBayesClassifier.train(train_feature_label)

# Held-out sentences with their expected labels.
# NOTE(review): "This is a book" is labelled True although it shares no
# vocabulary with the True training sentences -- confirm the label is intended.
# NOTE(review): "catched" is presumably meant to be "caught"; left unchanged.
test_corpus = [("I lost the keys", False),
                ("The goalkeeper catched the ball", True),
                ("The other team controlled the ball", True),
                ("Sara has two kids", False),
                ("This is a book", True),
               ]
# Split the (features, label) pairs into two parallel lists.
test_feature = []
test_labels = []
for (ftr, label) in feature(test_corpus, v):
    test_feature.append(ftr)
    test_labels.append(label)
    
pred_labels = NBC.classify_many(test_feature)
# Evaluate once and reuse the result when printing below.
perf = classify_eval(test_labels, pred_labels)

print(pred_labels)
print(test_labels)

print("Precision = %.4f, Recall = %.4f, F-score = %.4f, Accuracy = %.4f " % 
      perf)

# Per-sentence probability of label True followed by label False.
probs = NBC.prob_classify_many(test_feature)
for pdist in probs:
    print("%.4f %.4f" % (pdist.prob(True), pdist.prob(False)))


NBC.show_most_informative_features(10)


[True, True, True, True, False]
[False, True, True, False, True]
Precision = 0.5000, Recall = 0.6667, F-score = 0.5714, Accuracy = 0.4000 
0.7776 0.2224
0.9459 0.0541
0.9740 0.0260
0.6602 0.3398
0.1775 0.8225
Most Informative Features
                    game = 0               False : True   =      2.8 : 1.0
                 comment = 0                True : False  =      1.8 : 1.0
                     did = 0                True : False  =      1.8 : 1.0
                      is = 0                True : False  =      1.8 : 1.0
                     not = 0                True : False  =      1.8 : 1.0
                    over = 0                True : False  =      1.8 : 1.0
               president = 0                True : False  =      1.8 : 1.0
                    show = 0                True : False  =      1.8 : 1.0
                    ball = 0               False : True   =      1.7 : 1.0
                   court = 0               False : True   =      1.2 : 1.0


In [4]:
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier

# Wrap scikit-learn's LinearSVC in NLTK's classifier interface and fit it on
# the same (feature dict, label) pairs used for the Naive Bayes model.
SVMC= SklearnClassifier(LinearSVC())
SVMC.train(train_feature_label)

# Classify the whole test set in one batch call, mirroring the
# NBC.classify_many usage, instead of appending one prediction at a time.
pred_labels_SVM = SVMC.classify_many(test_feature)
    
print(pred_labels_SVM)
print("Precision = %.4f, Recall = %.4f, F-score = %.4f, Accuracy = %.4f " % 
      classify_eval(test_labels, pred_labels_SVM))


[True, True, True, True, False]
Precision = 0.5000, Recall = 0.6667, F-score = 0.5714, Accuracy = 0.4000 
