In [81]:
import xml.etree.ElementTree as ET
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from matplotlib import pyplot as plt

# Module-level accumulators filled by parsetrain() / parsetest().
# Each text list is kept parallel to its label list (one entry per RECORD).
data, target = [], []              # training texts / integer labels
test_data, test_target = [], []    # held-out texts / integer labels

# Class names, listed in the same order as the integer labels 1-4 that the
# parsers assign (1=current, 2=past, 3=non, 4=unknown).
categories = ["current", "past", "non", "unknown"]

In [82]:
def _smoking_label(status):
    """Translate a SMOKING ``STATUS`` attribute string into an integer label.

    The substring checks run in a fixed order: ``"UNK"`` -> 4, ``"NON"`` -> 3,
    ``"PAST"`` -> 2.  Any other status falls through to 1.
    """
    if "UNK" in status:
        return 4
    if "NON" in status:
        return 3
    if "PAST" in status:
        return 2
    return 1


def _load_records(filename, texts, labels, skip_chars):
    """Parse ``filename`` and append every RECORD's text and label in place.

    Both values of a record are computed before either list is touched, so a
    malformed record (e.g. a missing SMOKING element) raises without leaving
    ``texts`` one entry longer than ``labels``.
    """
    root = ET.parse(filename).getroot()
    for record in root.findall('RECORD'):
        # An empty <TEXT/> element has ``text is None``; treat it as "".
        raw_text = record.find('TEXT').text or ""
        body = str(raw_text)[skip_chars:]
        label = _smoking_label(str(record.find("SMOKING").attrib['STATUS']))
        texts.append(body)
        labels.append(label)


def parsetrain(filename, skip_chars=50):
    """Append the records of a training XML file to ``data`` / ``target``.

    Parameters
    ----------
    filename : str
        Path of the XML file; every ``RECORD`` child of the root is read.
    skip_chars : int, optional
        Number of leading characters dropped from each record's TEXT
        (default 50, the value previously hard-coded).
        NOTE(review): presumably this skips a fixed-width record header --
        confirm against the data files.

    Raises
    ------
    AttributeError
        If a RECORD has no TEXT or no SMOKING child element.
    KeyError
        If a SMOKING element has no STATUS attribute.
    """
    _load_records(filename, data, target, skip_chars)


def parsetest(filename, skip_chars=50):
    """Append the records of a test XML file to ``test_data`` / ``test_target``.

    Takes the same parameters and raises the same errors as ``parsetrain``;
    only the destination lists differ.
    """
    _load_records(filename, test_data, test_target, skip_chars)

In [83]:
# parsetrain()/parsetest() append to the shared lists, so empty them in place
# first: re-running this cell then reloads the corpus instead of adding a
# second copy of every record.
del data[:], target[:], test_data[:], test_target[:]

# Fill data/target from the training file and test_data/test_target from the
# ground-truth test file.
parsetrain('smokers_surrogate_train_all_version2.xml')
parsetest('smokers_surrogate_test_all_groundtruth_version2.xml')

In [91]:
# TF-IDF feature extraction - will be used for the rest of the tests.

# Raw token counts; fitting here also learns the vocabulary from the training texts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data)

# Term-frequency-only weighting (IDF switched off).
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit(X_train_counts).transform(X_train_counts)

# Full TF-IDF weighting of the same count matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# Multinomial naive Bayes trained on the TF-IDF matrix.
clf = MultinomialNB().fit(X_train_tfidf, target)

In [85]:
# Push the test texts through the already-fitted vectorizer and TF-IDF
# transformer (transform only, no refitting), then classify them.
X_new_counts = count_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Accuracy = share of predictions equal to the ground-truth label.
hits = predicted == test_target
print(np.mean(hits))

0.605769230769


In [86]:
# Counts -> TF-IDF -> multinomial naive Bayes, bundled into one Pipeline so
# fitting and predicting each take a single call on the raw texts.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)
_ = text_clf.fit(data, target)

# Accuracy on the held-out test set.
predicted = text_clf.predict(test_data)
hits = predicted == test_target
print(np.mean(hits))

0.605769230769


In [87]:
# Counts -> TF-IDF -> linear model trained with stochastic gradient descent.
# Despite the `_svm` suffix, loss='log' selects logistic loss, not hinge loss.
# NOTE(review): `n_iter` and loss='log' are spelled differently in newer
# scikit-learn releases -- confirm against the installed version before upgrading.
sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log',
                          penalty='elasticnet',
                          alpha=1e-3,
                          n_iter=5,
                          random_state=42)),
]
text_clf_svm = Pipeline(sgd_steps)
_ = text_clf_svm.fit(data, target)

# Accuracy on the held-out test set.
predicted = text_clf_svm.predict(test_data)
hits = predicted == test_target
print(np.mean(hits))

0.625


In [88]:
# Counts -> TF-IDF -> random forest (10 trees, entropy split criterion).
# random_state is pinned, matching the seeded SGD cell, so the printed
# accuracy is the same on every run instead of varying with the forest's
# random bootstrap/feature sampling.
text_clf_rdf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier(n_estimators=10,
                                                        criterion="entropy",
                                                        random_state=42)),
])
_ = text_clf_rdf.fit(data, target)

# Accuracy on the held-out test set.
predicted = text_clf_rdf.predict(test_data)
print(np.mean(predicted == test_target))

0.634615384615


In [89]:
# Counts -> TF-IDF -> single decision tree with default settings.
# random_state is pinned, matching the seeded SGD cell: the tree builder
# permutes candidate features at each split, so an unseeded tree can give a
# different accuracy from one run to the next.
text_clf_dct = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', tree.DecisionTreeClassifier(random_state=42)),
])
_ = text_clf_dct.fit(data, target)

# Accuracy on the held-out test set.
predicted = text_clf_dct.predict(test_data)
print(np.mean(predicted == test_target))

0.740384615385


In [None]:
#conclusion - TF-IDF feature extraction doesn't really work (best test accuracy above is ~0.74, from the decision tree) - move back to n-gram features