In [8]:
from util import load_data, process_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
import numpy as np
import nltk
import string
import os
from nltk.stem.porter import PorterStemmer

In [9]:
def avg(l):
    return sum(l) * 1.0 / len(l)

def prior(l, label):
    return sum(map(lambda x: x[1] == label, l)) * 1.0 / len(l)

In [10]:
def cross_validation_decomp(data, group_size = 10):
    for i in range(group_size):
        sub_size = len(data) / group_size
        first_half = data[:(i-1)*sub_size] if i > 0 else []
        second_half = data[(i+1)*sub_size:] if i < group_size - 1 else []
        train_data = first_half + second_half
        test_data = data[i*sub_size:(i+1)*sub_size]
        yield (train_data, test_data)

In [11]:
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

def run_classifer(classifier, use_stemmer=False, dimension_reduction=False):
    data = load_data()
    for key in data:

        accuracies = []
        sub_acc = defaultdict(list)
        sub_recall = defaultdict(list)
        category = set([tweet[1] for tweet in data[key]])

        for train_data, test_data in cross_validation_decomp(data[key]):
            
            if use_stemmer:
                training_text = [process_text(tweet[0]) for tweet in train_data]
                testing_text = [process_text(tweet[0]) for tweet in test_data]
            else:
                training_text = [tweet[0] for tweet in train_data]
                testing_text = [tweet[0] for tweet in test_data]

            training_label = [tweet[1] for tweet in train_data]
            testing_label = [tweet[1] for tweet in test_data]

            if use_stemmer:
                vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1,3))
            else:
                vectorizer = TfidfVectorizer(ngram_range=(1,3))
            training_matrix = vectorizer.fit_transform(training_text)
            testing_matrix = vectorizer.transform(testing_text)


            weight_mode = None
            for label in category:
                if prior(train_data,label) < 0.1:
                    weight_mode = 'balanced'
                    break

            if dimension_reduction: 
                dimension_reduction = TruncatedSVD(n_components = 1000)
                training_matrix = dimension_reduction.fit_transform(training_matrix)
                testing_matrix = dimension_reduction.transform(testing_matrix)
            
            if classifier == 'LinearSVC':
                clf = LinearSVC(class_weight = weight_mode)
            elif classifier == 'RandomForest':
                clf = RandomForestClassifier(n_estimators=10)
            elif classifier == 'AdaBoostClassifier':
                clf = AdaBoostClassifier(n_estimators=100)
            elif classifier == 'BaggingClassifier':
                clf = BaggingClassifier(LinearSVC(),max_samples=0.5, max_features=0.7)

            clf.fit(training_matrix, training_label)
            prediction = clf.predict(testing_matrix)
            
            for label in category:
                sub_acc[label].append(sum([a[0] == a[1] and a[0] == label for a in zip(testing_label, prediction)]) * 1.0 / (sum([a == label for a in testing_label]) + 1))
                sub_recall[label].append(sum([a[0] == a[1] and a[1] == label for a in zip(testing_label, prediction)])* 1.0 / (sum([a == label for a in prediction]) + 1))
            accuracies.append(clf.score(testing_matrix,testing_label))
        
        print("===================")
        print(classifier)
        print(key+':')
        print("label     prior     accuracy  recall")
        for label in category:
            print("%-9s%1.7f  %1.7f  %1.7f" %(label, prior(data[key],label),avg(sub_acc[label]), avg(sub_recall[label])))

        print('overall accuracy:' + str(avg(accuracies)))
        print('\n\n')


In [None]:
run_classifer('LinearSVC', use_stemmer=True)

LinearSVC
Misc__other_plea_for_action:
label     prior     accuracy  recall
1        0.0549339  0.4576497  0.5656736
0        0.9450661  0.9817458  0.9676040
overall accuracy:0.955804480652



LinearSVC
Makes_a_Factual_or_Verifiable_Claim:
label     prior     accuracy  recall
1        0.2945793  0.3571564  0.5915518
0        0.7054207  0.9053897  0.7649735
overall accuracy:0.739557739558



LinearSVC
Sentiment____1_negative__0_neutral__1_positive_:
label     prior     accuracy  recall
1        0.3569109  0.7260078  0.6636059
0        0.3084416  0.4235621  0.5505926
-1       0.3346475  0.7726313  0.6823036
overall accuracy:0.645939675174



LinearSVC
Asks_for_Donation_Asks_you_to_buy_something_to_support_campaign:
label     prior     accuracy  recall
1        0.0029396  0.1166667  0.1500000
0        0.9970604  0.9979759  0.9957775
overall accuracy:0.997426470588



LinearSVC
Policy:
label     prior     accuracy  recall
0        0.7228217  0.9500935  0.8637899
1        0.0705436  0.66858

In [None]:
run_classifer('RandomForest')

In [None]:
run_classifer('AdaBoostClassifier')

In [None]:
run_classifer('BaggingClassifier')

In [None]:
run_classifer('LinearSVC', use_stemmer=True)