In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from nltk.stem.porter import PorterStemmer
import nltk
import pandas as pd
import numpy as np
# import re
import matplotlib.pylab as plt



In [2]:
## functions for lexical analysis

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens,stemmer)
    return stems

def text2vec(full_text, use_stemmer=False):
    """
    Convert text to vectors using TFIDF
    ngram_range=(1,3) means unigrams, bigrams and trigrams; 
    if want to use bigrams only, define ngram_range=(2,2)
    
    """
    text=full_text[:]
    if use_stemmer:
        vectorizer = TfidfVectorizer(tokenizer=tokenize,ngram_range=(1,3))
    else:
        vectorizer = TfidfVectorizer(ngram_range=(1,3))
    text_vector = vectorizer.fit_transform(text)
    return text_vector

# Step 1: Load the Data
Load "human-coded-tweets" into a pandas dataframe. Data preprocessing (convert to lower case, remove punctuation, remove screen names. etc) should be done before this step.

In [25]:
stemmer = PorterStemmer()
data = pd.DataFrame.from_csv('ProcessedDataLowNoLinkNoPuncNoNames.csv')
# take the 'text' column from the dataframe, and 
# convert the text to vectors
full_text = data['text']
text_vec = text2vec(full_text, use_stemmer=True)

ValueError: np.nan is an invalid document, expected byte or unicode string.

# Step 2: Split the data
Split the data into a training set and a test set. Ideally the data should be shuffled before the split to avoid implicit bias in the dataset.

In [23]:
pred_train = text_vec[0:7638,] #7638 training tweets
pred_test = text_vec[7638:,] # 725 test tweets
pred_matrix = np.zeros((725,18))

index
1.000000e+00    the sheer size and remoteness of the federal b...
2.000000e+00    rt pilotnews ex va gov jim gilmore not giving ...
3.000000e+00                                listen here cruzcrew 
4.000000e+00    rt freebeacon tedcruz on jailing of christian ...
5.000000e+00    just one candidate is in a strong position to ...
6.000000e+00               champions welcome to nyc ussoccer wnt 
7.000000e+00    rt reprodblum today i voted no on hr2048 the u...
8.000000e+00    thanks for joining cruzcountry see y all again...
9.000000e+00    hey cpac john kasich s plan returns power mone...
1.000000e+01    rt thebriefing2016 you re right johnkasich the...
1.100000e+01    rt leedanielsent imwithher because hillaryclin...
1.200000e+01    rt tomnocera the reagan trump images are inspi...
1.300000e+01    humbled amp honored by the overflow crowds the...
1.400000e+01    rt ie4bernie one hour until stay and this hous...
1.500000e+01    this account will be run by campaign staff fro...
1.60

# Step 3: Pick the right models
The classification pipeline is: fitting the model with the training set -> predict labels on the test set -> compare predicted labels to real labels(human-coded-labels). These are all done on the human-coded-tweets, for the purpose of finding the best classification model with appropriate hyper-parameter settings. 

These could be done in one big for-loop, but it takes a long time to run, and it did crash my computer several times. What I did was to run classifications on only a few categories at a time - there would be many repeated code.


- Sentiment (Multinomial Naive Bayes); different alpha values would yield slightly different classification results

In [24]:
key = 'Sentiment'
alpha = 0.4

# real labels for the training set
tar_train = data[key][0:7638,] 
# real labels for the test set
tar_test = data[key][7638:,]

# specify the classification model
clf = MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None)
# fit the model with the training set
clf.fit(pred_train, tar_train)
# compute training accuracy
train_score = clf.score(pred_train, tar_train)
# predict labels on the test set
y_pred = clf.predict(pred_test)

# compute standard metrics
test_accuracy = metrics.accuracy_score(tar_test, y_pred)
class_report = metrics.classification_report(tar_test, y_pred)
kappa = metrics.cohen_kappa_score(tar_test, y_pred)

print(key)
print('='*50)
print('Training Accuracy: '+'{:.4f}'.format(train_score))
print('Test Accuracy: '+'{:.4f}'.format(test_accuracy))
print('Kappa: '+'{:.4f}'.format(kappa))
print(class_report)
print('\n')

ValueError: Can't handle mix of continuous and binary

- Political & Makes_a_Factual_or_Verifiable_Claim (Linear SVC)

In [6]:
keys = ['Political', 'Makes_a_Factual_or_Verifiable_Claim']

for key in keys: 
    # real labels for the training set
    tar_train = data[key][0:6800,] 
    # real labels for the test set
    tar_test = data[key][6800:,]

    # specify the classification model
    clf = LinearSVC(class_weight='balanced')
    # fit the model with the training set
    clf.fit(pred_train, tar_train)
    # compute training accuracy
    train_score = clf.score(pred_train, tar_train)
    # predict labels on the test set
    y_pred = clf.predict(pred_test)

    # compute standard metrics
    test_accuracy = metrics.accuracy_score(tar_test, y_pred)
    class_report = metrics.classification_report(tar_test, y_pred)
    kappa = metrics.cohen_kappa_score(tar_test, y_pred)

    print(key)
    print('='*50)
    print('Training Accuracy: '+'{:.4f}'.format(train_score))
    print('Test Accuracy: '+'{:.4f}'.format(test_accuracy))
    print('Kappa: '+'{:.4f}'.format(kappa))
    print(class_report)
    print('\n')

Political
Training Accuracy: 0.9985
Test Accuracy: 0.8662
Kappa: 0.1062
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         1
          0       0.47      0.08      0.14        95
          1       0.88      0.99      0.93       629

avg / total       0.82      0.87      0.82       725



Makes_a_Factual_or_Verifiable_Claim
Training Accuracy: 0.9960
Test Accuracy: 0.7310
Kappa: 0.3698
             precision    recall  f1-score   support

          0       0.76      0.86      0.81       474
          1       0.65      0.49      0.56       251

avg / total       0.72      0.73      0.72       725





  'precision', 'predicted', average, warn_for)


- Ideology & Immigration & Macroeconomic & National_Security & 
Crime & Civil_Rights & Environment & Education & Health_Care (Bagging Classifier)

In [40]:
max_samples=0.7
max_features=0.8
keys = ['Ideology', 'Immigration', 'Macroeconomic', 'National_Security', 'Crime', 'Civil_Rights',
        'Environment', 'Education']

for key in keys: 
    # real labels for the training set
    tar_train = data[key][0:6800,] 
    # real labels for the test set
    tar_test = data[key][6800:,]

    # specify the classification model
    clf = BaggingClassifier(LinearSVC(class_weight='balanced'), 
                            max_samples=max_samples, max_features=max_features)
    # fit the model with the training set
    clf.fit(pred_train, tar_train) 
    # compute training accuracy
    train_score = clf.score(pred_train, tar_train)
    # predict labels on the test set
    y_pred = clf.predict(pred_test)

    # compute standard metrics
    test_accuracy = metrics.accuracy_score(tar_test, y_pred)
    class_report = metrics.classification_report(tar_test, y_pred)
    kappa = metrics.cohen_kappa_score(tar_test, y_pred)

    print(key)
    print('='*50)
    print('Training Accuracy: '+'{:.4f}'.format(train_score))
    print('Test Accuracy: '+'{:.4f}'.format(test_accuracy))
    print('Kappa: '+'{:.4f}'.format(kappa))
    print(class_report)
    print('\n')

Ideology
Training Accuracy: 0.9971
Test Accuracy: 0.6510
Kappa: 0.4446
             precision    recall  f1-score   support

         -1       0.73      0.59      0.65       144
          0       0.52      0.63      0.57       238
          1       0.74      0.69      0.71       343

avg / total       0.66      0.65      0.65       725



Immigration
Training Accuracy: 0.9988
Test Accuracy: 0.9807
Kappa: 0.3578
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       707
          1       1.00      0.22      0.36        18

avg / total       0.98      0.98      0.97       725



Macroeconomic
Training Accuracy: 0.9993
Test Accuracy: 0.9434
Kappa: 0.5436
             precision    recall  f1-score   support

          0       0.94      1.00      0.97       658
          1       0.96      0.40      0.57        67

avg / total       0.94      0.94      0.93       725



National_Security
Training Accuracy: 0.9991
Test Accuracy: 0.9200
Kappa: 0.

  'precision', 'predicted', average, warn_for)


Crime
Training Accuracy: 0.9997
Test Accuracy: 0.9710
Kappa: 0.0000
             precision    recall  f1-score   support

          0       0.97      1.00      0.99       704
          1       0.00      0.00      0.00        21

avg / total       0.94      0.97      0.96       725



Civil_Rights
Training Accuracy: 0.9996
Test Accuracy: 0.9655
Kappa: 0.1315
             precision    recall  f1-score   support

          0       0.97      1.00      0.98       699
          1       0.67      0.08      0.14        26

avg / total       0.96      0.97      0.95       725



Environment
Training Accuracy: 1.0000
Test Accuracy: 0.9917
Kappa: 0.2482
             precision    recall  f1-score   support

          0       0.99      1.00      1.00       718
          1       1.00      0.14      0.25         7

avg / total       0.99      0.99      0.99       725



Education
Training Accuracy: 0.9997
Test Accuracy: 0.9917
Kappa: 0.2469
             precision    recall  f1-score   support

      

- Governance & No_Policy_Content & Asks_for_Donation &  Asks_you_to_watch_something_share_something_follow_something & Misc & Expresses_an_Opinion (Bagging Classifier)

In [38]:
max_samples=0.7
max_features=0.8
keys = ['Governance', 'No_Policy_Content', 'Asks_for_Donation', 
        'Asks_you_to_watch_something_share_something_follow_something',
        'Misc', 'Expresses_an_Opinion']

for key in keys: 
    # real labels for the training set
    tar_train = data[key][0:6800,] 
    # real labels for the test set
    tar_test = data[key][6800:,]

    # specify the classification model
    clf = BaggingClassifier(LinearSVC(class_weight='balanced'),
                            max_samples=max_samples, max_features=max_features)
    # fit the model with the training set
    clf.fit(pred_train, tar_train)
    # compute training accuracy
    train_score = clf.score(pred_train, tar_train)
    # predict labels on the test set
    y_pred = clf.predict(pred_test)

    # compute standard metrics
    test_accuracy = metrics.accuracy_score(tar_test, y_pred)
    class_report = metrics.classification_report(tar_test, y_pred)
    kappa = metrics.cohen_kappa_score(tar_test, y_pred)

    print(key)
    print('='*50)
    print('Training Accuracy: '+'{:.4f}'.format(train_score))
    print('Test Accuracy: '+'{:.4f}'.format(test_accuracy))
    print('Kappa: '+'{:.4f}'.format(kappa))
    print(class_report)
    print('\n')

  'precision', 'predicted', average, warn_for)


Governance
Training Accuracy: 0.9994
Test Accuracy: 0.9697
Kappa: 0.0000
             precision    recall  f1-score   support

          0       0.97      1.00      0.98       703
          1       0.00      0.00      0.00        22

avg / total       0.94      0.97      0.95       725



No_Policy_Content
Training Accuracy: 0.9968
Test Accuracy: 0.7034
Kappa: 0.3987
             precision    recall  f1-score   support

          0       0.90      0.44      0.59       353
          1       0.64      0.95      0.77       372

avg / total       0.77      0.70      0.68       725



Asks_for_Donation
Training Accuracy: 0.9997
Test Accuracy: 0.9848
Kappa: 0.2631
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       712
          1       1.00      0.15      0.27        13

avg / total       0.99      0.98      0.98       725



Asks_you_to_watch_something_share_something_follow_something
Training Accuracy: 0.9987
Test Accuracy: 0.9366
Kappa: 

# Step Four: Final Classifications
Having the appropriate models and hyper-parameter settings, we can use the models to predict labels for the entire corpus of tweets. At this step, we take all 7525 human-coded-tweets as the training set, and the whole corpus as the test set.

In [7]:
# load training data and convert to a vector
train_data = pd.DataFrame.from_csv('ProcessedDataLowNoLinkNoPuncNoNames.csv')
train_text = train_data['text']
vectorizer = TfidfVectorizer(tokenizer=tokenize,ngram_range=(1,3))
pred_train = vectorizer.fit_transform(train_text)

In [8]:
# load the full corpus and convert to a vector
final_data = pd.DataFrame.from_csv('ProcessedFullCorpus.csv')
final_text = final_data['text']
pred_final = vectorizer.transform(final_text.values.astype('U'))

In [9]:
# specify classification models and hyper-parameter settings
alpha = 0.1
max_samples=0.7
max_features=0.8

def runClassifiers(classifier, pred_train, tar_train, pred_final):
    if classifier=='NB':
        clf = MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None)
    elif classifier=='SVC':
        clf = LinearSVC(class_weight='balanced')
    elif classifier=='Bag':
        clf = BaggingClassifier(LinearSVC(class_weight='balanced'),
                                max_samples=0.7, max_features=0.8)
    clf.fit(pred_train,tar_train)
    print('fit successfully')
    y_class_final = clf.predict(pred_final)
    print('predict successfully')
    return y_class_final

# A list hardcoded to specify the classifier to use for each category
classifiers = ['NB','SVC','Bag','Bag','Bag','Bag','Bag','Bag','Bag',
               'Bag','Bag','Bag','Bag','Bag','Bag','Bag','SVC','Bag']

In [10]:
# obtain column names
keys = [key for key in train_data if key != 'text']
pred_matrix = np.zeros((len(final_data),18))

In [11]:
# final classifications: save prediction results in a matrix
for n, key in enumerate(keys):
    tar_train = train_data[key]
    y_final = runClassifiers(classifiers[n], pred_train, tar_train, pred_final)
    print(key+' Classifications Done')
    print('-'*50)
    pred_matrix[:,n] = y_final

fit successfully
predict successfully
Sentiment Classifications Done
--------------------------------------------------
fit successfully
predict successfully
Political Classifications Done
--------------------------------------------------
fit successfully
predict successfully
Ideology Classifications Done
--------------------------------------------------
fit successfully
predict successfully
Immigration Classifications Done
--------------------------------------------------
fit successfully
predict successfully
Macroeconomic Classifications Done
--------------------------------------------------
fit successfully
predict successfully
National_Security Classifications Done
--------------------------------------------------
fit successfully
predict successfully
Crime Classifications Done
--------------------------------------------------
fit successfully
predict successfully
Civil_Rights Classifications Done
--------------------------------------------------
fit successfully
predict suc

In [12]:
# convert prediction matrix into dataframe
final_df = pd.DataFrame(pred_matrix).astype(int)
final_df.columns = keys

In [13]:
final_df.sample(5)

Unnamed: 0,Sentiment,Political,Ideology,Immigration,Macroeconomic,National_Security,Crime,Civil_Rights,Environment,Education,Health_Care,Governance,No_Policy_Content,Asks_for_Donation,Asks_you_to_watch_something_share_something_follow_something,Misc,Makes_a_Factual_or_Verifiable_Claim,Expresses_an_Opinion
8282,0,1,-1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
19782,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
14342,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
8769,-1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10674,1,1,-1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0


In [15]:

final_file = pd.concat([final_data.reset_index(drop=True), final_df], axis=1)

In [17]:
# save prediction result to csv
out_file = 'finalPrediction.csv'
final_file.to_csv(out_file)

In [18]:
list(final_file)

['X',
 'id',
 'created_at',
 'text',
 'hashtag.',
 'at.',
 'link',
 'retweets',
 'favorites',
 'full.URL',
 'Name',
 'Sentiment',
 'Political',
 'Ideology',
 'Immigration',
 'Macroeconomic',
 'National_Security',
 'Crime',
 'Civil_Rights',
 'Environment',
 'Education',
 'Health_Care',
 'Governance',
 'No_Policy_Content',
 'Asks_for_Donation',
 'Asks_you_to_watch_something_share_something_follow_something',
 'Misc',
 'Makes_a_Factual_or_Verifiable_Claim',
 'Expresses_an_Opinion']