### Example found at: http://zacstewart.com/2015/04/28/document-classification-with-scikit-learn.html

## Label encoding: 1 = for (positive), 0 = against (negative), 0.5 or missing = neutral

In [1]:
import pandas as pd
import numpy as np

# Load the labelled tweets used for the Naive Bayes cross-validation below.
df = pd.read_csv('abortion_data_clean.csv', usecols=['label', 'tweet_text'])

# Map the raw label codes to class names on the label column ONLY.
# The previous frame-wide fillna/replace also rewrote `tweet_text`: a missing
# tweet became the literal document "neutral", and the final dropna could
# never remove anything.
df['label'] = (
    df['label']
    .replace({'1': 'for', '0': 'against'})
    .replace(0.5, 'neutral')
    .fillna('neutral')   # an unlabelled tweet is treated as neutral
)

# CountVectorizer accepts utf-8 bytes; values that are not strings (including
# missing text) come back from .str.encode as NaN and are dropped right after.
df['tweet_text'] = df['tweet_text'].str.encode('utf-8')
df = df.dropna(subset=['tweet_text'])
df.head()

Unnamed: 0,label,tweet_text
0,against,b'million unborn children have lost their live...
1,against,b'#prolife #parenthood #babies #motherhood #pr...
2,against,b'definitely not safe for baby'
3,against,b'i think she wants to see herself as #prolife...
4,against,b'I hate abortion'


## Multinomial Naive Bayes

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Multinomial Naive Bayes over raw term counts (binary=False keeps the counts
# instead of collapsing them to presence/absence).
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(binary=False)),
    ('classifier', MultinomialNB()),
])


In [3]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; KFold now lives
# in `sklearn.model_selection`, takes `n_splits`, and yields index pairs from
# `.split(data)` instead of being iterable itself.
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

# 6 contiguous, unshuffled folds -- the same partition the legacy
# KFold(n=len(df), n_folds=6) produced.
k_fold = KFold(n_splits=6)
scores = []
for train_indices, test_indices in k_fold.split(df):
    train_text = df.iloc[train_indices]['tweet_text'].values
    train_y = df.iloc[train_indices]['label'].values

    test_text = df.iloc[test_indices]['tweet_text'].values
    test_y = df.iloc[test_indices]['label'].values

    # Refit from scratch on every fold; after the loop `pipeline` holds the
    # model trained on the last fold's training split.
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    # Micro-averaged F1 over single-label multiclass predictions.
    score = f1_score(test_y, predictions, average='micro')
    scores.append(score)

print('Total tweets classified:', len(df))
print('Score:', sum(scores)/len(scores))



Total tweets classified: 3180
Score: 0.696855345912


# Spot-check the multinomial NB pipeline on three hand-written tweets.
examples = [
    'free on demand no apologies #marchforlife',
    'abortion is murder',
    'trump speaks on abortion at 4pm',
]
pipeline.predict(examples)

## Bernoulli Naive Bayes

In [4]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes: the vectorizer still emits counts, and
# binarize=0.0 thresholds them so any count above zero becomes 1.
bern_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(binary=False)),
    ('classifier', BernoulliNB(binarize=0.0)),
])

In [5]:
# `sklearn.cross_validation.KFold(n=..., n_folds=...)` was removed in
# scikit-learn 0.20. Import the replacement here so this cell does not depend
# on which KFold is currently bound in the kernel.
from sklearn.model_selection import KFold

# 6 contiguous, unshuffled folds -- the same partition the legacy
# KFold(n=len(df), n_folds=6) produced.
bern_k_fold = KFold(n_splits=6)
bern_scores = []
for train_indices, test_indices in bern_k_fold.split(df):
    train_text = df.iloc[train_indices]['tweet_text'].values
    train_y = df.iloc[train_indices]['label'].values

    test_text = df.iloc[test_indices]['tweet_text'].values
    test_y = df.iloc[test_indices]['label'].values

    # Refit from scratch on every fold; after the loop `bern_pipeline` holds
    # the model trained on the last fold's training split.
    bern_pipeline.fit(train_text, train_y)
    predictions = bern_pipeline.predict(test_text)

    # Micro-averaged F1 over single-label multiclass predictions.
    score = f1_score(test_y, predictions, average='micro')
    bern_scores.append(score)

print('Total tweets classified:', len(df))
print('Score:', sum(bern_scores)/len(bern_scores))

Total tweets classified: 3180
Score: 0.666352201258


In [6]:
# Spot-check the Bernoulli NB pipeline on the same three hand-written tweets.
examples = [
    'free on demand no apologies #marchforlife',
    'abortion is murder',
    'trump speaks on abortion at 4pm',
]
bern_pipeline.predict(examples)

array(['neutral', 'against', 'neutral'],
      dtype='<U7')

## Support Vector Machine

In [7]:
# Load the tweets the SVM / KNN models are trained on.
train_df = pd.read_csv('training_data.csv', usecols=['label', 'tweet_text'])

# Map the numeric label codes to class names on the label column ONLY.
# The previous frame-wide fillna/replace also rewrote `tweet_text`: a missing
# tweet became the literal document "neutral", and the final dropna could
# never remove anything.
train_df['label'] = (
    train_df['label']
    .replace({1: 'for', 0: 'against'})
    .replace(0.5, 'neutral')
    .fillna('neutral')   # an unlabelled tweet is treated as neutral
)

# CountVectorizer accepts utf-8 bytes; values that are not strings (including
# missing text) come back from .str.encode as NaN and are dropped right after.
train_df['tweet_text'] = train_df['tweet_text'].str.encode('utf-8')
train_df = train_df.dropna(subset=['tweet_text'])
train_df.head()

Unnamed: 0,label,tweet_text
0,for,b'along party lines fetal remains burial crema...
1,for,b'but firstanother bill in #ohio senate this w...
2,for,b'stand by for onslaught of triggered liberals...
3,for,b'are good for society there i said it #notsor...
4,for,b'to those fucks who go around spewing their v...


In [8]:
# Load the held-out tweets the SVM / KNN models are evaluated on.
test_df = pd.read_csv('abortion_data_clean.csv', usecols=['label', 'tweet_text'])

# Map the raw label codes to class names on the label column ONLY.
# The previous frame-wide fillna/replace also rewrote `tweet_text`: a missing
# tweet became the literal document "neutral", and the final dropna could
# never remove anything.
test_df['label'] = (
    test_df['label']
    .replace({'1': 'for', '0': 'against'})
    .replace(0.5, 'neutral')
    .fillna('neutral')   # an unlabelled tweet is treated as neutral
)

# CountVectorizer accepts utf-8 bytes; values that are not strings (including
# missing text) come back from .str.encode as NaN and are dropped right after.
test_df['tweet_text'] = test_df['tweet_text'].str.encode('utf-8')
test_df = test_df.dropna(subset=['tweet_text'])
test_df.head()

Unnamed: 0,label,tweet_text
0,against,b'million unborn children have lost their live...
1,against,b'#prolife #parenthood #babies #motherhood #pr...
2,against,b'definitely not safe for baby'
3,against,b'i think she wants to see herself as #prolife...
4,against,b'I hate abortion'


### rbf kernel

In [9]:
from sklearn import svm

# RBF-kernel SVM over raw term counts. Every SVC option is spelled out
# explicitly so the configuration does not depend on library defaults.
svm_rbf_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(binary=False)),
    ('classifier', svm.SVC(
        kernel='rbf',
        C=1.0,
        gamma='auto',
        degree=3,
        coef0=0.0,
        decision_function_shape='ovo',
        probability=True,
        shrinking=True,
        tol=0.001,
        max_iter=-1,
        cache_size=200,
        class_weight=None,
        random_state=None,
        verbose=False,
    )),
])

# Training inputs/targets come from train_df, evaluation ones from test_df.
x = train_df['tweet_text'].values
y = train_df['label'].values


test_x = test_df['tweet_text'].values
test_y = test_df['label'].values

In [10]:
# Fit on the training tweets, then label the held-out tweets.
predicted = svm_rbf_pipeline.fit(x, y).predict(test_x)
print(predicted)

# Accuracy: fraction of held-out tweets whose predicted label equals the true one.
np.mean(predicted == test_y)

['against' 'against' 'against' ..., 'against' 'against' 'against']


0.25188679245283019

### linear kernel

In [14]:
# Linear-kernel SVM over binary presence features for unigrams and bigrams.
# Every SVC option is spelled out explicitly so the configuration does not
# depend on library defaults.
svm_linear_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', svm.SVC(
        kernel='linear',
        C=1.0,
        gamma='auto',
        degree=3,
        coef0=0.0,
        decision_function_shape='ovo',
        probability=True,
        shrinking=True,
        tol=0.001,
        max_iter=-1,
        cache_size=200,
        class_weight=None,
        random_state=None,
        verbose=False,
    )),
])

In [15]:
# Fit on the training tweets, then label the held-out tweets.
linear_predicted = svm_linear_pipeline.fit(x, y).predict(test_x)
print(linear_predicted)

# Accuracy: fraction of held-out tweets whose predicted label equals the true one.
np.mean(linear_predicted == test_y)

['against' 'against' 'against' ..., 'for' 'for' 'against']


0.86603773584905663

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour classifier over binary presence features for unigrams
# and bigrams.
kn_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1,2))),
    ('classifier', KNeighborsClassifier(n_neighbors=1)),
])

# Fit on the training tweets, then label the held-out tweets.
kn_pipeline_predicted = kn_pipeline.fit(x, y).predict(test_x)
print(kn_pipeline_predicted)

# Accuracy: fraction of held-out tweets whose predicted label equals the true one.
np.mean(kn_pipeline_predicted == test_y)
