In [1]:
import os
import re
import pandas as pd


def clean_text(text):
    """Normalise a raw tweet into lowercase, space-separated word tokens.

    Steps, in order:
      1. Remove apostrophes so contractions stay one token ("don't" -> "dont").
      2. Remove URLs (anything starting with ``http`` up to the next whitespace).
      3. Remove ``pic.twitter...`` media links.
      4. Lowercase and collapse every run of non-word characters to one space.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned text. It is not stripped, so it may start or end with a
        single space, and a tweet with no word characters yields '' or ' '.
    """
    text = text.replace("'", '')
    text = re.sub(r'http\S+', '', text)
    # The dot must be escaped: an unescaped '.' matches any character, which
    # would also delete ordinary text such as "epic twitterstorm".
    text = re.sub(r'pic\.twitter\S+', '', text)
    text = re.sub(r'\W+', ' ', text.lower())

    return text


# Load the raw tweets, remove exact duplicate rows and add a cleaned-text
# column produced by clean_text.
df = pd.read_csv(os.path.join('tweets', 'tweets.csv'),
                 low_memory=False)
df.drop_duplicates(inplace=True)
df['tweet-clean'] = df['tweet'].apply(clean_text)

# Drop tweets that are empty after cleaning. DataFrame.drop works on index
# *labels*, and drop_duplicates leaves gaps in those labels, so positional
# row numbers must not be used here: collect the actual labels of the blank
# rows via a boolean mask instead.
is_blank = df['tweet-clean'].isin(['', ' '])
drop_index = df.index[is_blank].tolist()
df.drop(drop_index, inplace=True)

In [2]:
# Settings shared by the cells below: a fixed seed passed as random_state,
# and the n_jobs value handed to the scikit-learn estimators.
random_state, n_jobs = 0, -1

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams of the cleaned tweets; min_df=2
# is passed so that terms seen in fewer than two tweets are ignored.
# NOTE(review): the vectorizer is fitted on all tweets before the train/test
# split further down, so the vocabulary and IDF weights also see the
# held-out tweets - confirm this is acceptable for the reported accuracies.
tfidf = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
X = tfidf.fit_transform(df['tweet-clean'])

n_documents, n_terms = X.shape
print(f'Number of documents: {n_documents}')
print(f'Size of vocabulary:  {n_terms}')

Number of documents: 34648
Size of vocabulary:  86092


In [4]:
from sklearn.preprocessing import LabelEncoder

# Turn the author names in df['name'] into integer class labels.
le = LabelEncoder()
y = le.fit_transform(df['name'])

# List each class name next to the integer code it was assigned.
for code, label in enumerate(le.classes_):
    print(f'{label:<15} = {code}')

Bernie Sanders  = 0
Donald J. Trump = 1


In [5]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for evaluation, stratified on y and seeded
# with the shared random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=random_state)

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline (SAGA solver, C=20), trained on the training
# half; fit() returns the estimator itself, so it can be chained.
clf_log = LogisticRegression(
    C=20,
    solver='saga',
    random_state=random_state,
    n_jobs=n_jobs,
).fit(X_train, y_train)

# Accuracy on the held-out half.
log_score = clf_log.score(X_test, y_test)
print(f'Logistic Regression accuracy: {log_score:.1%}')

Logistic Regression accuracy: 95.8%


In [7]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes on the same features. binarize=0.09 is the threshold
# used to turn the TF-IDF weights into present/absent indicators, and alpha
# is the additive smoothing parameter.
clf_bnb = BernoulliNB(binarize=0.09, alpha=0.01)
# Kept as the last expression so the cell displays the fitted estimator.
clf_bnb.fit(X_train, y_train)

BernoulliNB(alpha=0.01, binarize=0.09, class_prior=None, fit_prior=True)

In [8]:
import numpy as np

# Look up the probability the fitted BernoulliNB model assigns to feature i
# (the bigram 'medicare for') within class k (C_k).
C_k = 'Bernie Sanders'
(k,) = le.transform([C_k])               # integer code of the class
i = tfidf.vocabulary_['medicare for']    # column index of the bigram
# feature_log_prob_ stores log-probabilities, so exponentiate to get p_ki.
p_ki = np.exp(clf_bnb.feature_log_prob_[k, i])

print(f'k = {k}',
      f'i = {i}',
      f'C_k = {C_k}',
      f'p_ki = {p_ki:.3}',
      sep='\n')

k = 0
i = 44798
C_k = Bernie Sanders
p_ki = 0.0289


In [9]:
# Recompute p_ki by hand as a plain ratio of the fitted model's counts:
# the count stored for feature i in class k, over the sample count of class k.
# No smoothing is applied here; the small gap to the model's own p_ki is
# presumably due to the additive smoothing (alpha) used by BernoulliNB.
df_ki = clf_bnb.feature_count_[k, i]
n_k = clf_bnb.class_count_[k]
p_ki_manual = df_ki / n_k

# Model value first, manual estimate second.
print(f'{p_ki:.5}', f'{p_ki_manual:.5}', sep='\n')

0.028924
0.028922


In [10]:
# Accuracy of the Bernoulli naive Bayes model on the held-out half.
bnb_score = clf_bnb.score(X_test, y_test)
print(f'Bernoulli Naive Bayes accuracy: {bnb_score:.1%}')

Bernoulli Naive Bayes accuracy: 96.2%


In [11]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the two classifiers above, with weights
# 0.6 (logistic regression) and 0.4 (Bernoulli NB); fit() returns the
# ensemble itself, so it can be chained.
clf_vot = VotingClassifier(
    estimators=[
        ('log', clf_log),
        ('bnb', clf_bnb),
    ],
    voting='soft',
    weights=(0.6, 0.4),
    n_jobs=n_jobs,
).fit(X_train, y_train)

# Accuracy of the ensemble on the held-out half.
vot_score = clf_vot.score(X_test, y_test)
print(f'Ensemble Averaging accuracy: {vot_score:.1%}')

Ensemble Averaging accuracy: 96.4%


In [12]:
# Refit the ensemble on the full data set (train and test halves together).
# After this call clf_vot has seen X_test, so it must not be re-scored on it.
clf_vot.fit(X, y)

VotingClassifier(estimators=[('log',
                              LogisticRegression(C=20, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto', n_jobs=-1,
                                                 penalty='l2', random_state=0,
                                                 solver='saga', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('bnb',
                              BernoulliNB(alpha=0.01, binarize=0.09,
                                          class_prior=None, fit_prior=True))],
                 flatten_transform=True, n_jobs=-1, voting='soft',
                 weights=(0.6, 0.4))