In [1]:
import os

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from nltk.tokenize.casual import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Tweet-aware tokenizer whose `tokenize` method is handed to the TF-IDF
# vectorizer further down. `reduce_len=True` shortens runs of a repeated
# character (e.g. "sooooo") so elongated spellings share one token.
tokenizer = TweetTokenizer(
    reduce_len=True,
)

In [2]:
# Location of the ExtractedTweets CSV. Set the TWEETS_CSV environment variable
# to run the notebook on another machine; the original author's path is kept
# as the fallback so existing runs are unaffected.
TWEETS_CSV = os.environ.get(
    'TWEETS_CSV',
    r'C:\Users\Ye-An\Desktop\Datasets\ExtractedTweets.csv',
)

# Raw tweet table; later cells read its Party, Handle and Tweet columns.
all_tweets = pd.read_csv(TWEETS_CSV)

In [3]:
# Split by account rather than by tweet: the unique (Party, Handle) pairs are
# divided 80/20, stratified by party, and each tweet then follows its author.
# Assuming every handle belongs to exactly one party, no account contributes
# tweets to both sides of the split.
# Columns are selected by name (not position) because the code below reads
# `.Handle` and `.Party` by name anyway.
tweeters = all_tweets[['Party', 'Handle']].drop_duplicates()
handles_train, handles_test = train_test_split(
    tweeters.Handle,
    stratify=tweeters.Party,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible
)

# `reset_index(drop=True)` renumbers rows from 0 without keeping the old index.
train = all_tweets[all_tweets.Handle.isin(handles_train)].reset_index(drop=True)
test = all_tweets[all_tweets.Handle.isin(handles_test)].reset_index(drop=True)

In [4]:
# Two-stage text classifier: TF-IDF features built from TweetTokenizer tokens,
# followed by a multinomial Naive Bayes model with default settings.
vectorize_step = ('vectorize', TfidfVectorizer(tokenizer=tokenizer.tokenize))
classifier_step = ('classifier', MultinomialNB())
nb_pipeline = Pipeline(steps=[vectorize_step, classifier_step])

In [5]:
# Fit vectorizer and classifier on the training accounts' tweets; the fitted
# pipeline is the cell's last expression, so its repr is displayed.
nb_pipeline.fit(X=train['Tweet'], y=train['Party'])

Pipeline(memory=None,
     steps=[('vectorize', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...True, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [6]:
# Predict the party for every tweet from the held-out accounts and report the
# share of correct predictions, rounded to four decimals.
preds = nb_pipeline.predict(test['Tweet'])
accuracy = round(accuracy_score(test['Party'], preds), 4)
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.7434


In [7]:
import time

In [18]:
def run_party_classifier(model, read_line=None, pause=0.7):
    """Run an interactive prompt that labels each typed sentence with ``model``.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict(iterable_of_str)``; the first
        element of the returned sequence is printed as the label.
    read_line : callable, optional
        Function that takes the prompt string and returns the typed text.
        Defaults to the built-in ``input``, looked up at call time.
    pause : float, optional
        Seconds to sleep before each prompt.

    Returns
    -------
    None
        All results are reported through ``print``.

    The session ends when the user enters ``q`` (case and surrounding
    whitespace are ignored), when input is exhausted, or when the kernel is
    interrupted. Blank lines are skipped instead of being classified.
    """
    if read_line is None:
        read_line = input

    print("---------------------------------------------")
    print("     P A R T Y       C L A S S I F I E R     ")
    print("---------------------------------------------")
    print("Enter 'q' to quit.")
    print("Topics to talk about: net neutrality, Trump, immigration, guns")
    print(">>")

    while True:
        # NOTE(review): presumably gives the notebook front end time to flush
        # the printed output before the input box appears - confirm before
        # removing.
        time.sleep(pause)
        try:
            text = read_line(":: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session quietly
            # instead of leaving a traceback in the cell output.
            break

        stripped = text.strip()
        if stripped.lower() == 'q':
            break
        if not stripped:
            # Nothing to classify on a blank line; prompt again.
            continue

        print("Classification:")
        # A plain one-element list is enough for predict(); the text is passed
        # unmodified so predictions match what the user typed.
        print(model.predict([text])[0])
        print(">>")


# In a notebook kernel __name__ is '__main__', so the prompt starts when the
# cell runs; the guard only keeps it from starting if this code is imported.
if __name__ == '__main__':
    run_party_classifier(nb_pipeline)

---------------------------------------------
     P A R T Y       C L A S S I F I E R     
---------------------------------------------
Enter 'q' to quit.
Topics to talk about: net neutrality, Trump, immigration, guns
>>
:: Trump is really great, protecting our borders!
Classification:
Republican
>>
:: Trump is a lousy, unfit president for our country. Horrible, bad!
Classification:
Democrat
>>
:: Trump is attacking our liberties, separating families and being racist!
Classification:
Democrat
>>
:: Trump is fighting for our gun rights, preserving our liberties, and keeping our borders & country safe!
Classification:
Republican
>>
:: q
