In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
%matplotlib inline  
import tweepy
import csv

In [65]:
tweets_df = pd.read_csv('tweets_final3.csv', names = ["username", "tweets_raw", "politics"])

In [66]:
# Various methods for cleaning tweet strings to make them better suited for sentiment analysis

def remove_emojis(string):
    
    corrected = re.sub(r'\\x[\w+]{2}', '', string)
    
    return corrected

def correct_apostrophes(string):
    
    corrected = re.sub(r'\\xe2\\x80\\x99', '\'', string)
    
    corrected = re.sub(r'\&amp;', 'and', string)
    
    return corrected

def remove_byte_encoding(string):
    return string[2:]

def clean_tweet(tweet):
    
            
        '''
        Utility function to clean tweet text by removing links, special characters
        using simple regex statements.
        '''
        return re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet)
    
def clean_tweet(tweet):
    
        parts = tweet.split('.')
        
        for i in range(len(parts)):
            parts[i] = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", parts[i])
            
        cleaned_tweet = " ".join(parts)
        
        return cleaned_tweet
    
def clean_string(string):
    
    temp = remove_byte_encoding(string)
    temp = correct_apostrophes(temp)
    temp = remove_emojis(temp)
    final = clean_tweet(temp)
    
    return final

def clean_dataframe_tweets(dataframe):
    
    new_strings = []
    
    for index, row in dataframe.iterrows():
        new_strings.append(clean_string(row['tweets_raw']))
    
    return new_strings

In [67]:
cleaned = clean_dataframe_tweets(tweets_df)

In [68]:
tweets_df['tweets_clean'] = cleaned

In [69]:
from nltk import FreqDist
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
import string

stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [70]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

en_stop = get_stop_words('en')

In [71]:
def preprocess_corpus(corpus):
    
    doc_clean = [clean(doc).split() for doc in corpus]  
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    
    return doc_term_matrix, dictionary

In [72]:
def generate_LDA_model(corpus, num_topics=15):
    
    new_corpus, dictionary = preprocess_corpus(corpus)
    
    ldamodel = gensim.models.ldamodel.LdaModel(new_corpus, num_topics=num_topics, id2word = dictionary, passes=2)
    
    return ldamodel
    

In [79]:
model = generate_LDA_model(tweets_df['tweets_clean'])

In [74]:

def get_topic_distribution(string, ldamodel):
    
    tokens = tokenizer.tokenize(string)
    
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    texts = [p_stemmer.stem(i) for i in stopped_tokens]
    
    bow = ldamodel.id2word.doc2bow(texts)
    
    return ldamodel.get_document_topics(bow)
    

def tweet_to_tdistrib(tweet, ldamodel, second=False):
    t_distrib = get_topic_distribution(tweet, ldamodel)
    
    if second:
        return pd.Series(dist_to_row(t_distrib), index = ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19'])
    
    return pd.Series(dist_to_row(t_distrib))

def dist_to_row(dist, num_topics=15):
    
    row = []
    
    for i in range(num_topics):
        row.append(0)
    
    for topic in dist:
        row[topic[0]] = topic[1]
    
    return row

In [75]:
def tweet_to_row(tweet, lda1):
    
    new_row = tweet_to_tdistrib(tweet, lda1)

    return new_row

In [76]:
def get_topic_features(dataframe, ldamodel):
    
    df = pd.DataFrame()
    
    for index, row in dataframe.iterrows():
        
        tweet = row['tweets_clean']
        
        new_row = tweet_to_row(tweet, ldamodel)
        
        df = df.append(new_row, ignore_index=True)

    return df
        

In [77]:
df_topics = get_topic_features(tweets_df, model)

In [78]:
df_topics.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.08705,0.0,0.14609,0.0,0.047136,0.0,0.57648,0.0,0.055894,0.0,0.061262,0.0,0.0,0.0
1,0.0,0.073384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152166,0.197141,0.215345,0.0,0.0,0.32863
2,0.0,0.0,0.349944,0.0,0.301475,0.0,0.063312,0.10949,0.0,0.0,0.0,0.068411,0.0,0.078796,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290406,0.642928
4,0.197156,0.0,0.068353,0.0,0.062705,0.0,0.0,0.447892,0.0,0.0,0.0,0.0,0.186857,0.0,0.0


In [23]:
from textblob import TextBlob

In [24]:
def get_sentiment(string):
    text = TextBlob(string)
    
    return text.sentiment

def get_dataframe_sentiments(dataframe):
    
    sentiments = []
    
    for index, row in dataframe.iterrows():
        sentiments.append(get_sentiment(row['tweets_clean']))
        
    return sentiments

In [25]:
sentiments = get_dataframe_sentiments(tweets_df)

In [31]:
tweets_df.shape

(66572, 4)

In [37]:
tweets_df['tweets_raw'][3]

"b'Enjoyed assembling snack packages with colleagues for our deployed service members! Small token of our great appreciation for your service &amp; commitmet to our nation \\xf0\\x9f\\x87\\xba\\xf0\\x9f\\x87\\xb8 @the_USO #BeTheForce https://t.co/06zJWntkIv'"

In [38]:
model.print_topics()

[(0,
  '0.031*"co" + 0.010*"infrastructure" + 0.008*"bush" + 0.008*"de" + 0.007*"la" + 0.007*"development" + 0.006*"new" + 0.006*"en" + 0.006*"first" + 0.006*"community"'),
 (1,
  '0.048*"co" + 0.024*"thank" + 0.020*"woman" + 0.018*"service" + 0.015*"honor" + 0.014*"day" + 0.014*"today" + 0.013*"work" + 0.013*"year" + 0.011*"life"'),
 (2,
  '0.046*"co" + 0.024*"school" + 0.023*"student" + 0.021*"high" + 0.017*"texas" + 0.017*"congressional" + 0.011*"year" + 0.010*"capitol" + 0.009*"art" + 0.008*"irs"'),
 (3,
  '0.044*"taxreform" + 0.023*"co" + 0.022*"h" + 0.020*"r" + 0.014*"act" + 0.013*"vote" + 0.013*"bill" + 0.012*"passed" + 0.011*"house" + 0.007*"floor"'),
 (4,
  '0.029*"co" + 0.020*"2" + 0.018*"year" + 0.018*"american" + 0.017*"tax" + 0.017*"1" + 0.013*"taxcutsandjobsact" + 0.012*"4" + 0.011*"million" + 0.010*"000"'),
 (5,
  '0.034*"co" + 0.011*"opioid" + 0.009*"crisis" + 0.009*"people" + 0.009*"make" + 0.008*"american" + 0.008*"school" + 0.008*"country" + 0.008*"work" + 0.008*"nee

In [40]:
tweets_df['tweets_clean'][0]

'Touching and productive conversation w  Florida student survivors of gun violence  Were listening  And agree we need ACTION to stop this epidemic  I stand with our young leaders in support of sensible gun laws   n NeverAgain   co gCiOtA6qeF '

In [43]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from matplotlib import pyplot as plt

In [45]:
mapping = {
           'Democrat': 0,
           'Republican': 1}

In [60]:
X = df_topics1
y = tweets_df['politics'].map(mapping)

In [61]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model.fit(X,y)
model.score(X,y)

0.5296070419996395

In [50]:
tweets_df['sentiment'] = sentiments

In [62]:
def apply_sentiment(dataframe, sentiment):
    df = dataframe.mul(sentiment, axis=0)
    return df

def get_sentiments(dataframe):
    
    s = []
    
    for index, row in dataframe.iterrows():
        s.append(row['sentiment'][0])
    
    return s

In [63]:
s = get_sentiments(tweets_df)

In [59]:
df_topics1 = apply_sentiment(df_topics, s)