In [1]:
import gzip
import math
import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

## domain score

In [2]:
# Load the two tab-separated, header-less input files.
domain_score = pd.read_csv(
    "domains_ideology_score_LADA.txt",
    sep="\t",
    header=None,
)
user_polarity = pd.read_csv(
    "USER_POLARITY_BARBERA.txt",
    sep="\t",
    header=None,
)

In [11]:
# Provenance: the domain list was obtained from Bakshy et al.; a few domains
# that are not owned by news organizations (e.g., wikipedia.org or reddit.com)
# were removed, and shortened versions of news domains were added to the list
# (e.g. fxn.ws for foxnews.com).
#
# Score interpretation:
#   score close to 1: conservative
#   score close to 0: liberal
#
# These notes are kept as comments rather than a trailing string literal so
# that the cell does not echo the text back as its output.
domain_score = domain_score.rename(columns={
    0: "domain",
    1: "score",
})

In [13]:
# Preview the first five rows (head() defaults to n=5).
domain_score.head()

Unnamed: 0,domain,score
0,weaselzippers.us,1.0
1,vote.gop,1.0
2,tedcruz.org,1.0
3,donaldjtrump.com,1.0
4,billoreilly.com,0.99


## user info

In [5]:
# User metadata: tab-separated, no header row, one record per "\n"-terminated line.
obamacare_userinfo = pd.read_csv(
    "obamacare/userinfo.txt",
    sep="\t",
    header=None,
    lineterminator="\n",
)

In [14]:
"""
dataset obtained using Archive Twitter Stream grab
"""
# Give the positional columns readable names. Columns that are not listed
# here (9, 10, 12, 14, 15) keep their integer labels.
obamacare_userinfo = obamacare_userinfo.rename(
    {
        0: "user_ID",
        1: "twitter_ID",
        2: "handle_name",
        3: "location",        # self-reported
        4: "bio",
        5: "followers",       # my speculation
        6: "following",       # my speculation
        7: "num_tweets",
        8: "date",
        11: "language",
        13: "external_link",
        16: "profile_pic",
    },
    axis="columns",
)

In [45]:
# Columns 14 and 15 appear to be empty: there are just multiple tabs between
# the URL fields. After writing this cell I double-checked the source file.
#
# To re-check, inspect the rows with a non-null value in either column:
#   obamacare_userinfo[obamacare_userinfo[14].notna()]
#   obamacare_userinfo[obamacare_userinfo[15].notna()]

# errors="ignore" keeps this cell re-runnable: once the columns have been
# dropped, running it a second time would otherwise raise KeyError.
obamacare_userinfo = obamacare_userinfo.drop(columns=[14, 15], errors="ignore")

In [51]:
# Inspect rows where column 9 is anything other than False.
# The meaning of column 9 is still unknown.
obamacare_userinfo.loc[obamacare_userinfo[9].ne(False)].head()

Unnamed: 0,ID,twitter_ID,handle_name,location,bio,followers,following,num_tweets,date,9,10,language,12,external_link,profile_pic
231,872322799,aehooker501,TRUMP'DUPTRIKElDNECO,Los Angeles,Watching US Sanctions on Russia. the Anex of K...,1608,3093,36171,Wed Oct 10 18:48:15 +0000 2012,True,False,en,True,,http://pbs.twimg.com/profile_images/8944142628...
332,127311364,alexa_io,Alejandra,,=),660,106,47405,Sun Mar 28 19:54:33 +0000 2010,True,False,es,False,,http://pbs.twimg.com/profile_images/8142427151...
349,272497924,aleykhat,WhiskeyTangoFoxtrot,The Lone Star State,Proud Mom and Wife. Conservative. Navy Vet. RN...,4863,4841,43491,Sat Mar 26 16:49:05 +0000 2011,True,False,en,False,,http://pbs.twimg.com/profile_images/8355361154...
384,898503012,allybrooke27,standwithvegas,"Connecticut, USA",Trump is #mypresident. I am so proud to be an ...,3158,169,115056,Mon Oct 22 21:55:30 +0000 2012,True,False,en,True,,http://pbs.twimg.com/profile_images/9162949810...
402,455996058,AlysiaStern,Alysia,"New York, NY",just me.,39397,24611,105752,Thu Jan 05 19:15:51 +0000 2012,True,False,en,False,,http://pbs.twimg.com/profile_images/9133868776...


In [46]:
# Preview the first five user records (head() defaults to n=5).
obamacare_userinfo.head()

Unnamed: 0,ID,twitter_ID,handle_name,location,bio,followers,following,num_tweets,date,9,10,language,12,external_link,profile_pic
0,334537201,00_jackie,Jackie ️,♥•♥ Oklahoma ♥•♥,"Saved sinner, Conservative, pro Israel, pro Mi...",6115,1041,39411,Wed Jul 13 07:48:53 +0000 2011,False,False,en,False,https://t.co/dMTSML3qwc,http://pbs.twimg.com/profile_images/6095617438...
1,115094250,0103lrl,Larry Lawson (D),,"Progressive Democrat! We're wounded, but not d...",5534,5358,177943,Wed Feb 17 15:57:36 +0000 2010,False,False,en,True,,http://pbs.twimg.com/profile_images/8450958772...
2,2369916919,055Douglas,🔴 Douglas 🔴,,"I enjoy music, warm weather, gardening. Workin...",5038,4762,46476,Mon Mar 03 05:36:02 +0000 2014,False,False,en,True,,http://pbs.twimg.com/profile_images/8934867018...
3,1146470090,100Concerned,endure,,"photography,music, travel, nature,do not buy f...",2015,2358,179716,Sun Feb 03 23:04:27 +0000 2013,False,False,en,True,,http://pbs.twimg.com/profile_images/3788000007...
4,95708744,100Natural,100%Natural,México,Grupo de Franquicias de Restaurantes que promu...,2198,362,14554,Wed Dec 09 19:01:12 +0000 2009,False,False,es,True,http://t.co/smkBBf2sLc,http://pbs.twimg.com/profile_images/9143582449...


## production score

In [52]:
# Per-user production scores: tab-separated, no header row.
user_production_score = pd.read_csv(
    "obamacare/users_production_scores.txt",
    sep="\t",
    header=None,
    lineterminator="\n",
)

In [57]:
# Only the first column is named here; the remaining columns keep their
# integer labels.
user_production_score = user_production_score.rename({0: "user_ID"}, axis="columns")

In [58]:
# Display the production-score table; columns 1-3 have not been named yet.
user_production_score

Unnamed: 0,user_ID,1,2,3
0,334537201,0.734128,0.223540,391.0
1,115094250,0.226682,0.152557,294.0
2,2369916919,0.288123,0.183243,160.0
3,1146470090,0.305678,0.169161,517.0
4,95708744,0.347375,0.000000,1.0
...,...,...,...,...
8614,1231303752,0.632540,0.253130,87.0
8615,14622223,0.399442,0.082604,928.0
8616,57134276,0.301048,0.095428,594.0
8617,327715590,0.745652,0.220540,104.0


## tweets

In [35]:
def prepare_df(df):
    """Return a new frame with the first eight tweet columns given readable names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are labelled by integer position, as produced by
        ``pd.read_csv(..., header=None)``.

    Returns
    -------
    pandas.DataFrame
        A renamed copy of ``df``. Columns 0-7 receive the names below; any
        other columns keep their existing labels. ``df`` itself is not modified.
    """
    # NOTE(review): in the sample rows displayed later in this notebook,
    # column 4 holds an HTML anchor for the posting client and column 6 holds
    # "N", so "tweet_link" and "comments" may be misnamed -- confirm against
    # the dataset documentation before relying on them.
    column_names = {
        0: "user_ID",
        1: "tweet_ID",
        2: "tweet_content",
        3: "date",
        4: "tweet_link",
        5: "likes",        # my speculation
        6: "comments",     # my speculation
        7: "retweets",     # my speculation
    }
    # The renamed frame must be returned: callers do `df = prepare_df(df)`,
    # and without the return they would receive None.
    return df.rename(columns=column_names)
"""
some problem I was thinking about: 
1.  all of the data need to be in memory at the same time,
    thus even if we create a database by chunk, it is still hard to do the ML part
    
2.  then i was thinking about a way to process them by chunk:
    can we implement the algorithm by hand, then process them by chunk?
    
3.  the problem with implementing algorithm by hand: 
        I did it for my MATH156 class, and it was extremely slow compared to scikit-learn
        
despite the problems, I still implemented the code to create a database by chunks
in case we'll need it
"""

# Stream the tweet file into the "tweets" table of obama.db in chunks of
# 100,000 rows, so the whole file never has to be held in memory at once.
#
# NOTE(review): this reads the uncompressed USER_TWEETS.txt, while the next
# cell reads USER_TWEETS.txt.gz -- confirm the uncompressed file exists.
df_iter = pd.read_csv(
    "obamacare/USER_TWEETS.txt",
    delimiter="\t",
    header=None,
    lineterminator="\n",
    chunksize=100000,
)

# closing() releases the connection even if a chunk fails; a plain
# `with sqlite3.connect(...)` only manages the transaction and would leave
# the connection open. Reconnect with sqlite3.connect("obama.db") to query.
with closing(sqlite3.connect("obama.db")) as conn:
    for chunk_number, chunk in enumerate(df_iter):
        # Replace the table on the first chunk so that re-running this cell
        # does not append a second copy of every tweet; append afterwards.
        write_mode = "replace" if chunk_number == 0 else "append"
        prepare_df(chunk).to_sql("tweets", conn, if_exists=write_mode, index=False)

In [36]:
# Read only the first 1,000,000 rows of the gzip-compressed tweet file.
obamacare_tweets = pd.read_csv(
    "obamacare/USER_TWEETS.txt.gz",
    compression="gzip",
    delimiter="\t",
    header=None,
    lineterminator="\n",
    # error_bad_lines was deprecated in pandas 1.3 and removed in 2.0.
    # on_bad_lines="warn" keeps the previous behaviour: malformed rows are
    # dropped and a warning is emitted for each one.
    on_bad_lines="warn",
    nrows=1000000,
)


obamacare_tweets

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,334537201,917247622502584320,@AZWS @GovMaryFallin Hi 👋🏼 Yes it has. Thank y...,Mon Oct 09 04:37:29 +0000 2017,"<a href=""http://twitter.com/download/iphone"" r...",1,N,0,,,AZWS GovMaryFallin,N,,
1,334537201,917246099190140929,RT @redsteeze: Where is Donald Trump from agai...,Mon Oct 09 04:31:26 +0000 2017,"<a href=""http://twitter.com/download/iphone"" r...",898,N,0,,https://t.co/9uTcmCyRD5--a0a--https://twitter....,redsteeze,N,,
2,334537201,917245534288728064,#douchebag @Kaepernick7 👈🏼 https://t.co/ttu8oY...,Mon Oct 09 04:29:11 +0000 2017,"<a href=""http://twitter.com/download/iphone"" r...",0,N,0,douchebag,https://t.co/ttu8oYFldA--a0a--https://twitter....,Kaepernick7,N,,
3,334537201,917244832166371328,RT @chuckwoolery: Health Insurers in FL Reques...,Mon Oct 09 04:26:24 +0000 2017,"<a href=""http://twitter.com/download/iphone"" r...",355,N,0,,https://t.co/ndlI4KkHF6--a0a--https://bluntfor...,chuckwoolery,N,,
4,334537201,917244447615799296,RT @MarkYoungTruth: Working on my resume’ for ...,Mon Oct 09 04:24:52 +0000 2017,"<a href=""http://twitter.com/download/iphone"" r...",91,N,0,bluntforcetruth,,MarkYoungTruth,N,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,1357799989,871983133243179008,Desde el fondo de mi ❤️...,Tue Jun 06 06:52:34 +0000 2017,"<a href=""http://twitter.com/#!/download/ipad"" ...",0,N,0,,,,N,,
999996,1357799989,871982661149044736,RT @UHumanitas: Siempre proponte ir cuesta arr...,Tue Jun 06 06:50:41 +0000 2017,"<a href=""http://twitter.com/#!/download/ipad"" ...",983,N,0,,,UHumanitas,N,,
999997,1357799989,871982263096967168,@Iovintit Perdón estaba hablando de negocios h...,Tue Jun 06 06:49:06 +0000 2017,"<a href=""http://twitter.com/#!/download/ipad"" ...",0,N,0,,,,N,,
999998,1357799989,871981424244641793,Gloria trevi Versus Tour //Me hubiera gustado ...,Tue Jun 06 06:45:46 +0000 2017,"<a href=""http://twitter.com/#!/download/ipad"" ...",0,N,0,,https://t.co/d776AaX7Xx--a0a--https://youtu.be...,,N,,


In [44]:
# Full text of the tweet at row label 1000 (column 2 holds the tweet content).
obamacare_tweets.loc[1000, 2]

'RT @badov49: REPORT: Anti-Trump Senator Mark Warner Made $6 Million in 2012 From Russian Tech Business https://t.co/YZ7WZvYXHk'

## networks

In [2]:
# Load the gzip-compressed follower network: tab-separated, no header row.
networks = pd.read_csv(
    "obamacare/FULL_FOLLOWER_NETWORK.txt.gz",
    compression="gzip",
    delimiter="\t",
    header=None,
    lineterminator="\n",
    # error_bad_lines was deprecated in pandas 1.3 and removed in 2.0.
    # on_bad_lines="warn" keeps the previous behaviour: malformed rows are
    # dropped and a warning is emitted for each one.
    on_bad_lines="warn",
)

In [16]:
# Look at the first 6000 edges; the notebook display truncates the middle rows.
networks.head(n=6000)

Unnamed: 0,0,1
0,334537201,2652830652
1,334537201,31623445
2,334537201,907991059694792704
3,334537201,709158721746804737
4,334537201,1623194148
...,...,...
5995,334537201,84062050
5996,334537201,186618551
5997,334537201,27427649
5998,334537201,325739781
