In [25]:
import os
import io
import csv
import numpy as np
import pandas as pd

In [24]:
# Per-file share of rows that contain a '#' character.
#
# `fns` is reused by later cells, so it is sorted here: os.listdir() returns
# entries in arbitrary order, which would make both the file numbering below
# and any later concatenation order non-deterministic.
fns = sorted(f for f in os.listdir() if f.startswith('IRAhandle'))
count = {}  # 1-based file number (as str) -> rows whose first field contains '#'
total = {}  # 1-based file number (as str) -> non-empty rows read from the file
for ix, fn in enumerate(fns):
    key = str(ix+1)
    count[key] = 0
    total[key] = 0
    # newline='' is what the csv module asks for; the encoding is pinned so the
    # result does not depend on the platform default.
    with open(fn, "r", encoding="utf-8", newline="") as f:
        print('starting on {}'.format(fn))
        # NOTE(review): the reader splits on tabs, so for comma-separated input
        # line[0] is effectively the whole physical line, and the first line of
        # the file is counted like any other row -- confirm this is intended.
        reader = csv.reader(f, delimiter="\t")
        for line in reader:
            if not line:
                # csv yields [] for blank lines; line[0] would raise IndexError.
                continue
            total[key] += 1
            if '#' in line[0]:
                count[key] += 1
    # An empty file would otherwise divide by zero.
    share = 100*float(count[key]/total[key]) if total[key] else 0.0
    print('{0:2.2f}% of tweets contain hashtags\n'.format(share))
    

starting on IRAhandle_tweets_1.csv
39.62% of tweets contain hashtags

starting on IRAhandle_tweets_2.csv
38.29% of tweets contain hashtags

starting on IRAhandle_tweets_3.csv
28.72% of tweets contain hashtags

starting on IRAhandle_tweets_4.csv
37.34% of tweets contain hashtags

starting on IRAhandle_tweets_5.csv
37.06% of tweets contain hashtags

starting on IRAhandle_tweets_6.csv
29.62% of tweets contain hashtags

starting on IRAhandle_tweets_7.csv
53.60% of tweets contain hashtags

starting on IRAhandle_tweets_8.csv
42.92% of tweets contain hashtags

starting on IRAhandle_tweets_9.csv
22.65% of tweets contain hashtags



In [26]:
# Read every tweet file listed in `fns` and stack them into one frame.
frames = [pd.read_csv(fn, index_col=False) for fn in fns]
# ignore_index=True gives the combined frame one unique 0..n-1 index; without
# it each file's own 0-based labels repeat, so label-based lookups such as
# df.loc[0] would return one row per file.
df = pd.concat(frames, ignore_index=True)

In [27]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(2973371, 15)

In [29]:
# Peek at the first row to see which columns are available.
df.head(1)

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.06e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll


In [35]:
# Tweet texts as a plain Python list, in frame order.
content = df['content'].tolist()

In [36]:
import re

In [63]:
# Sub-patterns shared by the tokenizer and by preprocess(); defined once so the
# two cannot drift apart.
_HTML_PATTERN = r'<[^>]+>'
_URL_PATTERN = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'

# Alternatives are tried left to right, so the specific patterns must come
# before the catch-all ones at the end.
regex_str = [
    _HTML_PATTERN, # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    _URL_PATTERN, # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)+' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)

# Compiled once at cell execution instead of on every preprocess() call.
# Both are case-sensitive; preprocess() applies them to lowercased text.
_html_re = re.compile(_HTML_PATTERN)
_url_re = re.compile(_URL_PATTERN)

def tokenize(s):
    """Split ``s`` into tokens using ``tokens_re``.

    Returns the list of matched substrings in order of appearance; whitespace
    between tokens is dropped because no alternative matches it.
    """
    return tokens_re.findall(s)

def preprocess(s, lowercase=True):
    """Normalize one tweet into a single space-separated token string.

    Steps: tokenize, optionally lowercase, drop HTML-tag tokens and replace
    URL tokens with the placeholder ``'!url'``. @-mentions and hashtags are
    kept as they are.

    Parameters
    ----------
    s : str
        Raw tweet text.
    lowercase : bool, default True
        Lowercase every token. When False the original casing is kept in the
        output; HTML/URL/'rt' detection still works on a lowercased copy so
        the same tokens are recognized either way.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``''`` when an ``rt``
        token is directly followed by the literal token ``@user``.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token.lower() for token in tokens]

    cleaned = []
    for token in tokens:
        # Classify on a lowercased view; identical to `token` when lowercase=True.
        probe = token if lowercase else token.lower()
        if _html_re.match(probe):
            continue
        cleaned.append('!url' if _url_re.match(probe) else token)

    # Drop the whole tweet when 'rt' is immediately followed by '@user'.
    # Mentions are not rewritten to '@user' in this function, so this only
    # fires for input that literally contains that token.
    after_rt = False
    for token in cleaned:
        probe = token if lowercase else token.lower()
        if probe == 'rt':
            after_rt = True
            continue
        if after_rt and probe == '@user':
            return ''
        after_rt = False

    return ' '.join([t for t in cleaned if t]).replace('rt @user : ','')

In [64]:
# Smoke-test the preprocessing on the first tweet.
s = content[0]
print(preprocess(s))

"we have a sitting democrat us senator on trial for corruption and you've barely heard a peep from the mainstream media ." ~ @nedryun !url


In [71]:
# Write one preprocessed line per entry of `content`, in the same order, so
# line i of the file corresponds to item i of `content`.
outfile = 'tweet_tokens.txt'
# The encoding is pinned: with a non-UTF-8 platform default, writing non-Latin
# tweets raises UnicodeEncodeError, which the former bare `except:` silently
# turned into 'unk' lines.
with io.open(outfile, 'w', encoding='utf-8') as tweet_processed_text:
    for line in content:
        if isinstance(line, str):
            text = preprocess(line.rstrip())
        else:
            # Entries without text (anything that is not a str, e.g. NaN for a
            # missing value) get the 'unk' placeholder, as before.
            text = preprocess('unk')
        tweet_processed_text.write(text+'\n')

In [72]:
# Look at the first two rows again before inspecting individual columns.
df.head(2)

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.06e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll
1,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,0,RightTroll


In [73]:
# Distinct values of the post_type column.
df['post_type'].unique()

array([nan, 'RETWEET', 'QUOTE_TWEET'], dtype=object)

In [74]:
# Distinct values of the account_type column.
df['account_type'].unique()

array(['Right', 'Russian', 'Koch', 'Italian', 'left', '?', 'German',
       'Spanish', 'Hashtager', 'Arabic', 'local', 'Commercial', 'French',
       'Ukranian', 'ZAPOROSHIA', 'news', 'right', 'Uzbek', 'Ebola ', nan,
       'Portuguese'], dtype=object)

In [75]:
# Distinct values of the retweet column.
df['retweet'].unique()

array([0, 1])

In [76]:
# Distinct values of the account_category column.
df['account_category'].unique()

array(['RightTroll', 'NonEnglish', 'Fearmonger', 'LeftTroll', 'Unknown',
       'HashtagGamer', 'NewsFeed', 'Commercial'], dtype=object)

In [78]:
# Distinct values of the language column.
df['language'].unique()

array(['English', 'Russian', 'Serbian', 'Ukrainian', 'Tagalog (Filipino)',
       'Albanian', 'Italian', 'Romanian', 'Spanish', 'Catalan', 'German',
       'Estonian', 'French', 'Norwegian', 'Vietnamese', 'Dutch', 'Arabic',
       'Uzbek', 'Bulgarian', 'Macedonian', 'Farsi (Persian)', 'Turkish',
       'LANGUAGE UNDEFINED', 'Czech', 'Somali', 'Lithuanian', 'Croatian',
       'Slovak', 'Icelandic', 'Slovenian', 'Japanese', 'Indonesian',
       'Pushto', 'Hungarian', 'Finnish', 'Latvian', 'Portuguese',
       'Danish', 'Swedish', 'Malay', 'Polish', 'Korean', 'Hebrew', 'Urdu',
       'Kurdish', 'Hindi', 'Greek', 'Simplified Chinese', 'Thai',
       'Bengali', 'Traditional Chinese', 'Gujarati', 'Kannada', 'Tamil',
       'Telugu', 'Malayalam'], dtype=object)

In [94]:
# Build the label matrix: one-hot blocks for the categorical columns followed
# by the raw post_type and retweet columns. `label_names[j]` names column j.
label_names = []
label_blocks = []  # 2-D arrays, concatenated once at the end

cols = ['region','language','account_type','account_category']
for col in cols:
    # Prefix with the column name itself: a fixed literal prefix gives the same
    # dummy name to values shared by several columns (e.g. 'Russian' occurs in
    # both language and account_type), making label_names ambiguous.
    x = pd.get_dummies(df[col].values, prefix=col)
    print(col, x.shape[1])
    label_names += x.columns.tolist()
    label_blocks.append(x.values)

for col in ['post_type','retweet']:
    # NOTE(review): post_type holds strings/NaN, so including it raw makes the
    # stacked matrix object-dtype -- confirm that is what downstream expects.
    x = df[col].values.reshape(-1,1)
    label_names.append(col)
    label_blocks.append(x)

# A single hstack replaces the former `labels != []` test, which compared an
# ndarray with a list (deprecated elementwise comparison), and avoids copying
# the growing matrix on every iteration.
labels = np.hstack(label_blocks)

labels.shape

region 36
language 56


  


account_type 20
account_category 8


(2973371, 122)

In [None]:
# References
# tweet2vec: https://github.com/bdhingra/tweet2vec
# Russian troll tweets dataset (FiveThirtyEight): https://github.com/fivethirtyeight/russian-troll-tweets/