Take a simple (non-stratified) random sample of tweets, using reservoir sampling, from all files in the "no retweets or quotes" folder.

### Version with Functions

In [1]:
import random
import os
import json

sample_size = 4000  # Number of tweets to keep in the reservoir.
result = []  # The reservoir: the raw JSON lines of the sampled tweets.
N = 0  # Number of eligible tweets seen so far.
hashtags = set()  # A set of desired hashtags.

# Load the desired hashtags, one per line of the text file.
# Surrounding whitespace (including the "\n" line terminator) is stripped
# instead of slicing with [:-1], so the last hashtag keeps its final character
# even when the file does not end with a newline. Blank lines are skipped.
# Entries are lowercased because hashtag_text() lowercases the hashtags of a
# tweet before they are compared with this set.
with open("partisan_hashtags.txt", "r") as f:
    hashtags = set(line.strip().lower() for line in f if line.strip())

def hashtag_text(hashtag):
    """Return the lowercased text of a single hashtag entry.

    `hashtag` is a mapping with a "text" entry, such as the items of
    tweet["entities"]["hashtags"].
    """
    text = hashtag["text"]
    return text.lower()

def has_hashtag(tweet):
    """Return True if the tweet contains at least one desired hashtag.

    The lowercased texts of all entries in tweet["entities"]["hashtags"] are
    collected and compared with the module-level `hashtags` set.
    """
    found = {hashtag_text(entry) for entry in tweet["entities"]["hashtags"]}
    # The tweet qualifies as soon as the two sets share one element.
    return not found.isdisjoint(hashtags)

def is_a_reply(tweet):
    """Return True if the tweet is a reply, False otherwise.

    A tweet counts as a reply when its "in_reply_to_status_id" entry is not
    None.
    """
    reply_target = tweet["in_reply_to_status_id"]
    if reply_target is None:
        return False
    return True

# Reservoir sampling: every eligible tweet in the folder ends up in `result`
# with the same probability, without loading all tweets into memory.

output_filename = "reservoir_sample_4000.json"
# The sample files written by this notebook also end in ".json" and are stored
# in the folder that is scanned below. They must never be read back in as
# input when the notebook is run again.
excluded_filenames = {output_filename, "4000_sample.json"}

# A fixed seed plus a fixed file order makes the sample reproducible.
# Set sample_seed to None to draw a different sample on every run.
sample_seed = 42
rng = random.Random(sample_seed)

for filename in sorted(os.listdir(".")):
    if filename in excluded_filenames:
        continue
    if os.path.isfile(filename) and filename.endswith(".json"):
        print("processing: " + filename)
        with open(filename, "r") as f:
            for line in f:
                # Blank lines cannot be parsed as JSON, and lines starting
                # with {"limit":{ are not tweets; neither may count towards N.
                if not line.strip() or line.startswith('{"limit":{'):
                    continue
                tweet = json.loads(line)
                if not has_hashtag(tweet) or is_a_reply(tweet): #Filters out tweets that do not contain a desired hashtag or are replies.
                    continue
                # Make sure the stored line ends with exactly one newline, so
                # that the last line of a file without a trailing newline is
                # not fused with the next tweet in the output.
                line = line.rstrip("\r\n") + "\n"
                N += 1
                if len(result) < sample_size:
                    # The first `sample_size` eligible tweets fill the reservoir.
                    result.append(line)
                else:
                    # Tweet number N replaces a random slot with probability
                    # sample_size / N.
                    s = rng.randrange(N)
                    if s < sample_size:
                        result[s] = line

with open(output_filename, "w") as outfile:
    outfile.write("".join(result))

processing: 2016-04-15_without_quotes.json
processing: 2016-04-16_without_quotes.json
processing: 2016-04-17_without_quotes.json
processing: 2016-04-18_without_quotes.json
processing: 2016-04-19_without_quotes.json
processing: 2016-04-20_without_quotes.json
processing: 2016-04-22_without_quotes.json
processing: 2016-04-23_without_quotes.json
processing: 2016-04-24_without_quotes.json
processing: 2016-04-25_without_quotes.json
processing: 2016-04-26_without_quotes.json
processing: 2016-04-27_without_quotes.json
processing: 2016-04-28_without_quotes.json
processing: 2016-04-29_without_quotes.json
processing: 2016-04-30_without_quotes.json
processing: 2016-05-01_without_quotes.json
processing: 2016-05-02_without_quotes.json
processing: 2016-05-03_without_quotes.json
processing: 2016-05-04_without_quotes.json
processing: 2016-05-05_without_quotes.json
processing: 2016-05-06_without_quotes.json
processing: 2016-05-07_without_quotes.json
processing: 2016-05-08_without_quotes.json
processing:

### Version without Functions (Does not work!)

In [None]:
import random
import os
import json

# NOTE: the earlier draft of this cell (the one flagged "Does not work!" in the
# heading) failed for these reasons, all of which are fixed below:
#   * desired_hashtags was never filled, so no tweet could ever match;
#   * parsed_line["entities"]["hashtags"] is a list of hashtag entries and
#     cannot be indexed with ["text"]; the text of each entry has to be read;
#   * lines starting with {"limit":{ were only checked after tweet fields had
#     already been accessed on them;
#   * every exception raised while parsing was silently swallowed.
sample_size = 4000
result = [] #The reservoir.
N = 0 #Number of eligible tweets seen so far.

output_filename = "4000_sample.json"
# The sample files written by this notebook also end in ".json" and are stored
# in the folder that is scanned below; they must not be read back in as input.
excluded_filenames = {output_filename, "reservoir_sample_4000.json"}

# A fixed seed plus a fixed file order makes the sample reproducible.
# Set sample_seed to None to draw a different sample on every run.
sample_seed = 42
rng = random.Random(sample_seed)

# One desired hashtag per line; lowercased to match the comparison below.
with open("partisan_hashtags.txt", "r") as f:
    desired_hashtags = set(entry.strip().lower() for entry in f if entry.strip())

for filename in sorted(os.listdir(".")):
    if filename in excluded_filenames:
        continue
    if os.path.isfile(filename) and filename.endswith(".json"):
        print("processing: " + filename)
        with open(filename, "r") as f:
            for line in f:
                # Lines starting with {"limit":{ are skipped before anything
                # else happens, so they never count towards N.
                if line.startswith('{"limit":{'):
                    continue
                try:
                    parsed_line = json.loads(line) #Parse the JSON (otherwise it is just text).
                except ValueError:
                    # Not valid JSON (e.g. a blank line): skip it.
                    continue
                if parsed_line["in_reply_to_status_id"] is not None:
                    continue #Replies are not sampled.
                tweet_hashtags = set(entry["text"].lower() for entry in parsed_line["entities"]["hashtags"])
                if not (desired_hashtags & tweet_hashtags):
                    continue #None of the desired hashtags is present.
                # Make sure the stored line ends with exactly one newline, so
                # that lines are never fused in the output file.
                line = line.rstrip("\r\n") + "\n"
                N += 1
                if len(result) < sample_size:
                    result.append(line)
                else:
                    # Tweet number N replaces a random slot with probability
                    # sample_size / N.
                    s = rng.randrange(N)
                    if s < sample_size:
                        result[s] = line

with open(output_filename, "w") as outfile:
    outfile.write("".join(result))

### Sort the list of hashtags and convert to lowercase

In [None]:
# Raw list of hashtags, each written with its leading "#" and in its original
# capitalisation.
l = ["#otherref", "#HearOtherVoices", "#CatsAgainstBrexit", "#CatsforBrexit", "#Cats4Brexit", "#Cats4Britain", "#PetsforBritain", "#Pets4Britain", "#Pets4Brexit", "#PetsforBrexit", "#PetsAgainstBrexit", "#Mutts4Remain", "#DogsAgainstBrexit", "#DogsforBrexit", "#Dogs4Brexit", "#Dogs4Britain", "#BrexitBusTour", "#standupforeurope", "#votin", "#hugabrit", "#whyvoteleave", "#BrexitLies", "#BrexitAndChill", "#BrexitTheMovie", "#GoodbyeEU", "#LeaveEurope", "#BadBrexit", "#Brexiteer", "#BrexitFantasy", "#BritsDontQuit", "#CloutNotOut", "#LeadingNotLeaving", "#BankersLoveBrussels", "#fishingforleave", "#wewantourfishback", "#britinds4in", "#EUisTheProblem", "#LeftLeave", "#LeaveAlliance", "#TheLeaveAlliance", "#43brokenpromises",  "#beleave", "#BeLeaver", "#BelfastGO", "#BetterIn", "#betterineu", "#betterineurope", "#BetterOffIn", "#BetterOffOut", "#betterout", "#bettertogether", "#bolstertheborders", "#Bremain", "#Brexin", "#Brexitfears", "#Brexitnow", "#Brexitrisks", "#brexodus", "#britainfirst", "#BritainIn", "#BritainOut",  "#britin", "#ByeByeEU", "#campaigntoremain", "#Conservatives4Britain", "#ConservativesforBritain", "#ConservativesIn", "#davesdodgydeal", "#DontWalkAway",      "#eugood4uk", "#EUin", "#EUleave", "#EUout", "#EURefIn", "#EURefOut", "#EUremain",  "#EUstay", "#exiteu", "#fucktheEU", "#GetBritainOut", "#goactionday", "#GOSuperSaturday", "#grassroots_out", "#grassrootsout", "#grassrootsoutgo", "#greenerin", "#incampaign",  "#Intogether", "#justnotintoEU", "#labourgo", "#LabourIn", "#LabourInForBritain", "#LabourLeave", "#LeadNotLeave", "#Leave", "#leavechaos", "#LeaveEU", "#leaveeuofficial", "#leavetheeu", "#leaveuk", "#LeavingEU", "#LetsTakeBackControl", "#London4Europe", "#LoveEuropeLeaveEU", "#makebritaingreatagain", "#no2brexit", "#No2EU", "#nobrexit", "#notbetteroffout", "#nothankseu", "#notintoeu", "#notobrexit", "#NotoEU", "#outcampaign", "#outeu", "#pleasevoteleave", "#proeu", "#proeurope", "#projectbullshit", "#projectcheer", "#ProjectFact", 
"#projectfantasy", "#ProjectFear", "#projecthope",  "#ProjectNasty", "#projectpanic", "#projectsmear", "#projecttruth", "#projectwhinge", "#Remain", "#remainandgain", "#remainer", "#remainers", "#RemainEU", "#remainiac", "#remainiacs", "#Remainian", "#remainians", "#remainin", "#RemaininEU", "#saferout",  "#SayNo2EU", "#SayNoToEU", "#SayYesToEU", "#stayEU", "#stayin", "#StayinEU", "#StayinEurope", "#StoptheEU", "#StrongerIn", "#strongerout", "#studentsin", "#uk4eu",  "#UKin", "#UKinEU", "#UKinEurope", "#UKout", "#UKoutEU", "#UKtoStay", "#unisforeu",  "#vote_leave", "#vote2stay", "#VoteIn", "#VoteLeave", "#VoteLeaveEU", "#VoteOut", "#VoteRemain", "#VoteStay", "#VoteToLeave", "#womenforbritain", "#Yes2EU", "#yeseu", "#YestoEU", "#RisksofStaying", "#takecontrol", "#WeAreEurope", "#infor", "#HealthierIn", "#Lexit", "#ImLeaveBecause", "#flexcit", "#BritainInEurope", "#StudentsGO", "#BusinessGO"]

#Drop the first character of every hashtag (the "#"), lowercase the rest and
#sort the results alphabetically.
#Evaluating the sorted expression on its own displays a list in which each
#hashtag appears in quotes:
#    sorted(x[1:].lower() for x in l)

#Printing one hashtag per line instead (no commas, no quotes) makes the output
#easy to export to a .txt file.
stripped_lowercase = [tag[1:].lower() for tag in l]
stripped_lowercase.sort()
for y in stripped_lowercase:
    print(y)

### Test of script that opens the hashtag .txt file as a list

In [None]:
# Read the hashtag file into a list, one entry per line.
# Only the line terminator is removed (instead of slicing with [:-1]), so the
# last entry keeps its final character even when the file does not end with a
# newline.
with open("partisan_hashtags.txt", "r") as f:
    h = [line.rstrip("\r\n") for line in f]
h