In [5]:
import matplotlib.pyplot as plt
%matplotlib inline


## Sampling a Data Stream

The `TwitterStream` class defined below simulates a Twitter stream. It works the same way as a list, tuple, or any other iterable that you may have worked with before: you can loop over it to receive one tweet at a time. Each tweet may or may not contain emojis. There is also a helper function, `extract_emojis`, that extracts all emojis from a piece of text. It may also be useful to know that the variable `UNICODE_EMOJI` is a collection of all emojis that are circulating around the world.

In [6]:
import json
import emoji
import re

# Compiled once at definition time instead of on every call.
# Every code point is written as an escape so the pattern survives any
# re-encoding of this file; literal emoji / sign characters pasted into the
# character class are easily corrupted into unrelated Latin letters, which
# would then be matched as if they were emojis.
_EMOJI_PATTERN = re.compile(
    r'['
    r'\U0001F600-\U0001F64F'  # emoticons
    r'\U0001F300-\U0001F5FF'  # miscellaneous symbols and pictographs
    r'\U0001F680-\U0001F6FF'  # transport and map symbols
    r'\U0001F700-\U0001F77F'  # alchemical symbols
    r'\U0001F780-\U0001F7FF'  # geometric shapes extended
    r'\U0001F800-\U0001F8FF'  # supplemental arrows-C
    r'\U0001F900-\U0001F9FF'  # supplemental symbols and pictographs
    r'\U0001FA00-\U0001FA6F'  # chess symbols
    r'\U0001FA70-\U0001FAFF'  # symbols and pictographs extended-A
    r'\U0001F200-\U0001F251'  # enclosed ideographic supplement (part)
    r'\u200D'                 # zero-width joiner used inside composed emojis
    r'\u2642\u2640'           # male / female signs used as gender modifiers
    r'\uFE0F'                 # variation selector-16 (emoji presentation)
    r']+'
)

def extract_emojis(text):
    """
    Extract all emojis from a str using regular expressions

    text: the string to scan.

    Returns a list of str, one entry per maximal run of consecutive matching
    characters, in order of appearance. Adjacent emojis are therefore returned
    together as a single entry rather than one entry per emoji. Returns an
    empty list when nothing matches.
    """
    return _EMOJI_PATTERN.findall(text)

class TwitterStream:
    """
    Used to simulate a Twitter stream.

    Iterating over an instance yields the "text" field of each JSON record
    stored in data_file, one line at a time. Every call to iter() (and hence
    every for-loop) reopens the file and starts again from the first record.
    """
    
    def __init__(self, data_file):
        """
        data_file: path of a text file holding one JSON object per line,
                   each object having a "text" key.
        """
        self.data_file = data_file
        # Explicit UTF-8: without it open() uses the locale's default encoding,
        # which fails or garbles emoji-bearing JSON on non-UTF-8 platforms.
        self.data = open(self.data_file, "r", encoding="utf-8")
    
    def __iter__(self):
        # Restart from the top of the file so the stream can be looped over repeatedly.
        return self.reset()
    
    def __next__(self):
        """
        Returns the "text" value of the next record.
        Raises StopIteration at end of file.
        """
        while True:
            next_line = self.data.readline()
            if not next_line:
                # readline() returns "" only at end of file
                raise StopIteration
            if next_line.strip():
                return json.loads(next_line)["text"]
            # whitespace-only line: not a record, keep reading
    
    def __del__(self):
        # self.data is missing when __init__ failed before the file was opened,
        # so look it up defensively instead of raising inside the finalizer.
        data = getattr(self, "data", None)
        if data is not None and not data.closed:
            data.close()
    
    def reset(self):
        """
        Closes the current file handle (if still open), reopens data_file
        from the beginning and returns self.
        """
        if not self.data.closed:
            self.data.close()
        self.data = open(self.data_file, "r", encoding="utf-8")
        return self

### Random Sampling

In [7]:
from collections import defaultdict
from random import Random

# Define HistPresvRandom
class HistPresvRandom:
    """
    Random number generator that remembers everything it has produced.

    Each value handed out is appended to self.hist under the name of the
    method that produced it ("random" or "sample").
    """

    def __init__(self, seed=None):
        self.hist = defaultdict(list)
        self.prg = Random(seed)

    def random(self):
        """Draw like random.random(), logging the value under hist["random"]."""
        drawn = self.prg.random()
        self.hist["random"] += [drawn]
        return drawn

    def sample(self, population):
        """Draw like random.sample(population, 1)[0], logging the value under hist["sample"]."""
        (picked,) = self.prg.sample(population, 1)
        self.hist["sample"] += [picked]
        return picked
## Define random sampling algorithm
class RandomSampler:
    """
    Keeps each stream item with a fixed probability and tallies the emojis
    found in the items that were kept.
    """

    def __init__(self, in_sample_prob, seed=None):
        self.in_sample_prob = in_sample_prob
        # single source of randomness; it also records every draw it makes
        self.random = HistPresvRandom(seed)
        self.sample = []
        self.counts = defaultdict(int)

    def _process_new_item(self, item):
        """
        Draws one random number for the item; when the draw falls below
        in_sample_prob the item joins the sample and its emojis are counted.
        """
        keep = self.random.random() < self.in_sample_prob
        if not keep:
            return

        self.sample.append(item)
        for symbol in extract_emojis(item):
            self.counts[symbol] += 1

    def do_sampling(self, stream):
        """
        Generator: resets the sample and counts, then feeds every stream item
        through _process_new_item, yielding copies of (sample, counts) after
        each item.
        """
        # start from a clean slate on every run
        self.sample.clear()
        self.counts.clear()

        for item in stream:
            self._process_new_item(item)

            # snapshots, so later mutation does not alter what was already yielded
            yield self.sample[:], self.counts.copy()

Now, let's see what the emoji distribution is after all tweets are processed.

In [8]:
# Keep roughly one tweet in ten; the fixed seed makes the run repeatable
in_sample_prob = 0.1
seed = 42
ran_emo = RandomSampler(in_sample_prob, seed)

# The sampler is a generator, so it only does work while being consumed.
# The per-step snapshots are not needed here; just run it to the end.
for _ in ran_emo.do_sampling(TwitterStream("Data/tweets")):
    pass

# Emoji counts ordered from most to least frequent
sorted_counts = dict(sorted(ran_emo.counts.items(), key=lambda pair: pair[1], reverse=True))
print(sorted_counts)

{'Ô∏è': 141, 'üòÇ': 34, 'ü•∫': 19, 'üòç': 16, 'üò≠': 15, 'üòä': 12, 'üòÇüòÇüòÇ': 11, 'ü§£': 10, 'üî•': 9, 'üò≠üò≠': 8, 'ü§î': 8, 'üòÇüòÇ': 7, 'üòÜ': 7, 'üíõ': 7, 'üëç': 7, 'üíï': 6, 'ü•∞': 6, 'üåü': 6, 'üòÅ': 6, 'üí¶': 6, 'üòé': 6, 'üò¢': 6, 'ü§≠': 6, 'üí™': 5, 'üòÇüòÇüòÇüòÇ': 5, 'ü¶ã': 5, 'üëá': 5, 'üôÑ': 5, 'üéâ': 5, 'üòÖ': 5, 'üòò': 4, 'üå∏': 4, 'üå∫': 4, 'üëè': 4, 'üíô': 4, 'üö®üö®': 4, 'üó£': 4, 'üò≥': 4, 'ü•≥': 4, 'üèÖ': 4, 'üòå': 3, 'üòî': 3, 'üèª': 3, 'üíù': 3, 'üò©': 3, 'üîÉ': 3, 'ü§æ\u200d‚ôÇÔ∏è': 3, 'üçë': 3, 'üò∫': 3, 'üî•üî•': 3, 'ü§£ü§£': 3, 'üòà': 3, 'üëã': 3, 'üëÄ': 3, 'üíó': 3, 'üê∞': 3, 'üíñ': 3, 'üì±': 3, 'üò•': 3, 'üí≠': 3, 'üíö': 3, 'üê∂': 2, 'üíî': 2, 'üôå': 2, 'ü•¥': 2, 'üò°': 2, 'üò±üò±üò±': 2, 'üì¢': 2, 'üòÜüòÜ': 2, 'üñ§': 2, 'üíé': 2, 'üó£Ô∏è': 2, 'ü•µ': 2, 'üéÄ': 2, 'üîí': 2, 'üôÇ': 2, 'üëèüèª': 2, 'üôÉ': 2, 'üòá': 2, 'üòÉ': 2, 'ü•∫ü•∫ü•∫': 2, 'üëãüèª': 2, 'üôè'

### Reservoir Sampling

In [9]:
from collections import defaultdict

class ReservoirSampler:
    """
    Maintains a sample of at most sample_size stream items (the reservoir)
    together with counts of the emojis found in the items currently held.
    """
    
    def __init__(self, sample_size, seed=None):
        """
        sample_size: maximum number of items held in the sample.
        seed: seed passed to the history-preserving random number generator.
        """
        self.sample_size = sample_size
        self.random = HistPresvRandom(seed) # used whenever randomness is needed in your solution
        self.sample, self.counts = list(), defaultdict(int)
    
    def _count_emojis(self, item):
        """Adds one to the count of every emoji run found in item."""
        for found in extract_emojis(item):
            self.counts[found] += 1
    
    def _uncount_emojis(self, item):
        """Subtracts one from the count of every emoji run found in item, dropping keys that reach zero."""
        for found in extract_emojis(item):
            self.counts[found] -= 1
            if self.counts[found] <= 0:
                # keep counts free of zero entries so it only lists emojis present in the sample
                del self.counts[found]
    
    def _process_new_item(self, item, index):
        """
        Decides whether a new item should be added to the sample and adjusts the counts accordingly

        item: the newly arrived stream element.
        index: 0-based position of item in the stream.

        The first sample_size items are always kept and consume no random
        numbers. Every later item is admitted with probability
        sample_size / (index + 1); when admitted, it replaces an element
        removed from a randomly drawn position of the sample.
        """
        if index < self.sample_size:
            # Fill phase: handled here (not only in do_sampling) so that this
            # method never tries to evict from a reservoir that is not yet full.
            self.sample.append(item)
            self._count_emojis(item)
            return
        
        prob = self.sample_size / (index + 1)
        
        if self.random.random() <= prob:
            idx_to_remove = self.random.sample(range(self.sample_size))
            self._uncount_emojis(self.sample.pop(idx_to_remove))
            
            self.sample.append(item)
            self._count_emojis(item)
    
    def do_sampling(self, stream):
        """
        Iterates over a stream and performs reservoir sampling

        Generator: clears the sample and counts, then processes the stream
        item by item, yielding copies of (sample, counts) after each item.
        """
        
        self.sample.clear() # clear the existing sample
        self.counts.clear() # clear the existing counts
        
        for index, item in enumerate(stream): # iterate over the stream
            self._process_new_item(item, index)
            
            # returns a copy of sample and counts at the end of every iteration for grading - code given
            yield self.sample.copy(), self.counts.copy()

Let's see what the emoji distribution is after all tweets are processed.

In [10]:
# Reservoir of 100 tweets; the fixed seed makes the run repeatable
sample_size = 100
seed = 0
stu_ans = ReservoirSampler(sample_size, seed)

# The sampler is a generator, so it only does work while being consumed.
# The per-step snapshots are not needed here; just run it to the end.
for _ in stu_ans.do_sampling(TwitterStream("Data/tweets")):
    pass

# Emoji counts ordered from most to least frequent
sorted_counts = dict(sorted(stu_ans.counts.items(), key=lambda pair: pair[1], reverse=True))
print(sorted_counts)

{'Ô∏è': 14, 'üíô': 7, 'üòÇ': 4, 'ü•∞ü•∞ü•∞ü•∞': 3, 'üò≠üò≠': 3, 'üëç': 2, 'üòç': 2, 'üëë': 2, 'üíï': 1, 'üòè': 1, 'üé∂': 1, 'üòòüòò': 1, 'Ô∏èüéµ': 1, 'ü•∫ü•∫ü•∫ü•∫ü•∫ü•∞ü•∞ü•∞ü•∞ü•∞üò≠üò≠üò≠': 1, 'üò≠üò≠üò≠üò≠': 1, 'üêü': 1, 'üê∞': 1, 'üë´': 1, 'üë©\u200d': 1, 'Ô∏è\u200düíã\u200düë©': 1, 'üòà': 1, 'ü•∞üíöüèÄüèàüé§': 1, 'üò†': 1, 'üôÉüôÉ': 1, 'üåà': 1, 'üòãüòã': 1, 'üèº': 1, 'üò≠üò≠üò≠üò≠üò≠üò≠üò≠üò≠üò≠üò≠': 1, 'üò∑': 1, 'ü§¶üèª\u200d‚ôÇÔ∏è': 1, 'üòÇüòÇüòÇüòÇ': 1, 'ü§ì': 1, 'ü§∑üèø\u200d‚ôÇÔ∏è': 1, 'üî•': 1, 'üê≥': 1, 'üí∞': 1, 'ü•á': 1, 'üëèüî•': 1, 'üëâ': 1, 'üèÜüèÜüèÜ': 1, 'ü¶æ': 1, 'üòû': 1, 'üíôüì∏': 1, 'ü§ë': 1, 'üôèüèæ': 1, 'Ô∏èüòçüôÜüèΩ\u200d‚ôÇÔ∏è': 1, 'üç™': 1, 'üòá': 1, 'üòâ': 1, 'ü•¥': 1, 'üò©': 1, 'üèØ': 1, 'ü¶Ö': 1, 'üò±': 1, 'üòì': 1, 'üëá': 1, 'ü§ó': 1, 'ü•∫üíì': 1, 'üòçüòçüòçüòç': 1, 'üò§üò©': 1, 'üò≠üê±üê•üçºüíï': 1, 'üò™ü•∫': 1}
