In [545]:
class PostingsWrapper():
    """
    Per-term record that ties an entry of the index dictionary to its slot in the
    shared postings list.

    Attributes set by the constructor:
      * frequency      -- number of postings stored for the term so far (starts at 1)
      * postings_index -- position inside the postings list that holds this term's postings
    """
    def __init__(self, postings_list, posting, postings_index):
        """
        Registers a new term: appends a fresh one-element postings list to
        ``postings_list`` and remembers the slot number supplied by the caller.

        :param postings_list: the shared list of postings lists; mutated in place.
        :param posting: the first posting for the term.
        :param postings_index: slot in ``postings_list`` the caller associates with this term.
        """
        self.frequency, self.postings_index = 1, postings_index
        postings_list.append([posting])


    def add_posting(self, postings_list, posting):
        """
        Adds a further posting to the term's existing postings list.

        The posting is appended, and the frequency incremented, only when it is
        not already present in the term's postings; duplicates are ignored.

        :param postings_list: the shared list of postings lists; mutated in place.
        :param posting: the posting to be added.
        :return: returns nothing
        """
        term_postings = postings_list[self.postings_index]
        if posting in term_postings:
            return
        term_postings.append(posting)
        self.frequency = self.frequency + 1


class index:
    """
    Inverted index over a tab-separated tweets file.

    Processes the tweets.csv file (or any file with the same structure) and builds:

      * ``data``          -- list of ``[tweet_id, normalized_text]`` pairs,
      * ``data_index``    -- dict mapping each tweet id to its original text,
      * ``index``         -- dict mapping each term to a PostingsWrapper instance,
      * ``postings_list`` -- list of postings lists (tweet ids); a term's list lives at
        ``self.index[term].postings_index``.

    All query methods operate on ``self``, so they work on any instance regardless
    of what the global name ``index`` is bound to.
    """
    def __init__(self, file):
        """
        :param file: path to tweets.csv file.
        """
        self.data, self.data_index = self.preprocess(file)
        self.index, self.postings_list = self.create_index()

    def preprocess(self, file):
        """
        Reads the raw file, splits it into tab-separated rows and normalizes the
        tweet text: it is lowercased, the literal 'newline' is replaced by a space,
        and everything matched by the cleanup regex is removed. Only the tweet id
        (column 1) and the tweet text (column 4) are kept.

        Rows that do not have enough columns to contain both fields (for example
        blank lines) are skipped.

        :param file: path to tweets.csv file.
        :return: ``(data, data_index)`` where ``data`` is a list of
                 ``[tweet_id, normalized_text]`` and ``data_index`` maps each
                 tweet id to the original, unmodified text.
        """
        import re

        # The corpus contains umlauts and emoji, so decode explicitly instead of
        # relying on the platform default; `with` guarantees the handle is closed.
        with open(file, encoding='utf-8') as handle:
            raw_text = handle.read()

        # Build a filtered list instead of removing items while iterating, which
        # skipped elements and left blank rows behind.
        rows = [row.split('\t') for row in raw_text.split('\n')]
        rows = [row for row in rows if len(row) > 4]

        cleanup = r'[^\w\s]+.[^\W\s]+|[^ ]+\.[^ ]+ |[^a-zA-Zäöüß\s]+ | \d+|[^\w\s]+.[^\W\s]+| https?'

        data = []
        data_index = {}
        for row in rows:
            tweet_id, text = row[1], row[4]
            normalized = text.lower().replace('newline', ' ')
            normalized = re.sub(cleanup, '', normalized)
            data.append([tweet_id, normalized])
            data_index[tweet_id] = text

        return data, data_index

    def create_index(self):
        """
        Creates the index and postings list from ``self.data``.

        :return: ``(index, postings_list)``: a dictionary with each unique term as
                 key and a PostingsWrapper instance as value, and a list of lists
                 holding the postings (tweet ids) of each unique term.
        """
        from nltk.corpus import stopwords

        stop_words = set(stopwords.words('english')) | set(stopwords.words('german'))

        # Split every normalized tweet on whitespace; drop stop words and tokens
        # of length 1 or less, and pair each remaining token with its tweet id.
        tokens_and_ids = [
            [token, tweet_id]
            for tweet_id, text in self.data
            for token in text.split()
            if len(token) > 1 and token not in stop_words
        ]

        # Sorting groups equal terms together and orders each term's tweet ids,
        # which the merge in query_and relies on.
        tokens_and_ids.sort()

        term_index = {}
        postings_list = []
        for term, tweet_id in tokens_and_ids:
            wrapper = term_index.get(term)
            if wrapper is None:
                # PostingsWrapper appends a new postings list on construction, so
                # the current length is exactly the slot that list will occupy.
                term_index[term] = PostingsWrapper(postings_list, tweet_id, len(postings_list))
            else:
                wrapper.add_posting(postings_list, tweet_id)

        return term_index, postings_list

    def _postings_for(self, term):
        """Returns the postings list stored for ``term``, or None if the term is not indexed."""
        wrapper = self.index.get(term)
        if wrapper is None:
            return None
        return self.postings_list[wrapper.postings_index]

    @staticmethod
    def _intersect(postings1, postings2):
        """Returns the elements common to two sorted postings lists, in sorted order (linear merge)."""
        matches = []
        pos1 = pos2 = 0
        while pos1 < len(postings1) and pos2 < len(postings2):
            left, right = postings1[pos1], postings2[pos2]
            if left == right:
                matches.append(left)
                pos1 += 1
                pos2 += 1
            elif left < right:
                pos1 += 1
            else:
                pos2 += 1
        return matches

    def get_frequency(self, term):
        """
        Pulls the frequency from the wrapper of a term.

        :param term: the term to look up.
        :return: the wrapper's frequency, or None (after printing 'Term not found.')
                 when the term is not in the index.
        """
        try:
            return self.index[term].frequency
        except KeyError:
            print('Term not found.')
            return None

    def All_frequencies(self):
        '''
        Returns all ``(frequency, term)`` pairs in descending order.
        '''
        pairs = ((wrapper.frequency, term) for term, wrapper in self.index.items())
        return sorted(pairs, reverse=True)

    def query_one(self, term):
        """
        Queries for a term and prints each matching tweet id with its original text.

        :param term: query term
        :return: nothing; prints the results, or 'No results for query.' when the
                 term is not in the index.
        """
        postings = self._postings_for(term)
        if postings is None:
            print('No results for query.')
            return
        for posting in postings:
            print(posting, self.data_index[posting], '\n')

    def query_and(self, term1, term2):
        """
        Queries for the intersection of two terms and prints each tweet id found
        in both postings lists together with its original text.

        :param term1: first term
        :param term2: second term
        :return: nothing; prints the results, 'No results for query.' when the
                 intersection is empty, or 'Error: 1 or more terms not found.'
                 when either term is not in the index.
        """
        postings1 = self._postings_for(term1)
        postings2 = self._postings_for(term2)
        if postings1 is None or postings2 is None:
            print('Error: 1 or more terms not found.')
            return

        intersection = self._intersect(postings1, postings2)
        if not intersection:
            print('No results for query.')
            return
        for tweet_id in intersection:
            print(tweet_id, self.data_index[tweet_id], '\n')
#     def query_three(self, term1, term2, term3):
#         self.query_and(self.query_and(term1,term2),term3)
    

In [546]:
# Build the inverted index from tweets.csv; at this point `index` is still the class
# defined in the previous cell.
renou_index = index('tweets.csv')

In [547]:
# Serialize the built index object to renou_index3.pkl.
# NOTE(review): `pickle` is not imported in the cells visible here — confirm it is
# imported earlier in the notebook.
with open('renou_index3.pkl', 'wb') as f:
    pickle.dump(renou_index, f)

In [548]:
# Load the pickled index object back from renou_index3.pkl.
# NOTE(review): this rebinds the global name `index` from the class to the loaded
# instance. The cells below call methods on `index` as an instance, and after this
# cell `index(...)` can no longer be used to construct a new index without
# re-running the class definition cell.
with open('renou_index3.pkl', 'rb') as f:
    index = pickle.load(f)

In [555]:
# Example queries; `index` is the instance loaded from the pickle, not the class.
# The AND query prints every tweet whose postings contain both terms.
#index.query_one('fr')
#index.query_one('and')
index.query_and('nacht', 'schlafen')

1003063075333922821 einfach befriedigend mitten in der nacht aus dem offenen fenster zu sehen und zu wissen dass gerade alle schlafen, als wär man der einzige mensch den es gerade gibt 

1009499399972642816 @the_necrosis @robin_urban Mein Betreuer wusste grob wegen #CRD und #non24h Bescheid. Wann waren die Termine mit ihm? Später vormittag. Das hat mich so gestresst, dass ich die Nacht vorher noch später erst schlafen konnte als eh schon. (War ja ohne #orphanmedi)[NEWLINE]Auch sicher keine #Inklusion. 

1011758043841916930 ich kann die letzten tage wieder besser schlafen und es ist so entlastend, mal mehr als vier stunden pro nacht zu schlafen obwohl man eigentlich ausschlafen könnte 

1016073619791925250 Gute Nacht Leute ich geh dann mal Schlafen👋🏻[NEWLINE]#Faultier #Love #JaDasBinIch #BinIchNichtHübsch https://t.co/uFX9Qwwoxu 

960632815414075392 @despacitolea @Polwnn @SofiStorm22 Ich werd am Valentinstag schlafen weil ich die Nacht durchschneiden muss #goals #love 

9624732455209410

In [558]:
# Show the five highest (frequency, term) pairs, then look up a single term.
# NOTE(review): 'für' is reported as not found — presumably because it is removed as
# a German stop word during indexing; confirm against the NLTK stop word list.
print(index.All_frequencies()[:5])
index.get_frequency('für')

[(5207, 'mehr'), (4493, 'happy'), (4225, 'mal'), (4164, 'really'), (3782, "i'm")]
Term not found.
