In [1]:
# Mount Google Drive at /content/gdrive; later cells read and write files
# under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [25]:
import pandas as pd
import base64
import numpy as np
# import imageio
import os
import scipy
import gensim
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import gensim.corpora as corpora
import itertools
import nltk
# Download the NLTK resources this notebook relies on (stop-word list,
# 'punkt' and 'wordnet' — presumably needed by word_tokenize and
# WordNetLemmatizer; confirm against the NLTK docs).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
# English stop words; tokens found in this list are skipped when the
# inverted index is built.
stops = stopwords.words('english')

def removeLineBreaks(tweet):
    """Replace every line break in ``tweet`` with a single space.

    The two-character breaks (LF+CR and CR+LF) are listed before the
    single-character ones so that each pair collapses to one space
    rather than two.
    """
    line_break = "\n\r|\r\n|\n|\r"
    return re.sub(line_break, " ", tweet)

def removeURLs(tweet):
    """Replace every http/https/ftp URL in ``tweet`` with a single space.

    The pattern is written as a raw string so that regex escapes such as
    ``\\w`` and ``\\.`` reach the regex engine unchanged instead of being
    (invalid) Python string escapes, which newer Python versions warn about.

    Args:
        tweet: text to clean.

    Returns:
        The text with each matched URL replaced by one space.
    """
    url_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
    return re.sub(url_pattern, " ", tweet)

def removeEmojis(tweet):
    """Drop emojis from ``tweet`` by discarding every non-ASCII character.

    The text is encoded to ASCII with undecodable characters ignored and
    then decoded back, so accented letters and other non-ASCII symbols
    are removed as well, not only emojis.
    """
    ascii_only = tweet.encode('ascii', 'ignore')
    return ascii_only.decode('ascii')

def isRetweet(tweet):
    """Return True when ``tweet`` contains a retweet tag (``RT @handle:``).

    The handle may contain letters, digits and underscores; the tag may
    appear anywhere in the text, not only at the start.

    Args:
        tweet: text to inspect.

    Returns:
        bool: whether a retweet tag was found.
    """
    # Underscore is included in the handle class so tags such as
    # "RT @user_name:" are recognised.
    return bool(re.search(r"RT @[A-Za-z0-9_]*:", tweet))

def removeRTtag(tweet):
    """Replace each retweet tag (``RT @handle: ``) in ``tweet`` with a space.

    The handle may contain letters, digits and underscores. The trailing
    space after the colon is part of the match.

    Args:
        tweet: text to clean.

    Returns:
        The text with every retweet tag replaced by one space.
    """
    # Underscore is included in the handle class so tags such as
    # "RT @user_name: " are removed too.
    return re.sub(r"RT @[A-Za-z0-9_]*: ", " ", tweet)

def removeMentions(tweet):
    """Replace every ``@handle`` mention in ``tweet`` with a single space.

    The handle may contain letters, digits and underscores. Because the
    handle part is optional in the pattern, a bare ``@`` is replaced too.

    Args:
        tweet: text to clean.

    Returns:
        The text with each mention replaced by one space.
    """
    # Underscore is included so "@user_name" is removed as a whole rather
    # than leaving "_name" behind.
    return re.sub(r"@[A-Za-z0-9_]*", " ", tweet)

def removeMultipleSpaces(tweet):
    """Collapse each run of consecutive spaces in ``tweet`` into one space."""
    space_run = re.compile(" +")
    return space_run.sub(" ", tweet)

def lowercasetweet(tweet):
    """Return ``tweet`` converted to lowercase."""
    lowered = tweet.lower()
    return lowered

def removePunctuations(tweet):
    """Replace each run of punctuation marks in ``tweet`` with one space.

    The marks covered are period, comma, exclamation mark, single and
    double quote, semicolon, colon, question mark and the ellipsis
    character.
    """
    punctuation_run = re.compile("[.,!'\";:?…]+")
    return punctuation_run.sub(" ", tweet)

def removeSpecialCharacters(tweet):
    """Replace each run of special characters in ``tweet`` with one space.

    The characters covered are those listed in the character class below
    (symbols such as ``@ # $ % ^ *``, brackets, backslash, slash, pipe,
    ``= + - & _`` and a couple of non-ASCII symbols).

    The pattern is a raw string so the regex escapes (``\\[``, ``\\]``,
    ``\\+``, ``\\-``, ``\\\\``) are passed to the regex engine verbatim
    instead of relying on invalid Python string escapes, which newer
    Python versions warn about. The compiled pattern is unchanged.

    Args:
        tweet: text to clean.

    Returns:
        The text with each run of special characters replaced by a space.
    """
    special_run = r"[@#$%^*(){}\\<>\[\]~/|=\+\-&_¿ߒ]+"
    return re.sub(special_run, " ", tweet)

def removeAlphaNumeric(tweet):
    """Delete every run of digits from ``tweet``.

    Despite the name, only the digits are removed; letters next to them
    are kept and nothing is inserted in place of the digits.
    """
    digit_run = re.compile("[0-9]+")
    return digit_run.sub("", tweet)

def lemmatizeTweet(tweet):
    """Tokenize ``tweet`` with NLTK and lemmatize every token.

    Args:
        tweet: text to tokenize.

    Returns:
        list: one lemmatized token per token produced by ``word_tokenize``.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(tweet)]

def cleanData(text, lowercase = False, remove_stops = False, stemming = False, lemmatization = False):
    """Normalise contractions, emoticons and stretched words in ``text``.

    Steps, applied in this order:
      1. ``text`` is converted with ``str()``.
      2. A fixed list of (lowercase) contractions is expanded, e.g.
         "isn't" -> "is not", "'s" -> " is".
      3. The emoticons ``:)``, ``:D``, ``:P`` become " happy " and ``:(``
         becomes " sad ".
      4. Every run of one repeated character is cut to at most two
         characters (e.g. "sooooo" -> "soo"); this applies to any
         character, including spaces.

    The ``lowercase``, ``remove_stops``, ``stemming`` and ``lemmatization``
    flags are accepted but not used by the current implementation.

    Returns:
        str: the normalised text.
    """
    # Order matters: whole-word contractions go before the generic suffixes.
    contractions = (
        ("isn't", "is not"),
        ("aren't", "are not"),
        ("ain't", "am not"),
        ("won't", "will not"),
        ("didn't", "did not"),
        ("shan't", "shall not"),
        ("haven't", "have not"),
        ("hadn't", "had not"),
        ("hasn't", "has not"),
        ("don't", "do not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("doesn't", "does not"),
        ("'s", " is"),
        ("'re", " are"),
        ("'m", " am"),
        ("'d", " would"),
        ("'ll", " will"),
    )
    emoticons = (
        (r':\)', r' happy '),
        (r':D', r' happy '),
        (r':P', r' happy '),
        (r':\(', r' sad '),
    )

    txt = str(text)

    for short_form, long_form in contractions:
        txt = txt.replace(short_form, long_form)

    for emoticon, feeling in emoticons:
        txt = re.sub(emoticon, feeling, txt)

    # Keep at most the first two characters of every run of equal characters.
    trimmed_runs = [
        ''.join(itertools.islice(run, 2))
        for _, run in itertools.groupby(txt)
    ]
    return ''.join(trimmed_runs)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
import os
import pickle

# Load the corpus. Each row is used as (document id, document text).
# NOTE(review): allow_pickle=True unpickles the file's contents, which can
# execute arbitrary code — only load this file from a trusted source.
final = np.load('/content/gdrive/My Drive/IR Assignment/IR_assignment.npy',allow_pickle=True)

uniq_words=[]   # vocabulary, in order of first appearance
count={}        # word -> total number of occurrences across all documents
index={}        # word -> list of document ids containing the word (no duplicates)


def _tokenizeDocument(text):
    """Run the cleaning pipeline on one document and return its lemmatized tokens.

    URLs and mentions are removed *before* punctuation and special
    characters are stripped; otherwise the characters those patterns rely
    on ("://", ".", "@") are already gone and nothing would match.
    """
    text = removeLineBreaks(text.lower())
    text = removeURLs(text)
    text = removeMentions(text)
    text = cleanData(text)
    text = removeAlphaNumeric(text)
    text = removePunctuations(text)
    text = removeSpecialCharacters(text)
    text = removeEmojis(text)
    text = removeMultipleSpaces(text)
    return lemmatizeTweet(text)


_stop_set = set(stops)   # set membership instead of scanning the list per token
_posting_seen = {}       # word -> set of doc ids already in index[word]

for line in final:
    doc_id = int(line[0])
    for word in _tokenizeDocument(line[1]):
        if word in _stop_set:
            continue
        if word not in count:
            # First time this word is seen anywhere in the corpus.
            uniq_words.append(word)
            count[word] = 1
            index[word] = []
            _posting_seen[word] = set()
        else:
            count[word] += 1
        # Record each document at most once per word. The check is on the
        # document id (the original compared the word itself against the
        # list of ids, which never matched and produced duplicates).
        if doc_id not in _posting_seen[word]:
            _posting_seen[word].add(doc_id)
            index[word].append(doc_id)


In [0]:
# Persist the inverted index, the word counts and the vocabulary to Drive.
# `with` guarantees each file is closed even if pickling fails.
with open("/content/gdrive/My Drive/IR Assignment/inverted_index.pickle","wb") as pickle_out:
    pickle.dump(index, pickle_out)

# The word-count file must contain `count` (it previously received a second
# copy of `index`).
with open("/content/gdrive/My Drive/IR Assignment/count_word.pickle","wb") as pickle_out:
    pickle.dump(count, pickle_out)

np.save("/content/gdrive/My Drive/IR Assignment/uniq_words.npy",uniq_words)