In [12]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords as sw
from nltk import word_tokenize, pos_tag
from urllib.request import urlopen

In [13]:
%%capture
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [14]:
def load_json_to_csv(year):
    """Load one year's tweets and return their text in chronological order.

    Reads ``datasets/gg{year}.json``, orders the rows by ascending
    ``timestamp_ms`` and returns a new single-column DataFrame holding only
    the ``text`` column. The original row index is kept (not reset).
    """
    ordered = pd.read_json(f"datasets/gg{year}.json").sort_values(
        by=["timestamp_ms"], ascending=True
    )
    # Double brackets keep the result a DataFrame; copy() detaches it from
    # the full frame so later column assignment is safe.
    return ordered[["text"]].copy()
     

In [15]:
def filter_language(df, spanish_sw, english_sw):
    """Build a keep/drop mask that rejects tweets looking more Spanish than English.

    For every entry of ``df["text"]`` the lower-cased, whitespace-split words
    are compared against both stopword collections. The mask entry is
    ``False`` only when strictly more distinct Spanish stopwords than English
    stopwords occur; ties (including no hits at all) are kept as ``True``.

    Returns a plain list of booleans, one per row, in row order.
    """
    def keep(sentence):
        words = set(sentence.lower().split())
        spanish_hits = len(words.intersection(spanish_sw))
        english_hits = len(words.intersection(english_sw))
        return spanish_hits <= english_hits

    return [keep(sentence) for sentence in df["text"]]
     

In [22]:

# Initial cleaning of a tweet/sentence in five steps:
# 1) If the tweet is a retweet, extract the tweet quoted inside it;
#    otherwise keep the tweet as it is.
# 2) Remove links
# 3) Remove emoticon characters
# 4) Remove non-ASCII (unicode) characters
# 5) Remove apostrophe-s ('s)
def preprocess_1(x):
    """Apply the first, character-level cleaning pass to a single tweet.

    Steps, in order:
      1. If the tweet starts with ``RT @user:``, keep only the text after
         that prefix; otherwise keep the tweet unchanged.
      2. Replace every ``http:`` / ``https:`` link with a single space.
      3. Delete runs of the characters ``; : . = < > ^ / | ? * ) (``
         (the building blocks of text emoticons).
      4. Drop every non-ASCII character.
      5. Delete every occurrence of ``'s``.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not normalized, so leading spaces and
        runs of spaces left behind by the removals are preserved.
    """
    def fetch_rt(x):
        # DOTALL lets ".+" run across line breaks, so a multi-line retweet is
        # kept whole instead of being cut at its first newline.
        retweet = re.match(r"(RT @\w+:)(?P<rt>.+)", x, flags=re.DOTALL)
        return x if retweet is None else retweet.group("rt")

    def remove_links(x):
        # Substitute with the link pattern itself. Feeding each found link
        # back into re.sub as a pattern treats "." as a wildcard and, when one
        # link is a prefix of another, leaves the tail of the longer link.
        return re.sub(r"(http:|https:)[a-zA-Z0-9._/]+", " ", x)

    def remove_emojis(x):
        return re.sub(r"[;:.=<>^/|?*)(]+", "", x)

    def remove_unicode(x):
        return x.encode("ascii", "ignore").decode()

    def remove_apostrophe(x):
        return x.replace("'s", "")

    x = fetch_rt(x)
    x = remove_links(x)
    x = remove_emojis(x)
    x = remove_unicode(x)
    x = remove_apostrophe(x)

    return x




In [23]:
# tokenize with TweetTokenizer to separate hashtags safely
# hashtags --> information about the topic of tweet

def preprocess_2(texts, english_sw):
    """Tokenize tweets, split hashtags from running text and drop stopwords.

    Each text is tokenized with ``TweetTokenizer``. Tokens containing ``#``
    are collected (lower-cased, ``#`` removed) as the tweet's topics. Every
    other token is kept as text unless it contains ``@``, is found in
    ``string.punctuation`` or, lower-cased, is a stopword. Tweets left with
    fewer than two text tokens are discarded.

    Parameters
    ----------
    texts : iterable of str
        Cleaned tweet texts.
    english_sw : iterable of str
        Words to filter out of the text.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` (space-joined kept tokens) and ``topic``
        (comma-joined hashtags), one row per retained tweet.
    """
    tokenizer = TweetTokenizer()
    blocked = set(english_sw)
    kept_text, kept_topics = [], []

    for raw in texts:
        words, tags = [], []
        for token in tokenizer.tokenize(raw):
            lowered = token.lower()
            if "#" in token:
                tags.append(lowered.replace("#", ""))
                # NOTE(review): the token is stored with its '#', and tokens
                # containing '#' never reach the text branch, so this addition
                # does not affect the output. Presumably the hashtag without
                # '#' was meant to become a stopword; confirm intent.
                blocked.add(lowered)
                continue
            if "@" in token or token in string.punctuation or lowered in blocked:
                continue
            words.append(token)

        if len(words) > 1:
            kept_text.append(" ".join(words))
            kept_topics.append(",".join(tags))

    return pd.DataFrame({"text": kept_text, "topic": kept_topics})

In [24]:
def preprocess(year):
    """Run the full preprocessing pipeline for one year's tweets.

    Stages:
      1. Load ``datasets/gg{year}.json`` through ``load_json_to_csv``.
      2. Clean each tweet with ``preprocess_1`` and drop duplicate texts.
      3. Drop tweets that ``filter_language`` judges more Spanish than English.
      4. Tokenize, separate hashtags and remove stopwords plus a downloaded
         profanity list with ``preprocess_2``.
      5. Write the result to ``datasets/dataset2.csv`` (the same path
         whatever ``year`` is).

    Requires network access for the profanity list. Progress messages are
    printed along the way. Returns ``None``.
    """
    # load raw json into csv dataset
    df = load_json_to_csv(year)
    print("..Converted json to csv.")

    # stopwords
    spanish_sw = sw.words("spanish")
    english_sw = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y']

    # preprocessing 1
    df["text"] = df["text"].apply(preprocess_1)
    df = df.drop_duplicates(subset=["text"], keep='first', inplace=False, ignore_index=False)
    print("Preprocessing 1...")

    # language filtering
    boolean = filter_language(df, spanish_sw, english_sw)
    print(boolean.count(False), boolean.count(True))
    df_filtered = df[boolean]
    print("Filtered english tweets..")

    # preprocessing 2
    # The context manager closes the HTTP response once the list is read.
    # strip() also removes a trailing '\r' (a word carrying one would never
    # match a token), and blank lines are skipped.
    with urlopen("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt") as response:
        decoded = (line.decode("utf-8").strip() for line in response)
        profanities = [word for word in decoded if word]

    df_pp2 = preprocess_2(list(df_filtered["text"]), english_sw + profanities)
    print("Preprocessing 2...")

    df_pp2.to_csv("datasets/dataset2.csv", index=False)
    print("Finished preprocessing.")


In [25]:
# Run the whole pipeline on datasets/gg2013.json; the cleaned tweets are
# written to datasets/dataset2.csv.
preprocess(2013)

..Converted json to csv.
Preprocessing 1...
19300 107225
Filtered english tweets..
Preprocessing 2...
Finished preprocessing.
