In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords as sw
from nltk import word_tokenize, pos_tag

In [2]:
%%capture
# Fetch the NLTK resources this notebook relies on:
#   punkt                      -> tokenizer models used by word_tokenize
#   averaged_perceptron_tagger -> model used by pos_tag
#   stopwords                  -> word lists read through nltk.corpus.stopwords (imported as `sw`)
# Without the stopwords corpus, sw.words(...) raises LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [82]:
def load_json_to_csv(json_path="datasets/gg2015.json", csv_path="datasets/dataset1.csv"):
    """Convert a tweet JSON dump into a two-column CSV ordered by time.

    Reads ``json_path`` with ``pd.read_json``, keeps only the ``text`` and
    ``timestamp_ms`` columns, renames ``timestamp_ms`` to ``time``, sorts the
    rows by ``time`` in ascending order and writes the result to ``csv_path``
    without the index column.

    Parameters
    ----------
    json_path : str or path-like, optional
        Location of the input JSON file. Defaults to the path that was
        previously hard-coded, so existing zero-argument calls are unchanged.
    csv_path : str or path-like, optional
        Location of the CSV file to write. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    None
        The function is used for its side effect of writing ``csv_path``.

    Raises
    ------
    KeyError
        If the JSON data lacks a ``text`` or ``timestamp_ms`` column.
    """
    tweets = pd.read_json(json_path)[["text", "timestamp_ms"]]
    tweets = tweets.rename(columns={"timestamp_ms": "time"})
    # mergesort is stable: rows sharing a timestamp keep their input order, so
    # the CSV row order is deterministic from run to run.
    tweets = tweets.sort_values(by="time", ascending=True, kind="mergesort")
    tweets.to_csv(csv_path, index=False)
     

In [83]:

# Initial cleaning of the tweets/sentences in five steps:
# 1) Extract the sentence if it's a normal tweet. If it's not a simple tweet, i.e., it's a retweet of a
#    tweet, then extract the tweet within that retweet.
# 2) Remove links
# 3) Remove emoticon punctuation characters (; : < > ^ / | ? * ( ))
# 4) Remove non-ASCII (unicode) characters
# 5) Remove apostrophe-s ('s)
# Patterns used by preprocess_1, written as raw strings so the regex escapes
# are not interpreted (and warned about) as Python string escapes.
# DOTALL lets the retweet body span several lines instead of stopping at the
# first newline.
_RETWEET_RE = re.compile(r"(RT @\w+:)(?P<rt>.+)", re.DOTALL)
_LINK_RE = re.compile(r"(?:http:|https:)[a-zA-Z0-9._/]+")
_EMOTICON_CHARS_RE = re.compile(r"[;:<>^/|?*)(]+")
# Straight or curly apostrophe followed by a word-final "s".
_POSSESSIVE_RE = re.compile(r"['\u2019]s\b")


def preprocess_1(x):
    """Apply the first, character-level cleaning pass to one tweet.

    Steps, in order:

    1. If the text starts with ``RT @<user>:``, keep only what follows that
       prefix (the retweeted text, including any leading space).
    2. Replace every ``http:``/``https:`` link with a single space.
    3. Delete the punctuation characters ``; : < > ^ / | ? * ( )``.
    4. Delete a possessive ``'s`` (straight or curly apostrophe) at the end of
       a word.
    5. Drop every non-ASCII character.

    Parameters
    ----------
    x : str
        Raw tweet text. Any non-string value (for example the float NaN that
        pandas produces for a missing CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text; ``""`` for non-string input.
    """
    if not isinstance(x, str):
        return ""

    retweet = _RETWEET_RE.match(x)
    if retweet is not None:
        x = retweet.group("rt")

    # Substitute with the link pattern itself. Re-using each matched URL as a
    # new regex would let "." act as a wildcard and would cut a longer URL
    # short whenever a shorter URL is its prefix.
    x = _LINK_RE.sub(" ", x)

    # Must run after link removal: it deletes ":" and "/", which the link
    # pattern needs in order to recognise a URL.
    x = _EMOTICON_CHARS_RE.sub("", x)

    # Must run before the ASCII filter, otherwise a curly apostrophe is dropped
    # first and its "s" stays glued to the word.
    x = _POSSESSIVE_RE.sub("", x)

    x = x.encode("ascii", "ignore").decode()
    return x




In [84]:
# tokenize with TweetTokenizer to separate hashtags safely
# hashtags --> information about the topic of tweet

def preprocess_2(texts):
    """Tokenize tweets and split each one into plain text and hashtag topics.

    Every tweet is tokenized with ``TweetTokenizer``. Tokens containing ``#``
    become topics (``#`` removed, lower-cased). The remaining tokens form the
    text, except stopwords and tokens made up solely of punctuation
    characters. Tweets whose text has fewer than two tokens are dropped
    together with their topics.

    Parameters
    ----------
    texts : iterable of str
        Tweet texts to process.

    Returns
    -------
    pandas.DataFrame
        Column ``text`` holds the kept tokens joined by single spaces; column
        ``topic`` holds the list of hashtags found in the same tweet.
    """
    tweet_tokenizer = TweetTokenizer()
    punctuation = set(string.punctuation)
    # Only the first 20 English stopwords plus every Spanish stopword are used.
    # NOTE(review): the reason for the 20-word cut-off is not visible here; it
    # is kept unchanged.
    stopwords = set(list(sw.words("english"))[:20] + list(sw.words("spanish")))

    def separate_hashtags(sent):
        """Split one tokenized tweet into (text tokens, hashtag topics)."""
        hashtags, text = [], []
        for word in sent:
            if "#" in word:
                tag = word.replace("#", "").lower()
                # A bare "#" token would otherwise be recorded as an empty topic.
                if tag:
                    hashtags.append(tag)
                continue
            if word.lower() in stopwords:
                continue
            # Per-character test: `word in string.punctuation` is a substring
            # check, so tokens such as "..." or "!!" were not recognised as
            # punctuation.
            if all(char in punctuation for char in word):
                continue
            text.append(word)
        return text, hashtags

    list_text, list_hashtags = [], []
    for text in texts:
        words, hashtags = separate_hashtags(tweet_tokenizer.tokenize(text))
        if len(words) > 1:
            list_text.append(" ".join(words))
            list_hashtags.append(hashtags)

    df_nltk = pd.DataFrame({"text": list_text, "topic": list_hashtags})
    return df_nltk

In [85]:
def preprocess():
    """Run the whole preprocessing pipeline from the raw JSON to dataset2.csv.

    Steps:

    1. ``load_json_to_csv()`` writes ``datasets/dataset1.csv``.
    2. That CSV is read back and ``preprocess_1`` is applied to each ``text``
       value.
    3. Rows whose cleaned text duplicates an earlier row are dropped.
    4. ``preprocess_2`` builds the text/topic frame, which is written to
       ``datasets/dataset2.csv`` without the index column.

    Progress messages are printed along the way. Returns ``None``.
    """
    load_json_to_csv()
    print("..Converted json to csv.")
    # keep_default_na=False: by default pandas parses cells such as "NA",
    # "null", "nan" or an empty string as float NaN, which would turn a tweet
    # consisting of exactly that text into a non-string value.
    df = pd.read_csv(
        "datasets/dataset1.csv",
        keep_default_na=False,
        dtype={"text": str},
    )
    df["text"] = df["text"].apply(preprocess_1)
    print("Preprocessing 1...")
    df = df.drop_duplicates(subset=["text"], keep="first")
    df_pp2 = preprocess_2(df["text"].tolist())
    print("Preprocessing 2...")
    df_pp2.to_csv("datasets/dataset2.csv", index=False)
    print("Finished preprocessing.")
