# Importing the Data

In [2]:
import pandas as pd

In [4]:
valid_df = pd.read_csv("valid.tsv", sep='\t', names=["Id", "Text", "Label"])
train_df = pd.read_csv("train.tsv", sep='\t', names=["Id", "Text", "Label"])
test_df = pd.read_csv("test.tsv", sep='\t', names=["Id", "Text", "Label"])
noisy_df = pd.read_csv("unlabeled_test_with_noise.tsv", sep='\t', names=["Id", "Text", "Label"])

In [5]:
train_df

Unnamed: 0,Id,Text,Label
0,Id,Text,Label
1,1241490299215634434,Official death toll from #covid19 in the Unite...,INFORMATIVE
2,1245916400981381130,"Dearest Mr. President @USER 1,169 coronavirus ...",INFORMATIVE
3,1241132432402849793,Latest Updates March 20 ⚠️5274 new cases and 3...,INFORMATIVE
4,1236107253666607104,真把公主不当干部 BREAKING: 21 people on Grand Princess...,INFORMATIVE
...,...,...,...
6932,1241325232415105025,.@USER makes major announcement in view of #co...,UNINFORMATIVE
6933,1235624084089778176,❌QUESTION ... did they receive a Ventilator wh...,INFORMATIVE
6934,1246018213995044870,CMT will air a special celebrating the life&am...,UNINFORMATIVE
6935,1239750367329439744,Current 🇮🇩 COVID19 testing procedure only test...,UNINFORMATIVE


# Some data processing

In [6]:
# Map the string label to binary values of 0 and 1
label_mapping = {"UNINFORMATIVE": 0, "INFORMATIVE": 1}

In [7]:
# apply the label mapping to all the datasets
valid_df["Label"] = valid_df["Label"].map(label_mapping)
noisy_df["Label"] = noisy_df["Label"].map(label_mapping)
train_df["Label"] = train_df["Label"].map(label_mapping)
test_df["Label"] = test_df["Label"].map(label_mapping)

In [8]:
def simple_tokenizer(text):
  """
  Simple tokenizer that lowercases and splits on whitespace.
  """
  return text.lower().split()

In [9]:
def build_vocab(texts, min_freq=1):
  """
  Build vocabulary on a list/series of text
  This allows us to convert text into a series of numbers
  This is an easier data that can be read by the model for patterns
  """
  vocab = {"<pad>": 0, "<unk>": 1}
  word_counts = {}
  for text in texts:
      tokens = simple_tokenizer(text)
      for token in tokens:
          word_counts[token] = word_counts.get(token, 0) + 1
  for token, count in word_counts.items():
      if count >= min_freq and token not in vocab:
          vocab[token] = len(vocab)
  return vocab

In [10]:
example_text = "The cat sat on a cat that sat on the fish"
example_usage = build_vocab(example_text)
example_usage

{'<pad>': 0,
 '<unk>': 1,
 't': 2,
 'h': 3,
 'e': 4,
 'c': 5,
 'a': 6,
 's': 7,
 'o': 8,
 'n': 9,
 'f': 10,
 'i': 11}

In [11]:
# Build vocabulary based on training texts
vocab = build_vocab(train_df["Text"])

# Create the Datasets

In [13]:
"""
Here we need a dataset class and a dataloader to load the data in batches.
This we see often times in our homework assignments.

"""

'\nHere we need a dataset class and a dataloader to load the data in batches.\nThis we see often times in our homework assignments.\n\n'