# **Task 1**

## Import libraries and load data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
from cleantext import clean
import matplotlib.pyplot as plt
import nltk
#nltk.download('all')
from nltk.probability import FreqDist
import ast

In [2]:
# Read the tab-separated file "test.tsv", which is treated as having no header
# row. Only the columns at positions 1 and 2 are kept, and they are named
# 'type' and 'content'.
liar_test_data = pd.read_csv("test.tsv", header = None, sep='\t', usecols=[1,2], names=['type', 'content'])

In [3]:
# Load data as data frame
# corpusSample = pd.read_csv("250_rows.csv")

## Clean ```content``` variable

In [4]:
# clean_text() normalises one raw text string:
#   - numeric dates (D/M/Y or D-M-Y, with a 2- or 4-digit year) become "<date>"
#   - all words are lowercased
#   - tabs, new lines and multiple white spaces are set to a single white space
#   - numbers, emails and URLs are replaced by the "<NUM>", "<EMAIL>" and
#     "<URL>" tokens; clean() is called with lower=True and the tokenizer
#     set up later in this notebook matches them as "<num>", "<email>", "<url>"
def clean_text(data):
    """Return ``data`` with dates, numbers, emails and URLs replaced by tokens.

    ``data`` must be a string: it is passed to ``re.sub`` before anything else.
    The returned value is whatever ``clean()`` produces for the date-replaced
    text.
    """
    # Date patterns, applied one after another in exactly this order. For each
    # separator the 4-digit-year form is tried before the 2-digit-year form.
    date_patterns = (
        r"[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}",  # DD/MM/YYYY or MM/DD/YYYY
        r"[0-9]{1,2}/[0-9]{1,2}/[0-9]{2}",  # DD/MM/YY or MM/DD/YY
        r"[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}",  # DD-MM-YYYY or MM-DD-YYYY
        r"[0-9]{1,2}-[0-9]{1,2}-[0-9]{2}",  # DD-MM-YY or MM-DD-YY
    )
    for pattern in date_patterns:
        data = re.sub(pattern, "<date>", data)

    # Consider adding other date formats, like "Sept 6", "September 6, 2019", etc.

    # Everything else is delegated to clean(); the options are collected in a
    # dict so each one can carry its own explanation.
    cleaning_options = dict(
        fix_unicode=False,            # do not try to repair unicode errors
        to_ascii=False,               # do not transliterate to ASCII
        lower=True,                   # lowercase the text
        no_line_breaks=True,          # fully strip line breaks
        no_urls=True,                 # replace URLs with a token
        no_emails=True,               # replace email addresses with a token
        no_phone_numbers=False,       # leave phone numbers alone
        no_numbers=True,              # replace numbers with a token
        no_digits=False,              # do not replace individual digits
        no_currency_symbols=False,    # leave currency symbols alone
        no_punct=False,               # keep punctuation
        replace_with_punct="",
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUM>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                    # set to 'de' for German special handling
    )
    return clean(data, **cleaning_options)

In [5]:
# Run clean_text() on every entry of the 'content' column; the result is a
# new Series and liar_test_data itself is not modified here.
liar_test_data_cleaned = liar_test_data['content'].apply(clean_text) 

## Tokenize ```content``` variable

In [6]:
# Import libraries
from nltk.tokenize import word_tokenize 
from nltk.tokenize import MWETokenizer 

# word_tokenize() would otherwise leave a placeholder such as "<num>" as the
# three tokens "<", "num" and ">". This tokenizer joins those pieces back
# together (with an empty separator) so that "<num>", "<date>", "<email>" and
# "<url>" each stay a single token.
placeholder_names = ('num', 'date', 'email', 'url')
multiWordsTokenizer = MWETokenizer(
    [('<', name, '>') for name in placeholder_names],
    separator='',
)

In [7]:
# Function that tokenizes a string
def tokenize(data_string):
    """Return the list of tokens in ``data_string``.

    The string is first split with ``word_tokenize``; the resulting token
    list is then passed through ``multiWordsTokenizer`` so that the pieces
    '<', 'num', '>' are merged back into '<num>' (same for <date>, <email>
    and <url>).
    """
    word_tokens = word_tokenize(data_string)
    return multiWordsTokenizer.tokenize(word_tokens)

In [8]:
# Tokenize every cleaned text; each entry becomes the token list returned by
# tokenize().
liar_test_data_tokenized = liar_test_data_cleaned.apply(tokenize)

## Remove stop words

In [9]:
# Import libraries
from nltk.corpus import stopwords
# NLTK's English stop-word list, stored as a set because removeStopWords()
# does one membership test per token.
stop_words = set(stopwords.words('english'))

In [10]:
# Function that removes stop words from a list of tokens
def removeStopWords(words):
    """Return a new list with the tokens of ``words`` that are not stop words.

    Tokens are compared against the module-level ``stop_words`` collection
    exactly as given (no case folding); the order of the kept tokens is
    unchanged.
    """
    return [token for token in words if token not in stop_words]

In [11]:
# Remove stop words from every token list produced by the tokenization step
liar_test_data_no_stop_words = liar_test_data_tokenized.apply(removeStopWords)

## Perform stemming on ```content``` variable

In [12]:
# Import libraries
from nltk.stem import PorterStemmer
# One PorterStemmer instance, created once and reused by stemming() for
# every token.
stemmer = PorterStemmer()

In [13]:
# Function that performs stemming on a list of tokens
def stemming(words):
    """Return a new list holding the stem of each token in ``words``.

    Every token is passed to ``stemmer.stem`` (the module-level stemmer);
    the output has the same length and order as the input.
    """
    return [stemmer.stem(token) for token in words]

In [14]:
# Stem every token of the stop-word-filtered token lists
liar_test_data_stemmed = liar_test_data_no_stop_words.apply(stemming)

## Reduction rates

In [15]:
# Using FreqDist() we can see the vocabulary (the distinct tokens) as well as
# the frequency of each token. For each preprocessing stage the per-document
# token lists are flattened into one list, surrounding apostrophes are
# stripped from every token, and the result is counted.
tokens_after_tokenization = [
    token.strip("'") for tokens in liar_test_data_tokenized for token in tokens
]
tokens_after_tokenization_vocab = FreqDist(tokens_after_tokenization)

tokens_after_removing_stop_words = [
    token.strip("'") for tokens in liar_test_data_no_stop_words for token in tokens
]
tokens_after_removing_stop_words_vocab = FreqDist(tokens_after_removing_stop_words)

tokens_after_stemming = [
    token.strip("'") for tokens in liar_test_data_stemmed for token in tokens
]
tokens_after_stemming_vocab = FreqDist(tokens_after_stemming)

# Vocabulary size = number of distinct tokens at each stage.
vocab_size_tokenized = len(tokens_after_tokenization_vocab)
vocab_size_no_stop_words = len(tokens_after_removing_stop_words_vocab)
vocab_size_stemmed = len(tokens_after_stemming_vocab)

# Reduction rates in percent, each relative to the stage directly before it.
# They are computed up front rather than inside the f-strings: an expression
# spanning several lines inside a single-quoted f-string is a SyntaxError on
# Python versions before 3.12.
reduction_rate_stop_words = (
    (vocab_size_tokenized - vocab_size_no_stop_words) / vocab_size_tokenized * 100
)
reduction_rate_stemming = (
    (vocab_size_no_stop_words - vocab_size_stemmed) / vocab_size_no_stop_words * 100
)

print(f"Size of vocabulary after tokenization: {vocab_size_tokenized}\n")

print(f"Size of vocabulary after removal of stop words: {vocab_size_no_stop_words}\n")

print(f"Size of vocabulary after stemming: {vocab_size_stemmed}\n")

print(f"Reduction rate of the vocabulary size after removing stopwords: {reduction_rate_stop_words}\n")

print(f"Reduction rate of the vocabulary size after stemming: {reduction_rate_stemming}\n")

Size of vocabulary after tokenization: 4293

Size of vocabulary after removal of stop words: 4181

Size of vocabulary after stemming: 3244

Reduction rate of the vocabulary size after removing stopwords: 2.608898206382483

Reduction rate of the vocabulary size after stemming: 22.41090648170294



In [None]:
# Build the preprocessed data set: same columns as liar_test_data, but with
# 'content' replaced by the stemmed token lists.
# assign() returns a new DataFrame, so liar_test_data keeps its raw text.
# A plain `liar_preprocessed = liar_test_data` would only alias the frame,
# and overwriting 'content' through the alias would break re-running the
# cleaning cell, which expects strings.
liar_preprocessed = liar_test_data.assign(content=liar_test_data_stemmed)

# Write the result without the index column. Each token list is stored in
# the CSV as its string representation.
liar_preprocessed.to_csv("liar_test_data_preprocessed.csv", index=False)
