## Dataset Setup

In [None]:
def read_dataset(num_authors = 99):
    """Load raw documents and their author labels from the AMT corpus.

    Every entry under ``../data/corpora/amt/`` is treated as one author's
    folder; every file inside it that passes ``validate_file`` becomes one
    document labelled with the folder name.

    Args:
        num_authors: Maximum number of author folders to read.

    Returns:
        A pair ``(X, y)`` of parallel lists: document texts and author names.
    """
    X = []
    y = []

    data_root_dir = "../data/corpora/amt/"
    authors_to_ignore = []
    author_count = 0

    for author_name in os.listdir(data_root_dir):
        # Stop as soon as the requested number of authors has been read
        # (checking with ">" would let one extra author through).
        if author_count >= num_authors:
            break

        if author_name in authors_to_ignore:
            continue

        documents_path = data_root_dir + author_name + "/"
        # Stray files in the corpus root are not author folders.
        if not os.path.isdir(documents_path):
            continue
        author_count += 1

        for doc in os.listdir(documents_path):
            if validate_file(doc):
                # Undecodable bytes are dropped rather than aborting the load.
                with open(documents_path + doc, errors="ignore") as handle:
                    X.append(handle.read())
                y.append(author_name)

    return X, y


In [None]:
def validate_file(file_name):
    """Return True unless ``file_name`` contains one of the excluded keywords."""
    excluded_keywords = ("imitation", "demographics", "obfuscation", "verification")
    return not any(keyword in file_name for keyword in excluded_keywords)


## Feature Extraction

### Library Imports

In [None]:
import os
import nltk
import re
import spacy
from sortedcontainers import SortedDict
from keras.preprocessing import text
import numpy as np


### Count Characters

In [None]:
def CountChars(input):
    """Return the total number of characters in ``input``."""
    return len(input)


### Average Chars per Word

In [None]:
def averageCharacterPerWord(input):
    """Return the number of non-space characters per word in ``input``.

    Words come from ``text.text_to_word_sequence`` with the punctuation
    filter below; characters are counted after removing only spaces, so
    punctuation and newlines still count as characters.

    Returns:
        The ratio as a float, or 0.0 when no words are found (this used to
        raise ZeroDivisionError).
    """
    words = text.text_to_word_sequence(input,
                                       filters=' !#$%&()*+,-./:;<=>?@[\\]^_{|}~\t\n"',
                                       lower=False, split=" ")
    if not words:
        return 0.0

    num_chars = len(input.replace(" ", ""))
    return num_chars / len(words)


### Alphabet Frequency

In [None]:
def frequencyOfLetters(input):
    """Return the relative frequency of each letter a-z in ``input``.

    The text is lower-cased and spaces are removed before counting; every
    remaining character (letters, digits, punctuation, newlines) counts
    toward the denominator.

    Returns:
        A list of 26 floats in alphabetical order. All zeros when the text
        has no non-space characters.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    normalized = input.lower().replace(" ", "")
    num_chars = len(normalized)

    if num_chars == 0:
        return [0.0] * len(letters)

    # Iterate the string itself: "abc...".split() would give a single
    # 26-character item, not 26 letters.
    return [normalized.count(letter) / num_chars for letter in letters]


### Common Bigrams

In [None]:
def CommonLetterBigramFrequency(input):
    """Return the share of each common letter bigram among all bigram hits.

    The text is lower-cased and spaces are removed, so bigrams may span
    word boundaries. Each count is divided by the sum of all counts.

    Returns:
        A list of 39 floats, one per bigram in ``common_bigrams`` order.
        All zeros when none of the bigrams occur (this used to produce NaN
        values from a division by zero).
    """
    common_bigrams = ['th','he','in','er','an','re','nd',
                      'at','on','nt','ha','es','st','en',
                      'ed','to','it','ou','ea','hi','is',
                      'or','ti','as','te','et','ng','of',
                      'al','de','se','le','sa','si','ar',
                      've','ra','ld','ur']

    normalized = input.lower().replace(" ", "")
    bigram_counts = [normalized.count(bigram) for bigram in common_bigrams]

    total_bigram_count = sum(bigram_counts)
    if total_bigram_count == 0:
        return [0.0] * len(common_bigrams)

    return [count / total_bigram_count for count in bigram_counts]

### Common Trigrams

In [None]:
def CommonLetterTrigramFrequency(input):
    """Return the share of each common letter trigram among all trigram hits.

    The text is lower-cased and spaces are removed, so trigrams may span
    word boundaries. Each count is divided by the sum of all counts.

    Returns:
        A list of 20 floats, one per trigram in ``common_trigrams`` order.
        All zeros when none of the trigrams occur (this used to produce NaN
        values from a division by zero).
    """
    common_trigrams = ["the", "and", "ing", "her", "hat",
                       "his", "tha", "ere", "for", "ent",
                       "ion", "ter", "was", "you", "ith",
                       "ver", "all", "wit", "thi", "tio"]

    normalized = input.lower().replace(" ", "")
    trigram_counts = [normalized.count(trigram) for trigram in common_trigrams]

    total_trigram_count = sum(trigram_counts)
    if total_trigram_count == 0:
        return [0.0] * len(common_trigrams)

    return [count / total_trigram_count for count in trigram_counts]


### Percentage Digits

In [None]:
def digitsPercentage(input):
    """Return the fraction of characters in ``input`` that are numeric.

    A character counts as numeric when ``str.isnumeric`` is true for it.
    The denominator is the full length of ``input``, spaces included.

    Returns:
        A float in [0, 1]; 0.0 for empty input (this used to raise
        ZeroDivisionError).
    """
    if not input:
        return 0.0

    num_digits = sum(1 for each_char in input if each_char.isnumeric())
    return num_digits / len(input)


### Percentage Alphabets

In [None]:
def charactersPercentage(input):
    """Return the fraction of non-space characters that are letters a-z.

    The text is lower-cased and spaces are removed before counting.

    Returns:
        A float in [0, 1]; 0.0 when there are no non-space characters
        (this used to raise ZeroDivisionError).
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    normalized = input.lower().replace(" ", "")

    if not normalized:
        return 0.0

    letter_count = sum(1 for each_char in normalized if each_char in letters)
    return letter_count / len(normalized)


### Digits Frequency

In [None]:
def frequencyOfDigits(input):
    """Return the relative frequency of each digit 0-9 in ``input``.

    Spaces are removed before counting; every remaining character counts
    toward the denominator.

    Returns:
        A list of 10 floats ordered from '0' to '9'. All zeros when the
        text has no non-space characters.
    """
    digits = "0123456789"
    normalized = input.lower().replace(" ", "")
    num_chars = len(normalized)

    if num_chars == 0:
        return [0.0] * len(digits)

    # Iterate the string itself: "0123456789".split() would give a single
    # 10-character item, not 10 digits.
    return [normalized.count(digit) / num_chars for digit in digits]


### Percentage Upper Case Chars

In [None]:
def upperCaseCharactersPercentage(input):
    """Return the fraction of non-space characters that are A-Z.

    Only spaces are removed before counting, so punctuation, digits and
    newlines still count toward the denominator.

    Returns:
        A float in [0, 1]; 0.0 when there are no non-space characters
        (this used to raise ZeroDivisionError).
    """
    upper_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    stripped = input.replace(" ", "")

    if not stripped:
        return 0.0

    upper_count = sum(1 for each_char in stripped if each_char in upper_characters)
    return upper_count / len(stripped)


### Special Character Frequency

In [None]:
def frequencyOfSpecialCharacters(input, special_chars_file="static_files/writeprints_special_chars.txt"):
    """Return the relative frequency of each listed special character.

    The special characters are read from ``special_chars_file``, one entry
    per line with surrounding whitespace stripped. Each count is divided by
    the full length of ``input``.

    Args:
        input: Text to analyse.
        special_chars_file: Path of the special-character list. Defaults to
            the location the notebook has always used.

    Returns:
        A list of floats, one per non-blank line of the file, in file
        order. All zeros for empty input.
    """
    with open(special_chars_file, "r") as chars_file:
        # Blank lines are skipped: counting '' in a string returns
        # len(string) + 1, which is not a meaningful frequency.
        special_characters = [line.strip() for line in chars_file if line.strip()]

    num_chars = len(input)
    if num_chars == 0:
        return [0.0] * len(special_characters)

    return [input.count(special) / num_chars for special in special_characters]


### Short Words

In [None]:
def CountShortWords(input):
    """Return how many words in ``input`` are three characters or shorter."""
    tokens = text.text_to_word_sequence(input, filters=",.?!\"'`;:-()&$",
                                        lower=True, split=" ")
    return sum(1 for token in tokens if len(token) <= 3)


### Word Count

In [None]:
def CountWords(input):
    """Return the number of words found in ``input``."""
    tokens = text.text_to_word_sequence(input, lower=True, split=" ",
                                        filters=",.?!\"'`;:-()&$")
    word_total = len(tokens)
    return word_total


### Average Word Length

In [None]:
def averageWordLength(input):
    """Return the mean length of the words in ``input``.

    Words come from ``text.text_to_word_sequence`` with the same filter
    used by ``CountWords``.

    Returns:
        The mean word length as a float, or 0.0 when no words are found
        (a mean over an empty list would be NaN).
    """
    # The tokenizer must read the ``input`` parameter; ``inputText`` was an
    # undefined name.
    words = text.text_to_word_sequence(input, filters=",.?!\"'`;:-()&$",
                                       lower=True, split=" ")
    if not words:
        return 0.0

    return sum(len(word) for word in words) / len(words)


### Putting it Together

In [None]:
def calculate_features(input):
    """Build the flat stylometric feature vector for one document.

    Scalar features are appended and list-valued features are spliced in,
    so the result is a single flat list of numbers. Wrapping list-valued
    features in another list would nest them and make the vector ragged.

    Args:
        input: The document text.

    Returns:
        A flat list of numeric features in the fixed order shown below.
    """
    features = []

    features.append(CountWords(input))
    features.append(averageWordLength(input))
    features.append(CountShortWords(input))
    features.append(CountChars(input))
    features.append(averageCharacterPerWord(input))
    features.extend(frequencyOfLetters(input))
    features.extend(CommonLetterBigramFrequency(input))
    features.extend(CommonLetterTrigramFrequency(input))
    features.append(digitsPercentage(input))
    features.append(charactersPercentage(input))
    features.extend(frequencyOfDigits(input))
    features.append(upperCaseCharactersPercentage(input))
    features.extend(frequencyOfSpecialCharacters(input))
    return features


In [None]:
# Load the corpus and turn every document into its feature vector.
X_original, Y = read_dataset(num_authors = 6)
# Collect into X_Features, the name the training cell reads; the old loop
# appended to an undefined ``X``.
X_Features = [calculate_features(x) for x in X_original]


## Authorship Attributors

In [None]:
# Import Packages
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns

# Training and Test Datasets
# No random_state is passed, so the split differs from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X_Features, Y)

# Train the model
# No random_state is passed here either.
model = RandomForestClassifier(n_estimators = 100) 
model.fit(X_train, Y_train)

# Plot the confusion matrix
# Y_predicted is reused by the classification-report cell further down.
Y_predicted = model.predict(X_test)
confusion = confusion_matrix(Y_test, Y_predicted)
plt.figure(figsize = (10,8))
# fmt='d' renders each cell annotation as an integer count.
sns.heatmap(confusion, annot = True, 
            fmt = 'd', cmap="YlGnBu")


In [None]:
def calculate_accuracy(actual, predicted):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, aligned with ``actual``.

    Returns:
        A float in [0, 1]; 0.0 when ``actual`` is empty.
    """
    total_examples = len(actual)
    if total_examples == 0:
        return 0.0

    # Pairing with zip avoids the old index loop, which iterated ``idx``
    # but indexed with an undefined ``i``.
    correct_examples = sum(1 for truth, guess in zip(actual, predicted)
                           if truth == guess)
    return correct_examples / total_examples


In [None]:
from sklearn.metrics import classification_report
# classification_report returns a multi-line string. Print it so the table
# is laid out properly instead of being shown as an escaped repr (or
# discarded entirely when run as a script).
print(classification_report(Y_test, Y_predicted))


## Authorship Obfuscation

In [None]:
import nltk
import re
import random
import pickle
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
import WSD_with_UKB as wsd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer


### Contraction Replacement

In [None]:
def contraction_replacement(sentence, contractions=None):
    """Flip a sentence toward its less-used form: contracted or expanded.

    Counts how many known contractions and how many known expansions occur
    in the sentence (case-insensitive substring test). If contractions
    dominate they are replaced by their expansions; if expansions dominate
    they are replaced by their contractions; on a tie nothing changes.

    Args:
        sentence: The sentence to rewrite.
        contractions: Optional mapping of contraction -> expansion. When
            omitted it is loaded from ``contraction_extraction.pickle``;
            pass it in to avoid re-reading the file for every sentence.

    Returns:
        A pair ``(sentence, contractions_applied)`` where the flag is True
        if either replacement direction was taken.
    """
    if contractions is None:
        # NOTE: pickle.load can execute arbitrary code; only load this file
        # from a trusted source.
        CONTRACTION_FILE = 'contraction_extraction.pickle'
        with open(CONTRACTION_FILE, 'rb') as contraction_file:
            contractions = pickle.load(contraction_file)

    lowered_sentence = sentence.lower()
    contractions_count = sum(1 for contraction in contractions
                             if contraction.lower() in lowered_sentence)
    expansions_count = sum(1 for expansion in contractions.values()
                           if expansion.lower() in lowered_sentence)

    if contractions_count > expansions_count:
        # More contractions than expansions: expand them all.
        replacements = {k.lower(): v for k, v in contractions.items()}
    elif expansions_count > contractions_count:
        # More expansions than contractions: contract them all.
        replacements = {v.lower(): k for k, v in contractions.items()}
    else:
        # Tie: leave the sentence untouched.
        return sentence, False

    for source, target in replacements.items():
        # Re-check against the current sentence, which earlier replacements
        # may already have changed.
        if source in sentence.lower():
            case_insensitive = re.compile(re.escape(source), re.IGNORECASE)
            # A callable replacement inserts the text literally, so any
            # backslash in it is not parsed as a regex template escape.
            sentence = case_insensitive.sub(lambda _match, target=target: target,
                                            sentence)

    return sentence, True


### Parentheses Removal

In [None]:
def remove_parenthesis(sentence):
    """Return ``sentence`` with all round, curly and square brackets deleted."""
    bracket_table = str.maketrans("", "", "(){}[]")
    return sentence.translate(bracket_table)


### Discourse Marker Removal

In [None]:
def remove_discourse_markers(sentence, discourse_markers=None):
    """Delete discourse markers that occur as whole tokens in ``sentence``.

    A marker is removed only if it equals one of the whitespace-separated,
    lower-cased tokens of the original sentence. Removal is
    case-insensitive and limited to standalone occurrences, so a marker
    such as "so" no longer eats the middle of "also".

    Args:
        sentence: The sentence to clean.
        discourse_markers: Optional iterable of marker strings. When
            omitted it is loaded from ``discourse_markers.pkl``; pass it in
            to avoid re-reading the file for every sentence.

    Returns:
        The sentence with matching markers removed. Surrounding whitespace
        is left as is.
    """
    if discourse_markers is None:
        # NOTE: pickle.load can execute arbitrary code; only load this file
        # from a trusted source.
        DISCOURSE_FILE = 'discourse_markers.pkl'
        with open(DISCOURSE_FILE, 'rb') as discourse_file:
            discourse_markers = pickle.load(discourse_file)

    sent_tokens = set(sentence.lower().split())
    for marker in discourse_markers:
        lowered_marker = marker.lower()
        if lowered_marker in sent_tokens:
            # Lookarounds keep the match from starting or ending inside a
            # longer word.
            standalone = re.compile(r"(?<!\w)" + re.escape(lowered_marker) + r"(?!\w)",
                                    re.IGNORECASE)
            sentence = standalone.sub('', sentence)

    return sentence


### Apposition Removal

In [None]:
def remove_appositions(sentence):
    """Strip comma-delimited spans (with no ')' inside) from ``sentence``."""
    apposition_pattern = re.compile(r" ?\,[^)]+\,")
    return apposition_pattern.sub("", sentence)


### Possessive Transformation

In [None]:
def apply_possessive_transformation(text):
    """Randomly rewrite "X of Y" phrases as "Y's X".

    When the text contains at least one "<word> of <word>" phrase, a coin
    with a one-in-three chance decides whether every such phrase is
    rewritten. Otherwise the text is returned unchanged.

    Args:
        text: The sentence to transform.

    Returns:
        The possibly rewritten sentence.
    """
    of_phrase = r"(\w+) of (\w+)"
    # re.search finds the phrase anywhere in the sentence; re.match would
    # only accept it at the very start.
    if re.search(of_phrase, text) and random.choice([False, True, False]):
        return re.sub(of_phrase, r"\2's \1", text)
    return text


### Equation Transformation

In [None]:
def apply_equation_transformation(text):
    """Spell out arithmetic and comparison symbols inside the words of ``text``.

    The text is tokenized with ``RegexpTokenizer(r'\\w+')``, each symbol
    found inside a token is replaced by its spoken form, and the tokens are
    joined back together with a space after every token (including the
    last one).

    Args:
        text: The sentence to transform.

    Returns:
        The re-joined sentence.
    """
    # NOTE(review): r'\w+' tokens contain only word characters, so none of
    # the symbols below can survive tokenization; as written, the
    # replacement loop never fires and punctuation is dropped. Confirm the
    # intended tokenizer before relying on this step.
    words = RegexpTokenizer(r'\w+').tokenize(text)

    # Two-character operators come first so that '<=' is not consumed as
    # '<' followed by '='.
    symbol_to_text = {
        '<=': ' less than or equal to ',
        '>=': ' greater than or equal to ',
        '+': ' plus ',
        '-': ' minus ',
        '*': ' multiplied by ',
        '/': ' divided by ',
        '=': ' equals ',
        '>': ' greater than ',
        '<': ' less than ',
    }

    for position, word in enumerate(words):
        for symbol, spoken in symbol_to_text.items():
            if symbol in word:
                word = word.replace(symbol, spoken)
        words[position] = word

    # Keep the historical output shape: every token is followed by a space.
    return "".join(word + " " for word in words)


### Untokenization

In [None]:
def untokenize(words):
    """Join tokens into a sentence and undo tokenizer spacing artefacts.

    Applies, in order: quote and ellipsis fixes, bracket spacing,
    re-attachment of punctuation to the preceding word, apostrophe and
    "n't" fixes, and finally strips surrounding whitespace.
    """
    joined = ' '.join(words)

    leading_fixes = (
        ("`` ", '"'),
        (" ''", '"'),
        ('. . .', '...'),
        (" ( ", " ("),
        (" ) ", ") "),
    )
    for old, new in leading_fixes:
        joined = joined.replace(old, new)

    # Pull punctuation back onto the preceding word, mid-sentence and at
    # the end of the string.
    joined = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", joined)
    joined = re.sub(r' ([.,:;?!%]+)$', r"\1", joined)

    trailing_fixes = (
        (" '", "'"),
        (" n't", "n't"),
        ("can not", "cannot"),
        (" ` ", " '"),
    )
    for old, new in trailing_fixes:
        joined = joined.replace(old, new)

    return joined.strip()


### Synonym Substitution

In [None]:
def synonym_substitution(sentence, all_words):
    """Replace disambiguated words with a synonym not already in the document.

    Each ``(token, synset)`` pair returned by ``wsd.process_text`` is
    examined. When a synset id is present, the first WordNet lemma name
    whose lower-cased form is not in ``all_words`` replaces the token. The
    tokens are then re-joined with ``untokenize`` and the result is
    capitalized.

    Args:
        sentence: The sentence to rewrite.
        all_words: Collection of lower-cased words already used in the
            document; synonyms found in it are skipped.

    Returns:
        The rewritten sentence.
    """
    new_tokens = []

    for token, synset in wsd.process_text(sentence):
        if synset is not None:
            try:
                # The id is split on '-' into an offset and a POS tag.
                parts = synset.split('-')
                offset = int(parts[0])
                pos = parts[1]
                synonyms = wn.synset_from_pos_and_offset(pos, offset).lemma_names()
            except Exception:
                # Malformed or unknown synset id: keep the original token.
                # Skipping the iteration here would silently delete the
                # word from the sentence.
                synonyms = []

            for synonym in synonyms:
                if synonym.lower() not in all_words:
                    token = synonym
                    break

        new_tokens.append(token)

    final = untokenize(new_tokens)
    # NOTE(review): str.capitalize also lower-cases every character after
    # the first, so proper nouns and acronyms lose their capitals. Confirm
    # this is intended.
    final = final.capitalize()
    return final


### Putting it Together

In [None]:
def obfuscate_text(input_text):
    """Run the full obfuscation pipeline over every sentence of a document.

    The document is split into sentences and each one goes through, in
    order: contraction replacement, parenthesis removal, discourse-marker
    removal, apposition removal, synonym substitution, possessive
    transformation and equation transformation.

    Args:
        input_text: The document to obfuscate.

    Returns:
        The obfuscated sentences joined with single spaces.
    """
    obfuscated_text = []
    sentences = sent_tokenize(input_text)
    # Vocabulary of the whole document, used so synonyms are new words.
    tokens = set(nltk.word_tokenize(input_text.lower()))

    for sentence in sentences:
        # 1. Apply contractions. The helper loads its own resource, so only
        #    the sentence is passed; no ``contractions`` name exists here.
        sentence, contractions_applied = contraction_replacement(sentence)

        # 2. Remove parentheses
        sentence = remove_parenthesis(sentence)

        # 3. Remove discourse markers (the helper loads its own list).
        sentence = remove_discourse_markers(sentence)

        # 4. Remove appositions
        sentence = remove_appositions(sentence)

        # 5. Synonym substitution
        sentence = synonym_substitution(sentence, tokens)

        # 6. Apply possessive transformation
        sentence = apply_possessive_transformation(sentence)

        # 7. Apply equation transformation
        sentence = apply_equation_transformation(sentence)

        obfuscated_text.append(sentence)

    return " ".join(obfuscated_text)


### Testing Obfuscation

In [None]:
from sklearn.model_selection import train_test_split

# Load the raw documents and hold out a test split.
X, Y = read_dataset(num_authors = 6)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Featurize the training documents and fit the attributor on them.
X_train_features = [calculate_features(document) for document in X_train]
model = RandomForestClassifier(n_estimators = 100)
model.fit(X_train_features, Y_train)

# Obfuscate each held-out document, then featurize the obfuscated text.
X_test_obfuscated = [calculate_features(obfuscate_text(document))
                     for document in X_test]



In [None]:
# Calculate accuracy on original
# X_test holds raw text, but the model was trained on feature vectors, so
# the test documents are featurized before predicting.
X_test_features = [calculate_features(x) for x in X_test]
Y_pred_original = model.predict(X_test_features)
accuracy_orig = calculate_accuracy(Y_test, Y_pred_original)

# Calculate accuracy on obfuscated
Y_pred_obfuscated = model.predict(X_test_obfuscated)
accuracy_obf = calculate_accuracy(Y_test, Y_pred_obfuscated)
