In this notebook, we will use the American-British-variety classifier (https://github.com/macocu/American-British-variety-classifier) to analyse which English variety is more frequent in the dataset.

To use the classifier in Kaggle, we need to upload the lexicon.pickle file from the GitHub repository (https://github.com/macocu/American-British-variety-classifier/blob/main/lexicon.pickle) and copy the main code from https://github.com/macocu/American-British-variety-classifier/blob/main/ABClf.py

In [2]:
# Install the third-party libraries that are imported in the next cell
!pip install parse
!pip install transliterate



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import logging
from parse import compile
from transliterate import translit
from typing import Set, List, Dict

In [12]:
# Copy the code from the ABClf.py
# in https://github.com/macocu/American-British-variety-classifier

# Characters that preprocess() deletes from a text before it is split into
# tokens: ASCII punctuation, typographic quotes and dashes, assorted symbols
# and a few emoji. Note that the ASCII apostrophe (') is not listed, so it is
# left in the text.
chars_to_remove = {
    '!',
    '"',
    '#',
    '%',
    '&',
    '(',
    ')',
    '*',
    '+',
    ',',
    '-',
    '.',
    '/',
    ':',
    ';',
    '<',
    '=',
    '>',
    '?',
    '[',
    ']',
    '_',
    '`',
    '«',
    '°',
    '²',
    '³',
    'µ',
    '·',
    '»',
    '½',
    '‑',
    '–',
    '‘',
    '’',
    '“',
    '”',
    '„',
    '•',
    '…',
    '‰',
    '″',
    '₂',
    '₃',
    '€',
    '™',
    '→',
    '−',
    '∕',
    '😀',
    '😉',
    '🙁',
    '🙂'

}


import re

# Compiled once at definition time instead of on every call.
# The character class is lowercase ASCII letters plus š, đ, č, ć and ž.
_ALPHA_TOKEN_RE = re.compile(r"[a-zšđčćž]+")


def is_alpha(token: str) -> bool:
    """Checks if the input string is strictly lowercase without numerals.
    The whole token must consist of one or more characters from
    a-z, š, đ, č, ć, ž; the empty string is rejected.
    Args:
        token (str): Input text.
    Returns:
        bool: Result of checking.
    """
    # fullmatch (rather than match with a trailing "$") also rejects a token
    # that ends in a newline, which "$" would silently accept.
    return _ALPHA_TOKEN_RE.fullmatch(token) is not None

def preprocess(s: str) -> str:
    """Strips the characters listed in chars_to_remove and casefolds the rest.
    Args:
        s (str): input string.
    Returns:
        str: the input without the unwanted characters, casefolded.
    """
    # Build a deletion table from the module-level character set and apply it
    # in a single pass over the string.
    deletion_table = str.maketrans("", "", "".join(chars_to_remove))
    return s.translate(deletion_table).casefold()


def count_variants(s: str, lex: dict):
    """Tallies the variant-specific words found in a text.
    The text is preprocessed and split on whitespace; only tokens that pass
    is_alpha and have a truthy entry in the lexicon are counted.
    The first returned dictionary holds the totals per variant, e.g.
        {"A": 3, "B": 1}
    (a variant that was never seen has no key). The second one describes
    every detected word:
        {"word1":
            {"variant": "A", "count": 3}
        }
    Args:
        s (str): Input string.
        lex (dict): Lexicon mapping a word to its variant.
    Returns:
        results (tuple): (counts, per_token).
    """
    totals = {}
    breakdown = {}
    tokens = preprocess(s).split()
    for word in tokens:
        if is_alpha(word):
            variant = lex.get(word)
            if variant:
                logging.debug(f"Found word {word}, presumed variant: {variant}.")
                totals[variant] = totals.get(variant, 0) + 1
                entry = breakdown.get(word)
                if entry is None:
                    # First occurrence of this word in the text.
                    breakdown[word] = {"variant": variant, "count": 1}
                else:
                    entry["count"] += 1
    return totals, breakdown



def counts_to_category(counts: dict) -> str:
    """Maps variant counts such as {"A": 2, "B": 0} to a single label.
    A variant is chosen only when its count is more than twice the count of
    the other one. Missing keys are treated as 0.
    Args:
        counts (dict): counts as returned by count_variants.
    Returns:
        str: "A", "B", "UNK" (both counts are 0) or "MIX" (anything else).
    """
    american = counts.get("A", 0)
    british = counts.get("B", 0)

    # No variant-specific word was detected at all.
    if american == 0 and british == 0:
        return "UNK"
    if american > 2 * british:
        return "A"
    if british > 2 * american:
        return "B"
    return "MIX"


def load_lexicon(balanced=False) -> dict:
    """Loads the pickled lexicon from the module-level lexicon_path.
    Args:
        balanced (bool, optional):
        Described upstream as selecting a balanced lexicon (equal number of
        A and B keys). This implementation accepts the argument but does
        not use it. Defaults to False.
    Returns:
        dict: lexicon for variety identification.
    """
    # NOTE: unpickling can execute arbitrary code; only load a lexicon file
    # obtained from a trusted source.
    with open(lexicon_path, "rb") as lexicon_file:
        return pickle.load(lexicon_file)

def get_variant(text: str, lex=None) -> str:
    """Quick way to classify text.
    Counts the variant-specific words in the text with count_variants and
    turns the counts into a category with counts_to_category.
    Args:
        text (str): input string.
        lex (dict, optional): lexicon mapping words to their variant. When
            omitted (None), the lexicon is loaded with load_lexicon().
    Returns:
        str: category {'A', 'B', 'UNK', 'MIX'}
    """
    # Compare against None instead of relying on truthiness, so that an
    # explicitly supplied (even empty) lexicon is used as given rather than
    # being replaced by the one loaded from disk.
    if lex is None:
        lex = load_lexicon()
    counts, _ = count_variants(text, lex)
    return counts_to_category(counts)

In [None]:
# Define the path to the lexicon.pickle
# load_lexicon() reads this module-level variable at call time, so this cell
# must be run before load_lexicon() (or get_variant() without a lexicon) is called.
lexicon_path = "/kaggle/input/english-variety-classifier-lexicon/lexicon.pickle"

In [9]:
# Import the dataset you want to analyse. If the dataset is not in a format
# that can be opened with pandas, transform it into a suitable format
# and open it in a pandas DataFrame. The text should be in a column named 'text',
# because the following cells read it as df.text.

df = pd.read_csv("/kaggle/input/corewholedataset/cleaned_CORE_corpora.csv")

# Preview the first rows of the loaded data
df.head()

In [13]:
# In the next steps, variety will be identified and various relevant information
# will be added to the dataframe.

lex = load_lexicon()

# Run the detector once per text. count_variants returns a
# (counts, per_token_breakdown) tuple, so a single pass supplies both columns
# below instead of preprocessing and scanning every text twice.
detections = [count_variants(text, lex) for text in df.text]

# Count how many words, detected in the text, are British, and how many American
df["variant_detector_count"] = [counts for counts, _ in detections]

# Provide more information, write out which words were detected
df["variant_detector_breakdown"] = [per_token for _, per_token in detections]

# Count number of all (whitespace-separated) words in the text
df["words"] = df.text.apply(lambda t: len(t.split()))

# Calculate the difference between no. of American (A) words and no. of
# British (B) words: positive means more American words, negative more British.
df["A_B"] = df.variant_detector_count.apply(lambda d: d.get("A", 0) - d.get("B", 0))

# Classify text as British (B), American (A),
# Mixed (MIX) or Unknown (UNK - contains no variety-specific word).
# A variant is assigned only if it has more than twice as many words as the
# other variant; otherwise the text is classified as "MIX".
df["variant"] = df.variant_detector_count.apply(counts_to_category)

In [18]:
# See the first rows of the resulting DataFrame, which now also holds the
# columns variant_detector_count, variant_detector_breakdown, words, A_B and variant
df.head()

In [22]:
# See the distribution of variety classes in the dataset:
# number of texts per class (A, B, MIX, UNK)
df.variant.value_counts()

In [24]:
# Let's see the share of each class as a percentage: normalize=True gives
# relative frequencies, which are scaled to 0-100, rounded to one decimal
# place and turned into strings with a trailing '%'
df.variant.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

In [31]:
# Print the table of class counts in Markdown format.
# NOTE(review): to_markdown relies on the optional 'tabulate' package, which
# is not installed by this notebook - confirm it is available in the environment.
print(df.variant.value_counts().to_markdown())
