In [None]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import os

import re
from collections import Counter
import liwc
import spacy
import spacy_transformers
import swifter
import string
import textstat

In [None]:
# Read the three base splits (train / validation / test) into DataFrames.
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        '../BaseDataset/train.csv',
        '../BaseDataset/val.csv',
        '../BaseDataset/test.csv',
    ),
)

In [None]:
# Uncomment the install command in the next cell the first time you run this notebook.

In [None]:
# %pip install en_core_web_md-3.7.1-py3-none-any.whl

In [None]:
# Load the spaCy English pipeline that the tokenizer and the feature extractor share.
# An earlier revision loaded "en_core_web_trf" here instead; swap the name to try it.
SPACY_MODEL_NAME = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Text clean-up applied before tokenizing, so that only words are counted.
# Numbers are removed because they carry no associated linguistic features.

# Sentence terminators that must survive the clean-up.
_CHARACTERS_TO_KEEP = '.?'

# Extra characters to strip, including typographic quotes that are not part of
# string.punctuation. Raw string: the plain literal contained the invalid
# escape sequence '\/', which triggers a SyntaxWarning on recent Python versions.
_ADDITIONAL_CHARACTERS_TO_REMOVE = r'‘’“”|@#$%^&*(;:),{<>}"[\/]+-=_~`'

# Deletion table built once at definition time instead of on every call.
_PREPROCESS_TRANSLATOR = str.maketrans(
    '',
    '',
    ''.join(sorted(set(string.punctuation) - set(_CHARACTERS_TO_KEEP)))
    + _ADDITIONAL_CHARACTERS_TO_REMOVE,
)

# Stand-alone runs of digits ("42"); digits glued to letters ("abc123") do not match.
_STANDALONE_NUMBER = re.compile(r'\b\d+\b')


def preprocess_text(text):
    """Strip punctuation, stand-alone numbers and excess whitespace from ``text``.

    All ASCII punctuation except '.' and '?' is deleted, together with the
    typographic quotes listed in ``_ADDITIONAL_CHARACTERS_TO_REMOVE``.
    Stand-alone digit runs are then removed and whitespace is collapsed to
    single spaces with no leading or trailing space.

    Note: because '.' is kept, a decimal such as "4.5" is reduced to ".".

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for empty or missing input.
    """
    # Missing values would otherwise fail on `.translate` in the middle of a long apply().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.translate(_PREPROCESS_TRANSLATOR)
    text = _STANDALONE_NUMBER.sub('', text)

    # split()/join collapses every whitespace run and trims both ends.
    return ' '.join(text.split())

### Regex tokenizer (no spaCy).
### NOTE: a later cell defines another `tokenize`; once that cell has run, it replaces this one.
def tokenize(text):
    """Lazily yield every run of word characters found in ``text``, in order."""
    yield from (word.group(0) for word in re.finditer(r'\w+', text, re.UNICODE))

In [None]:
# spaCy tokenizer used for word counting: only words are returned, not punctuation.
# spaCy splits contractions, so "We're" yields two tokens, like "We are".
def tokenize(text):
    """Return the word tokens of ``text`` as a list of strings.

    Punctuation tokens (such as '.' and '?') and whitespace tokens are left
    out so that ``len(tokenize(text))`` is a word count. Previously every
    token was returned, which inflated the count with sentence punctuation.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Token texts in document order.
    """
    # make_doc runs only the tokenizer; tagging and parsing are not needed to
    # split text, and is_punct / is_space are available without them.
    doc = nlp.make_doc(text)
    return [token.text for token in doc if not (token.is_punct or token.is_space)]

In [None]:
# Sanity check on one sample article: show the cleaned text and its Flesch-Kincaid grade.
sample_article = " seattle (reuters) - a u.s. federal judge on monday said courtroom proceedings over president donald trump’s travel ban should continue in seattle during an ongoing appeals court review. at a hearing, u.s. district judge james robart in seattle said he was not prepared to slow down the case. robart directed attorneys for the u.s. justice department and washington state’s attorney general to prepare for further proceedings in seattle."
text = preprocess_text(sample_article)
doc = nlp(text)
print(text)

sample_grade = textstat.flesch_kincaid_grade(text)
print(sample_grade)

In [None]:
# Build the LIWC token parser from the LIWC2007 English dictionary file.
# `parse(token)` yields the category labels that match a token.
LIWC_DICTIONARY_PATH = 'LIWC2007_English100131.dic'
parse, category_names = liwc.load_token_parser(LIWC_DICTIONARY_PATH)

In [None]:
#Linguistic Features based on Gravanis

# Third-person pronouns counted for 'Percentage_Third_Person'.
# Built once as a frozenset instead of a list rebuilt and scanned on every call.
_THIRD_PERSON_PRONOUNS = frozenset((
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
))

# Output column -> LIWC category labels whose counts are summed for that column.
# The order here is the order of the returned keys (and so of the CSV columns).
_LIWC_PERCENTAGE_COLUMNS = (
    ('Percentage_Exclusive', ('excl',)),
    ('Percentage_Negation', ('negate',)),
    ('Percentage_Causation', ('cause',)),
    # NOTE(review): if the dictionary also tags 'see'/'hear'/'feel' words with
    # the 'percept' label, this sum counts them twice -- confirm against the .dic file.
    ('Percentage_Sense', ('percept', 'see', 'hear', 'feel')),
    ('Percentage_PositiveEmo', ('posemo',)),
    ('Percentage_NegativeEmo', ('negemo',)),
    ('Percentage_AffectiveTerms', ('affect',)),
    ('Percentage_Prepositions', ('preps',)),
    ('Percentage_CognitiveProcess', ('cogmech',)),
    ('Percentage_Insight', ('insight',)),
    ('Percentage_Discrepancy', ('discrep',)),
    ('Percentage_Tentative', ('tentat',)),
    ('Percentage_Certainty', ('certain',)),
    ('Percentage_Social', ('social',)),
    ('Percentage_Space', ('space',)),
    ('Percentage_Inclusive', ('incl',)),
    ('Percentage_Motion', ('motion',)),
    ('Percentage_Time', ('time',)),
    ('Percentage_PastVerb', ('past',)),
    ('Percentage_PresentVerb', ('present',)),
    ('Percentage_FutureVerb', ('future',)),
    ('Percentage_Article', ('article',)),
    ('Percentage_Pronoun', ('pronoun',)),
    ('Percentage_Conjunction', ('conj',)),
)


def calculate_linguistic_features(text):
    """Compute the linguistic feature set for one document.

    The text is cleaned with ``preprocess_text``, analysed with the spaCy
    pipeline ``nlp`` (sentences, POS tags) and tokenized with ``tokenize``;
    the tokens are matched against the LIWC dictionary through ``parse``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    dict
        31 entries, in this order: 'Words_per_Sentence'; 26 'Percentage_*'
        values (category count / total words * 100); 'Lexical_Diversity'
        (unique alphabetic tokens / total words); 'Emotiveness'
        ((adjectives + adverbs) / (nouns + verbs)); 'Rate_of_Adjectives_Adverbs'
        ((adjectives + adverbs) / total words); and 'Flesch_Kincaid'.
        Every ratio is 0 when its denominator is 0.
    """
    text = preprocess_text(text)
    doc = nlp(text)

    # Tokenize once and reuse the list for the word count and the LIWC lookup.
    # It used to be tokenized twice, each time re-running the pipeline.
    text_tokens = tokenize(text)
    total_words = len(text_tokens)
    total_sentences = sum(1 for _ in doc.sents)

    def percentage(count):
        # Share of all words, in percent; 0 for an empty document.
        return (count / total_words) * 100 if total_words > 0 else 0

    # The LIWC dictionary matches lower-case strings only, so tokens are
    # lower-cased before lookup; otherwise "I" or "The" would never match.
    liwc_counts = Counter(
        category
        for token in text_tokens
        for category in parse(token.lower())
    )

    # One pass over the parsed document collects everything spaCy provides.
    third_person_count = 0
    unique_words = set()
    pos_counts = Counter()
    for token in doc:
        if token.lower_ in _THIRD_PERSON_PRONOUNS:
            third_person_count += 1
        if token.is_alpha:
            unique_words.add(token.text.lower())
        pos_counts[token.pos_] += 1

    modifiers = pos_counts["ADJ"] + pos_counts["ADV"]
    content_words = pos_counts["NOUN"] + pos_counts["VERB"]

    emotiveness = modifiers / content_words if content_words > 0 else 0
    rate_of_adj = modifiers / total_words if total_words > 0 else 0
    lexical_diversity = len(unique_words) / total_words if total_words > 0 else 0

    features = {
        'Words_per_Sentence': total_words / total_sentences if total_sentences > 0 else 0,
        'Percentage_First_Person_Singular': percentage(liwc_counts['i']),
        'Percentage_Third_Person': percentage(third_person_count),
    }
    for column, categories in _LIWC_PERCENTAGE_COLUMNS:
        features[column] = percentage(sum(liwc_counts[label] for label in categories))

    features['Lexical_Diversity'] = lexical_diversity
    features['Emotiveness'] = emotiveness
    features['Rate_of_Adjectives_Adverbs'] = rate_of_adj
    # Readability is computed on the cleaned text, as the other features are.
    features['Flesch_Kincaid'] = textstat.flesch_kincaid_grade(text)
    return features

In [None]:
# Compute the linguistic features for every validation article (via swifter)
# and append them to val_data as new columns.
val_features = val_data['text'].swifter.apply(calculate_linguistic_features)

# Build the feature table in one step; `.apply(pd.Series)` creates one Series
# per row and is much slower. The float cast matches its all-float result.
val_features = pd.DataFrame(val_features.tolist(), index=val_features.index).astype(float)

# Drop feature columns left by an earlier run of this cell, so that re-running
# it replaces them instead of appending duplicate columns.
val_data = pd.concat(
    [val_data.drop(columns=val_features.columns, errors='ignore'), val_features],
    axis=1,
)

In [None]:
# Save the validation split to val.csv in the working directory, without the index column.
val_data.to_csv('val.csv', index=False)

In [None]:
# Compute the linguistic features for every test article (via swifter)
# and append them to test_data as new columns.
test_features = test_data['text'].swifter.apply(calculate_linguistic_features)

# Build the feature table in one step; `.apply(pd.Series)` creates one Series
# per row and is much slower. The float cast matches its all-float result.
test_features = pd.DataFrame(test_features.tolist(), index=test_features.index).astype(float)

# Drop feature columns left by an earlier run of this cell, so that re-running
# it replaces them instead of appending duplicate columns.
test_data = pd.concat(
    [test_data.drop(columns=test_features.columns, errors='ignore'), test_features],
    axis=1,
)

In [None]:
# Save the test split to test.csv in the working directory, without the index column.
test_data.to_csv('test.csv', index=False)

In [None]:
# Compute the linguistic features for every training article (via swifter)
# and append them to train_data as new columns.
train_features = train_data['text'].swifter.apply(calculate_linguistic_features)

# Build the feature table in one step; `.apply(pd.Series)` creates one Series
# per row and is much slower. The float cast matches its all-float result.
train_features = pd.DataFrame(train_features.tolist(), index=train_features.index).astype(float)

# Drop feature columns left by an earlier run of this cell, so that re-running
# it replaces them instead of appending duplicate columns.
train_data = pd.concat(
    [train_data.drop(columns=train_features.columns, errors='ignore'), train_features],
    axis=1,
)

In [None]:
# Save the training split to train.csv in the working directory, without the index column.
train_data.to_csv('train.csv', index=False)

In [None]:
# Display the training split for a visual check.
train_data

In [None]:
# Class balance of the training split: number of rows per value of the "label" column.
train_data.value_counts("label")

In [None]:
# Display the validation split for a visual check.
val_data

In [None]:
# Class balance of the validation split: number of rows per value of the "label" column.
val_data.value_counts("label")

In [None]:
# Display the test split for a visual check.
test_data

In [None]:
# Class balance of the test split: number of rows per value of the "label" column.
test_data.value_counts("label")

In [None]:
# Split overlap check: reload the saved splits and compare their 'unique_id' sets.
# If two splits share no IDs, each "not in" count equals the number of distinct
# IDs in the first split.

# Load the CSV files into DataFrames
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')
test_data = pd.read_csv('test.csv')

# Get unique IDs for each dataset
train_unique_ids = set(train_data['unique_id'])
val_unique_ids = set(val_data['unique_id'])
test_unique_ids = set(test_data['unique_id'])

# Find unique IDs in train not in val or test
train_not_in_val = train_unique_ids - val_unique_ids
train_not_in_test = train_unique_ids - test_unique_ids

# Find unique IDs in val not in train or test
val_not_in_train = val_unique_ids - train_unique_ids
val_not_in_test = val_unique_ids - test_unique_ids

# Find unique IDs in test not in train or val
test_not_in_train = test_unique_ids - train_unique_ids
test_not_in_val = test_unique_ids - val_unique_ids

print(f"Rows in train_data: {len(train_data)}")
print(f"Rows in train_data not in val_data: {len(train_not_in_val)}")
print(f"Rows in train_data not in test_data: {len(train_not_in_test)}")
print("")

# Fixed label: this line reports the validation row count but used to say "train_data".
print(f"Rows in val_data: {len(val_data)}")
print(f"Rows in val_data not in train_data: {len(val_not_in_train)}")
print(f"Rows in val_data not in test_data: {len(val_not_in_test)}")
print("")

print(f"Rows in test_data: {len(test_data)}")
print(f"Rows in test_data not in train_data: {len(test_not_in_train)}")
print(f"Rows in test_data not in val_data: {len(test_not_in_val)}")