In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import os

import re
from collections import Counter
import liwc
import spacy
import spacy_transformers
import swifter
import string

In [2]:
# Load the datasets
train_data = pd.read_csv('../BaseDataset/train.csv')
val_data = pd.read_csv('../BaseDataset/val.csv')
test_data = pd.read_csv('../BaseDataset/test.csv')

In [3]:
#Uncomment this code if it's your first time running this code

#pip install en_core_web_md-3.7.1-py3-none-any.whl

In [4]:
# Load the spaCy English model
#nlp = spacy.load("en_core_web_trf")

nlp = spacy.load("en_core_web_md")

In [5]:
#Function designed to remove excess whitespaces and numbers. This is to ensure that the tokenize function only counts words.
#Numbers do not count towards words as they do not have any associated linguistic features.
def preprocess_text(text):

    # Characters to keep
    characters_to_keep = '.?'

    # Additional characters to remove
    additional_characters_to_remove = '‘’“”|@#$%^&*(;:),{<>}"[\/]+-=_~`'

    # Create a translator to remove specified characters
    translator = str.maketrans('', '', ''.join(set(string.punctuation) - set(characters_to_keep)) + additional_characters_to_remove)
    
    # Apply the translation to the text
    text = text.translate(translator)
    
    # Remove numbers
    text = re.sub(r'\b\d+\b', '', text)

    # Remove excess whitespaces
    text = ' '.join(text.split())

    return text

### Tokenizer without Spacy
def tokenize(text):
    for match in re.finditer(r'\w+', text, re.UNICODE):
        yield match.group(0)

In [6]:
#Tokenizer to only count words, not punctuations. This for accurate word counting. We're is counted equivalently to We Are.
def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [7]:
text = " seattle (reuters) - a u.s. federal judge on monday said courtroom proceedings over president donald trump’s travel ban should continue in seattle during an ongoing appeals court review. at a hearing, u.s. district judge james robart in seattle said he was not prepared to slow down the case. robart directed attorneys for the u.s. justice department and washington state’s attorney general to prepare for further proceedings in seattle."
text = preprocess_text(text)
print(text)

seattle reuters a u.s. federal judge on monday said courtroom proceedings over president donald trumps travel ban should continue in seattle during an ongoing appeals court review. at a hearing u.s. district judge james robart in seattle said he was not prepared to slow down the case. robart directed attorneys for the u.s. justice department and washington states attorney general to prepare for further proceedings in seattle.


In [8]:
parse, category_names = liwc.load_token_parser('LIWC2007_English100131.dic')

In [9]:
#Linguistic Features based on Hancock and Woodworth
def calculate_linguistic_features(text):
    text = preprocess_text(text)
    doc = nlp(text)
    total_words = len(tokenize(text))
    total_sentences = len(list(doc.sents))

    # List of first person singular, second person, and third person pronouns
    first_person_singular_words = ["i", "me", "my", "mine"]
    second_person_words = ["you", "your", "yours", "yourself"]
    third_person_words = ["he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves"]

    # Initialize counts
    first_person_singular_count = 0
    second_person_count = 0
    third_person_count = 0

    # Count occurrences of pronouns and other linguistic features
    text_tokens = tokenize(text)
    token_counts = Counter(category for token in text_tokens for category in parse(token))

    for token in doc:
        if token.lower_ in first_person_singular_words:
            first_person_singular_count += 1
        elif token.lower_ in second_person_words:
            second_person_count += 1
        elif token.lower_ in third_person_words:
            third_person_count += 1

    # Retrieve counts from the LIWC dictionary
    negation_count = token_counts['negate']
    exclusive_count = token_counts['excl']
    causation_count = token_counts['cause']
    senses_count = token_counts['percept'] + token_counts['see'] + token_counts['hear'] + token_counts['feel']

    return {
        'Words_per_Sentence': total_words / total_sentences if total_sentences > 0 else 0,
        'Percentage_Questions': (len([sent.text for sent in doc.sents if '?' in sent.text]) / total_sentences) * 100 if total_sentences > 0 else 0,
        'Percentage_First_Person_Singular': (first_person_singular_count / total_words) * 100 if total_words > 0 else 0,
        'Percentage_Second_Person': (second_person_count / total_words) * 100 if total_words > 0 else 0,
        'Percentage_Third_Person': (third_person_count / total_words) * 100 if total_words > 0 else 0,
        'Percentage_Negation': (negation_count / total_words) * 100 if total_words > 0 else 0,
        'Percentage_Exclusive': (exclusive_count / total_words) * 100 if total_words > 0 else 0,
        'Percentage_Causation': (causation_count / total_words) * 100 if total_words > 0 else 0,
        'Percentage_Sense': (senses_count / total_words) * 100 if total_words > 0 else 0
    }

In [10]:
# Apply the function to val_data in parallel
val_data = pd.concat([val_data, val_data['text'].swifter.apply(calculate_linguistic_features).apply(pd.Series)], axis=1)

Pandas Apply:   0%|          | 0/6080 [00:00<?, ?it/s]

In [11]:
val_data.to_csv('val.csv', index=False)

In [12]:
# Apply the function to test_data in parallel
test_data = pd.concat([test_data, test_data['text'].swifter.apply(calculate_linguistic_features).apply(pd.Series)], axis=1)

Pandas Apply:   0%|          | 0/6081 [00:00<?, ?it/s]

In [13]:
test_data.to_csv('test.csv', index=False)

In [14]:
# Apply the function to train_data in parallel
train_data = pd.concat([train_data, train_data['text'].swifter.apply(calculate_linguistic_features).apply(pd.Series)], axis=1)

Pandas Apply:   0%|          | 0/48639 [00:00<?, ?it/s]

In [15]:
train_data.to_csv('train.csv', index=False)

In [16]:
train_data

Unnamed: 0,title,text,label,unique_id,Words_per_Sentence,Percentage_Questions,Percentage_First_Person_Singular,Percentage_Second_Person,Percentage_Third_Person,Percentage_Negation,Percentage_Exclusive,Percentage_Causation,Percentage_Sense
0,"iranian general, assad discuss joint military ...",beirut (reuters) - iran s military chief met w...,0,27401,28.777778,0.000000,0.000000,0.000000,1.930502,0.000000,0.000000,2.702703,5.405405
1,top u.s. official visits vietnam to assess hum...,hanoi (reuters) - a top u.s. envoy began a two...,0,41572,32.900000,0.000000,0.000000,0.000000,2.735562,0.607903,1.519757,1.519757,4.863222
2,senators want probe of allergan transfer deal ...,(reuters) - four u.s. senators have asked the ...,0,54822,30.416667,0.000000,0.000000,0.000000,2.191781,0.547945,0.821918,1.095890,6.027397
3,will the comey bombshell really shake up the 2...,first read is a morning briefing from meet the...,0,26745,21.254545,3.636364,0.085543,0.000000,4.790419,1.197605,1.710864,2.138580,3.079555
4,"egypt election in view, sisi supporters fire u...",cairo (reuters) - six months before egypt s el...,0,42204,24.444444,0.000000,0.000000,0.090909,3.181818,0.545455,1.636364,2.181818,6.181818
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48634,loretta lynch makes disturbing video encouragi...,obama s former ag loretta lynch released a vid...,1,21575,17.466667,0.000000,1.908397,0.763359,3.053435,0.381679,1.145038,0.381679,3.053435
48635,come on down to hole-suckers on southside and ...,come on down to hole-suckers on southside and ...,1,5390,23.500000,0.000000,3.723404,2.127660,1.063830,0.531915,0.531915,1.063830,1.063830
48636,please make it stop,"posted on november 7, 2016 by walter brasch wi...",1,860,25.785714,0.000000,0.000000,0.000000,2.908587,0.415512,1.800554,0.969529,0.831025
48637,report: trump’s mind shattered in the face of ...,"going into election day, trump s campaign is w...",1,15795,26.809524,4.761905,0.532860,0.177620,8.703375,0.355240,1.598579,1.598579,2.131439


In [17]:
train_data.value_counts("label")

label
0    27359
1    21280
dtype: int64

In [18]:
val_data

Unnamed: 0,title,text,label,unique_id,Words_per_Sentence,Percentage_Questions,Percentage_First_Person_Singular,Percentage_Second_Person,Percentage_Third_Person,Percentage_Negation,Percentage_Exclusive,Percentage_Causation,Percentage_Sense
0,u.s. allied syrian groups form civilian counci...,"deir al zor, syria (reuters) - u.s.-allied mil...",0,36938,28.866667,0.000000,0.000000,0.000000,2.078522,0.000000,0.461894,2.078522,1.154734
1,ga congressional dem candidate ossoff: not an ...,dem. candidate for georgia congressional seat ...,0,40217,15.666667,0.000000,4.787234,0.000000,5.851064,1.063830,1.595745,0.000000,0.000000
2,u.s. lawmakers ask trump to turn over any come...,washington (reuters) - u.s. lawmakers on sunda...,0,52849,24.709677,0.000000,0.000000,0.261097,1.958225,1.174935,2.349869,1.044386,6.527415
3,meeting between bill clinton and loretta lynch...,washington — an airport encounter this week be...,0,51808,25.076923,0.000000,1.380368,0.153374,4.447853,0.613497,1.380368,1.533742,5.674847
4,"merkel, bavaria allies agree on migrant policy...",berlin (reuters) - german chancellor angela me...,0,33337,25.000000,0.000000,0.000000,0.000000,2.000000,0.000000,1.000000,2.000000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,oops: donald trump’s debate guest supports ter...,this lack of oversight proves that donald trum...,1,139,56.166667,16.666667,0.296736,0.000000,4.154303,1.186944,1.186944,0.593472,5.341246
6076,spot on! fox sports host calls out espn’s libe...,tucker carlson responded to an espn anchor cal...,1,11500,22.900000,0.000000,0.000000,0.000000,2.183406,0.436681,0.873362,0.000000,5.240175
6077,obama commencement speech to black graduates: ...,because getting something for nothing is all t...,1,13781,18.095238,14.285714,1.842105,3.684211,5.263158,1.052632,2.105263,1.315789,5.789474
6078,comment on europe’s forgotten ‘hitler’ killed ...,black emanuelle fixed all that in 1976. attila...,1,3328,16.611354,10.043668,1.156677,2.576236,2.970557,1.445846,2.602524,1.393270,2.628812


In [19]:
val_data.value_counts("label")

label
0    3420
1    2660
dtype: int64

In [20]:
test_data

Unnamed: 0,title,text,label,unique_id,Words_per_Sentence,Percentage_Questions,Percentage_First_Person_Singular,Percentage_Second_Person,Percentage_Third_Person,Percentage_Negation,Percentage_Exclusive,Percentage_Causation,Percentage_Sense
0,"hypothetically speaking, u.s. admiral says rea...",melbourne (reuters) - the u.s. pacific fleet c...,0,55612,25.066667,0.000000,0.000000,0.265957,2.393617,0.000000,1.595745,2.393617,7.446809
1,south africa's anc says party officials barred...,johannesburg (reuters) - south africa s africa...,0,55993,23.333333,0.000000,0.000000,0.000000,2.857143,2.857143,2.857143,2.142857,2.857143
2,seattle judge says trump travel ban case shoul...,seattle (reuters) - a u.s. federal judge on mo...,0,58277,24.333333,0.000000,0.000000,0.000000,1.369863,1.369863,1.369863,0.000000,8.219178
3,outspoken lieutenant general named trump's top...,"west palm beach, fla./washington (reuters) - u...",0,52137,27.466667,0.000000,0.404531,0.242718,3.559871,0.485437,0.566343,1.779935,3.478964
4,trump calls for worldwide action against north...,seoul (reuters) - u.s. president donald trump ...,0,47382,40.666667,0.000000,0.000000,0.000000,3.278689,0.000000,0.819672,3.278689,6.557377
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6076,watch: wolf blitzer silences trump supporter’s...,trump campaign manager kellyanne conway ran in...,1,21903,26.708333,8.333333,0.468019,0.624025,4.680187,0.936037,2.184087,1.560062,1.872075
6077,mass nye sexual assaults in europe explained: ...,this is possibly the most disturbing video we ...,1,7363,32.700000,0.000000,0.000000,0.000000,1.834862,0.000000,0.305810,0.000000,4.587156
6078,"pro abortion pac, emily’s list doing its part ...",of course emily s list is going to support hil...,1,22076,25.600000,0.000000,0.000000,0.000000,4.427083,0.520833,0.781250,3.125000,2.083333
6079,trump raises concern over members of urban com...,nation puts 2016 election into perspective by ...,1,21417,63.000000,0.000000,0.000000,0.000000,5.555556,0.000000,2.777778,0.793651,2.380952


In [21]:
test_data.value_counts("label")

label
0    3420
1    2661
dtype: int64

In [22]:
# Load the CSV files into DataFrames
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')
test_data = pd.read_csv('test.csv')

# Get unique IDs for each dataset
train_unique_ids = set(train_data['unique_id'])
val_unique_ids = set(val_data['unique_id'])
test_unique_ids = set(test_data['unique_id'])

# Find unique IDs in train not in val or test
train_not_in_val = train_unique_ids - val_unique_ids
train_not_in_test = train_unique_ids - test_unique_ids

# Find unique IDs in val not in train or test
val_not_in_train = val_unique_ids - train_unique_ids
val_not_in_test = val_unique_ids - test_unique_ids

# Find unique IDs in test not in train or val
test_not_in_train = test_unique_ids - train_unique_ids
test_not_in_val = test_unique_ids - val_unique_ids

print(f"Rows in train_data: {len(train_data)}")
print(f"Rows in train_data not in val_data: {len(train_not_in_val)}")
print(f"Rows in train_data not in test_data: {len(train_not_in_test)}")
print("")

print(f"Rows in train_data: {len(val_data)}")
print(f"Rows in val_data not in train_data: {len(val_not_in_train)}")
print(f"Rows in val_data not in test_data: {len(val_not_in_test)}")
print("")

print(f"Rows in test_data: {len(test_data)}")
print(f"Rows in test_data not in train_data: {len(test_not_in_train)}")
print(f"Rows in test_data not in val_data: {len(test_not_in_val)}")

Rows in train_data: 48639
Rows in train_data not in val_data: 48639
Rows in train_data not in test_data: 48639

Rows in train_data: 6080
Rows in val_data not in train_data: 6080
Rows in val_data not in test_data: 6080

Rows in test_data: 6081
Rows in test_data not in train_data: 6081
Rows in test_data not in val_data: 6081
