In [1]:
import pandas as pd
from textblob import TextBlob
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from readability import Readability



In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
DATA_PATH = '../data/'
PREPROCESSED_DATA_FILE = "cleaned_news.csv"
RANDOM_STATE = 42

In [4]:
df = pd.read_csv(DATA_PATH + PREPROCESSED_DATA_FILE, low_memory=False)

In [5]:
df

Unnamed: 0,title,text,subject,date,label,cleaned_text,tokens_no_stopwords
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake,Donald Trump Sends Out Embarrassing New Years...,"['Donald', 'Trump', 'wish', 'Americans', 'Happ..."
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake,Drunk Bragging Trump Staffer Started Russian ...,"['House', 'Intelligence', 'Committee', 'Chairm..."
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake,Sheriff David Clarke Becomes An Internet Joke...,"['Friday', 'revealed', 'former', 'Milwaukee', ..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake,Trump Is So Obsessed He Even Has Obamas Name ...,"['Christmas', 'day', 'Donald', 'Trump', 'annou..."
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake,Pope Francis Just Called Out Donald Trump Dur...,"['Pope', 'Francis', 'used', 'annual', 'Christm..."
...,...,...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",real,Fully committed NATO backs new US approach on ...,"['BRUSSELS', 'Reuters', 'NATO', 'allies', 'Tue..."
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",real,LexisNexis withdrew two products from Chinese ...,"['LONDON', 'Reuters', 'LexisNexis', 'provider'..."
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",real,Minsk cultural hub becomes haven from authorities,"['MINSK', 'Reuters', 'shadow', 'disused', 'Sov..."
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",real,Vatican upbeat on possibility of Pope Francis ...,"['MOSCOW', 'Reuters', 'Vatican', 'Secretary', ..."


In [6]:
df = df.drop(['text','date','subject'],axis=1)

In [7]:
df

Unnamed: 0,title,label,cleaned_text,tokens_no_stopwords
0,Donald Trump Sends Out Embarrassing New Year’...,fake,Donald Trump Sends Out Embarrassing New Years...,"['Donald', 'Trump', 'wish', 'Americans', 'Happ..."
1,Drunk Bragging Trump Staffer Started Russian ...,fake,Drunk Bragging Trump Staffer Started Russian ...,"['House', 'Intelligence', 'Committee', 'Chairm..."
2,Sheriff David Clarke Becomes An Internet Joke...,fake,Sheriff David Clarke Becomes An Internet Joke...,"['Friday', 'revealed', 'former', 'Milwaukee', ..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,fake,Trump Is So Obsessed He Even Has Obamas Name ...,"['Christmas', 'day', 'Donald', 'Trump', 'annou..."
4,Pope Francis Just Called Out Donald Trump Dur...,fake,Pope Francis Just Called Out Donald Trump Dur...,"['Pope', 'Francis', 'used', 'annual', 'Christm..."
...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,real,Fully committed NATO backs new US approach on ...,"['BRUSSELS', 'Reuters', 'NATO', 'allies', 'Tue..."
44894,LexisNexis withdrew two products from Chinese ...,real,LexisNexis withdrew two products from Chinese ...,"['LONDON', 'Reuters', 'LexisNexis', 'provider'..."
44895,Minsk cultural hub becomes haven from authorities,real,Minsk cultural hub becomes haven from authorities,"['MINSK', 'Reuters', 'shadow', 'disused', 'Sov..."
44896,Vatican upbeat on possibility of Pope Francis ...,real,Vatican upbeat on possibility of Pope Francis ...,"['MOSCOW', 'Reuters', 'Vatican', 'Secretary', ..."


# Surface Information


In [8]:
def letters_per_word(text):
    words = text.split()
    if words:
        return sum(len(word) for word in words) / len(words)
    else:
        return 0

In [9]:
def count_paragraphs(text):
    return text.count('\n\n')

In [10]:
def count_sentences(text):
    return len(TextBlob(text).sentences)

In [11]:
def count_types(text):
    return len(set(text.split()))

In [12]:
def count_words(text):
    return len(text.split())

In [13]:
def words_per_sentence(text):
    sentences = TextBlob(text).sentences
    if sentences:
        return sum(len(sentence.words) for sentence in sentences) / len(sentences)
    else:
        return 0

In [14]:
def type_token_ratio(text):
    words = text.split()
    types = set(words)
    if words:
        return len(types) / len(words)
    else:
        return 0

In [15]:
df['Letters per Word'] = df['cleaned_text'].apply(letters_per_word)
df['Number of Paragraphs'] = df['cleaned_text'].apply(count_paragraphs)
df['Number of Sentences'] = df['cleaned_text'].apply(count_sentences)
df['Number of Types'] = df['cleaned_text'].apply(count_types)
df['Number of Words'] = df['cleaned_text'].apply(count_words)
df['Number of Words per Sentence'] = df['cleaned_text'].apply(words_per_sentence)
df['TTR'] = df['cleaned_text'].apply(type_token_ratio)

# Function to count parts of speech

In [16]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
def pos_counts(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    pos_counts = pd.Series([tag for word, tag in pos_tags]).value_counts()
    return pos_counts

In [18]:
pos_features = df['cleaned_text'].apply(pos_counts).fillna(0).astype(int)

In [19]:
pos_features

Unnamed: 0,NNP,VBZ,RP,NN,DT,VBG,IN,TO,RB,JJ,...,RBR,WP,WDT,UH,PDT,$,FW,'',POS,WP$
0,6,2,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8,0,0,1,2,1,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,2,0,0,0,0,0,0,2,1,...,0,0,0,0,0,0,0,0,0,0
4,7,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,3,1,0,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
44894,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
44895,1,0,0,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
44896,3,0,0,2,0,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Connectors


In [20]:
from collections import Counter
import re

In [21]:
connectors_add_info = ['and', 'also', 'besides', 'furthermore', 'in addition', 'moreover', 'as well as']
connectors_comparison = ['similarly', 'likewise', 'equally']
connectors_contrast = ['but', 'however', 'on the other hand', 'nevertheless', 'nonetheless', 'whereas', 'while', 'conversely']
connectors_emphasis = ['especially', 'particularly', 'in particular', 'notably', 'above all']
connectors_explain = ['for example', 'for instance', 'such as', 'including', 'namely']
connectors_expressing_facts = ['actually', 'in fact', 'indeed']
connectors_expressing_opinion = ['in my opinion', 'I believe', 'I think', 'personally']
connectors_reason_cause = ['because', 'since', 'as', 'due to', 'owing to', 'the reason why']
connectors_time_sequence = ['first', 'second', 'then', 'next', 'finally', 'after', 'afterward', 'before', 'previously', 'subsequently']

In [22]:
def count_connectors(text):
    counts = {
        'add_info':0,
        'comparison': 0,
        'contrast': 0,
        'emphasis': 0,
        'explain': 0,
        'expressing_facts': 0,
        'expressing_opinion': 0,
        'reason_cause': 0,
        'time_sequence': 0
    }

    for word in re.findall(r"\b\w+\b", text.lower()):
        if word in connectors_add_info:
            counts['add_info'] += 1
        if word in connectors_comparison:
            counts['comparison'] += 1
        if word in connectors_contrast:
            counts['contrast'] += 1
        if word in connectors_emphasis:
            counts['emphasis'] += 1
        if word in connectors_explain:
            counts['explain'] += 1
        if word in connectors_expressing_facts:
            counts['expressing_facts'] += 1
        if word in connectors_expressing_opinion:
            counts['expressing_opinion'] += 1
        if word in connectors_reason_cause:
            counts['reason_cause'] += 1
        if word in connectors_time_sequence:
            counts['time_sequence'] += 1
    return pd.Series(counts)

In [23]:
connector_features = df['cleaned_text'].apply(count_connectors)
df = pd.concat([df, connector_features], axis=1)


# Discursive characteristics


In [24]:
import textstat

In [25]:
def Discursive_char(text):
    tokens = word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)

    determiners = sum(1 for word, tag in tagged_tokens if tag == 'DT')
    pronouns = sum(1 for word, tag in tagged_tokens if 'PRP' in tag)
    first_person_pronouns = sum(1 for word in tokens if word.lower() in ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'])
    positive_words = sum(1 for word in tokens if word.lower() in ['not', 'no', 'never', 'none', "isn't", "wasn't", "aren't", "won't", "didn't", "don't", "doesn't", "haven't", "hasn't"] )
    negative_words = sum(1 for word in tokens if word.lower() in ['bad', 'worse', 'negative', 'sad'])
    negations = sum(1 for word in tokens if word.lower() in ['not', 'no', 'never', 'none', "isn't", "wasn't", "aren't", "won't", "didn't", "don't", "doesn't", "haven't", "hasn't"])
    second_person_pronouns = sum(1 for word in tokens if word.lower() in ['you', 'your', 'yours', 'yourself', 'yourselves'])
    third_person_pronouns = sum(1 for word in tokens if word.lower() in ['he', 'she', 'it', 'him', 'her', 'his', 'hers', 'its', 'himself', 'herself', 'itself', 'they', 'them', 'their', 'theirs', 'themselves'])
    swear_words = sum(1 for word in tokens if word.lower() in ['shit', "fck", "fc**k", "bitch", "st" ])
    discourse_markers = sum(1 for word in tokens if word.lower() in ['however', 'furthermore', 'moreover', 'therefore', 'thus', 'consequently', 'hence'])
    key_connectors = sum(1 for word in tokens if word.lower() in ['and', 'but', 'or', 'so', 'yet', 'for', 'nor'])
    future_regex = re.compile(r'\bwill\b|\bshall\b', re.IGNORECASE)
    mechanism_words = sum(1 for word in tokens if word.lower() in ['process', 'method', 'system', 'approach', 'technique'])
    LIWC_pronouns = sum(1 for word in tokens if word.lower() in ['I', 'me', 'my', 'mine', 'you', 'your', 'yours', 'he', 'him', 'his', 'she', 'her', 'hers', 'we', 'us', 'our', 'ours', 'they', 'them', 'their', 'theirs'] )
    LIWC_psychological = sum(1 for word in tokens if word.lower() in ['think', 'believe', 'understand', 'know', 'consider', 'remember', 'imagine'])


    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)

    features = {
    'Determiners': determiners,
        'Pronouns': pronouns,
        'First Person Pronouns': first_person_pronouns,
        'Negations': negations,
        'Positive Words': positive_words,
        'Negative Words': negative_words,
        'ARI': ari,
        'CLI': cli,
        'SPP' : second_person_pronouns,
        'TPP' : third_person_pronouns,
        'Swear' : swear_words,
        'Discourse_markers': discourse_markers,
        'Key_conectors': key_connectors,
        'Future_regex': future_regex,
        'Mechanism': mechanism_words,
        'LIWC_pronouns': LIWC_pronouns,
        'LIWC_psychological': LIWC_psychological


}
    return features

In [26]:
def state_of_art(text):
    state_of_art = ['state of art', 'cutting-edge', 'latest', 'innovative', 'advanced']
    pattern = '|'.join(state_of_art)
    return len(re.findall(pattern, text.lower()))

In [27]:
def count_summary_phrases(text):
    summary_phrases = ['in summary', 'to summarize', 'overall', 'in conclusion']
    pattern = '|'.join(summary_phrases)
    return len(re.findall(pattern, text.lower()))


In [28]:
df['State of the Art Phrases'] = df['cleaned_text'].apply(state_of_art)
df['Summary Phrases'] = df['cleaned_text'].apply(count_summary_phrases)

In [29]:
state_and_summary = ['State of the Art Phrases', 'Summary Phrases']

state_and_summary_df = df[state_and_summary]

In [30]:
features_df = df['cleaned_text'].apply(lambda text: pd.Series(Discursive_char(text)))

In [31]:
features_df

Unnamed: 0,Determiners,Pronouns,First Person Pronouns,Negations,Positive Words,Negative Words,ARI,CLI,SPP,TPP,Swear,Discourse_markers,Key_conectors,Future_regex,Mechanism,LIWC_pronouns,LIWC_psychological
0,1,0,0,0,0,0,10.1,13.27,0,0,0,0,0,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,0,0
1,0,0,0,0,0,0,18.5,24.61,0,0,0,0,0,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,0,0
2,2,0,0,0,0,0,9.0,10.37,0,0,0,0,1,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,0,0
3,0,2,0,0,0,0,6.1,7.42,0,2,0,0,1,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,2,0
4,0,1,0,0,0,0,9.3,12.62,0,1,0,0,0,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,0,0,1,0,0,0,8.7,12.50,0,0,0,0,0,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",1,1,0
44894,0,0,0,0,0,0,13.0,18.16,0,0,0,0,0,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,0,0
44895,0,0,0,0,0,0,11.0,15.67,0,0,0,0,0,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,0,0
44896,0,0,0,0,0,0,10.8,15.11,0,0,0,0,0,"re.compile('\\bwill\\b|\\bshall\\b', re.IGNORE...",0,0,0


# Readability indices

In [32]:
from textstat import flesch_reading_ease, flesch_kincaid_grade, gunning_fog, smog_index, coleman_liau_index, dale_chall_readability_score, difficult_words, linsear_write_formula

In [33]:
def calculate_readability_scores(text):
    scores = {
        'Flesch Reading Ease': flesch_reading_ease(text),
        'Flesch-Kincaid Grade Level': flesch_kincaid_grade(text),
        'Gunning Fog Index': gunning_fog(text),
        'SMOG Index': smog_index(text),
        'Coleman-Liau Index': coleman_liau_index(text),
        'Dale-chall-readability': dale_chall_readability_score(text),
        'difficult_words': difficult_words(text),
        'Linsear_write_formula': linsear_write_formula(text)
    }
    return scores

In [34]:
readability_scores = df['cleaned_text'].apply(calculate_readability_scores)

In [35]:
readability_scores_df = pd.json_normalize(readability_scores)

In [36]:
combined_data = pd.concat([df, readability_scores_df], axis=1)

In [37]:
combined_data.head()

Unnamed: 0,title,label,cleaned_text,tokens_no_stopwords,Letters per Word,Number of Paragraphs,Number of Sentences,Number of Types,Number of Words,Number of Words per Sentence,...,State of the Art Phrases,Summary Phrases,Flesch Reading Ease,Flesch-Kincaid Grade Level,Gunning Fog Index,SMOG Index,Coleman-Liau Index,Dale-chall-readability,difficult_words,Linsear_write_formula
0,Donald Trump Sends Out Embarrassing New Year’...,fake,Donald Trump Sends Out Embarrassing New Years...,"['Donald', 'Trump', 'wish', 'Americans', 'Happ...",5.416667,0,1,12,12,12.0,...,0,0,59.3,8.0,11.47,0.0,13.27,12.13,3,7.0
1,Drunk Bragging Trump Staffer Started Russian ...,fake,Drunk Bragging Trump Staffer Started Russian ...,"['House', 'Intelligence', 'Committee', 'Chairm...",7.625,0,1,8,8,8.0,...,0,0,21.06,12.3,13.2,0.0,24.61,17.85,5,5.0
2,Sheriff David Clarke Becomes An Internet Joke...,fake,Sheriff David Clarke Becomes An Internet Joke...,"['Friday', 'revealed', 'former', 'Milwaukee', ...",4.866667,0,1,15,15,15.0,...,0,0,64.71,8.0,11.33,0.0,10.37,10.7,4,8.5
3,Trump Is So Obsessed He Even Has Obama’s Name...,fake,Trump Is So Obsessed He Even Has Obamas Name ...,"['Christmas', 'day', 'Donald', 'Trump', 'annou...",4.357143,0,1,14,14,14.0,...,0,0,74.19,6.4,5.6,0.0,7.42,11.1,5,6.0
4,Pope Francis Just Called Out Donald Trump Dur...,fake,Pope Francis Just Called Out Donald Trump Dur...,"['Pope', 'Francis', 'used', 'annual', 'Christm...",5.363636,0,1,11,11,11.0,...,0,0,77.23,5.2,4.4,0.0,12.62,11.36,2,4.5


In [38]:

surface_info_columns = ['Letters per Word', 'Number of Paragraphs', 'Number of Sentences',
                        'Number of Types', 'Number of Words', 'Number of Words per Sentence', 'TTR']

surface_info_df = df[surface_info_columns]

final_features_df = pd.concat([surface_info_df, features_df, pos_features, state_and_summary_df, combined_data], axis=1)

final_features_df


Unnamed: 0,Letters per Word,Number of Paragraphs,Number of Sentences,Number of Types,Number of Words,Number of Words per Sentence,TTR,Determiners,Pronouns,First Person Pronouns,...,State of the Art Phrases,Summary Phrases,Flesch Reading Ease,Flesch-Kincaid Grade Level,Gunning Fog Index,SMOG Index,Coleman-Liau Index,Dale-chall-readability,difficult_words,Linsear_write_formula
0,5.416667,0,1,12,12,12.0,1.0,1,0,0,...,0,0,59.30,8.0,11.47,0.0,13.27,12.13,3,7.0
1,7.625000,0,1,8,8,8.0,1.0,0,0,0,...,0,0,21.06,12.3,13.20,0.0,24.61,17.85,5,5.0
2,4.866667,0,1,15,15,15.0,1.0,2,0,0,...,0,0,64.71,8.0,11.33,0.0,10.37,10.70,4,8.5
3,4.357143,0,1,14,14,14.0,1.0,0,2,0,...,0,0,74.19,6.4,5.60,0.0,7.42,11.10,5,6.0
4,5.363636,0,1,11,11,11.0,1.0,0,1,0,...,0,0,77.23,5.2,4.40,0.0,12.62,11.36,2,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,5.444444,0,1,9,9,9.0,1.0,0,0,1,...,0,0,62.34,6.8,8.04,0.0,12.50,12.86,3,4.5
44894,6.571429,0,1,7,7,7.0,1.0,0,0,0,...,0,0,30.53,10.7,8.51,0.0,18.16,13.01,4,3.5
44895,6.142857,0,1,7,7,7.0,1.0,0,0,0,...,0,0,30.53,10.7,14.23,0.0,15.67,17.52,4,4.5
44896,5.888889,0,1,9,9,9.0,1.0,0,0,0,...,0,0,11.58,13.9,16.93,0.0,15.11,16.36,6,6.5


In [39]:
Features = ['cleaned_text', 'title', 'tokens_no_stopwords', 'Future_regex']
df_1 = final_features_df.drop(columns=[col for col in Features if col in final_features_df.columns])

# Saving Features to CSV


In [40]:
DATA_PATH = '../data/'  
FILENAME = 'Linguistic_features.csv' 

In [41]:
file_path = DATA_PATH + FILENAME

In [42]:
df_1.to_csv(file_path, index=False)