In [56]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import networkx
import itertools
import time
import re
import string
from nltk.corpus import stopwords
from collections import Counter
from matplotlib import pyplot as plt
import spacy
import seaborn as sns

In [17]:
def gulpease(text):
    """Compute the Gulpease readability index of a text.

    The index is ``89 + (300 * sentences - 10 * letters) / words``.

    * letters   -- characters for which ``str.isalpha()`` is true, so accented
                   Italian letters (e.g. "è", "à") are counted.
    * words     -- whitespace-separated tokens.
    * sentences -- runs of sentence terminators (".", "!", "?", "…"); a run
                   such as "..." counts as a single sentence end.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    float
        The Gulpease index; ``89.0`` when the text contains no words.
    """
    text = text.strip()
    n_words = len(text.split())
    if n_words == 0:
        # No words: the ratio is undefined, fall back to the bare constant
        # (this is also what the index evaluates to with zero counts).
        return 89.0
    n_phrases = len(re.findall(r'[.!?…]+', text))
    # Count letters directly instead of re.split() on a pattern that can match
    # the empty string: that call behaves differently across Python versions
    # and an ASCII-only class silently drops accented letters.
    n_chars = sum(1 for ch in text if ch.isalpha())
    return 89 + (300 * n_phrases - 10 * n_chars) / n_words

def preprocess(text_string):
    """
    Normalize the whitespace of a text string.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space. Leading and trailing whitespace is reduced
    to one space as well; it is not stripped.

    URLs and @mentions are left untouched: no placeholder substitution is
    performed.

    Parameters
    ----------
    text_string : str
        The raw text.

    Returns
    -------
    str
        The text with whitespace runs replaced by single spaces.
    """
    # Raw string avoids the invalid-escape warning for '\s'; '\s' already
    # covers '\n', so no separate newline substitution is needed.
    return re.sub(r'\s+', ' ', text_string)

In [18]:
# Load the spaCy pipeline "it_core_news_sm"; check_pos_tag runs texts through it to read token POS tags.
nlp = spacy.load("it_core_news_sm")

Load Data

In [19]:
# Read the news dataset from a CSV file located in the current working directory.
df = pd.read_csv('./news-dataset.csv') 

In [20]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV -- confirm).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [21]:
# Keep only rows that have a body text. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['body_text'].notna()].copy()

In [22]:
# Whitespace-normalized copy of every article body, in row order.
df['preprocess_text'] = [preprocess(body) for body in df['body_text']]

In [23]:
# Preview the first rows, now including the preprocess_text column.
df.head()

Unnamed: 0,url,publisher,publish_date,author,title,image,body_text,publish_data,reliability,preprocess_text
0,https://www.conoscenzealconfine.it/oms-italia-...,https://www.conoscenzealconfine.it,,['Conoscenze Al Confine'],OMS: “L’Italia è il banco di prova del Coronav...,https://www.conoscenzealconfine.it/wp-content/...,di Guido da Landriano\n\nL’Italia è forse la “...,,0,di Guido da Landriano L’Italia è forse la “Cav...
1,https://www.ilprimatonazionale.it/politica/cor...,https://www.ilprimatonazionale.it,2020-03-05,[],"Coronavirus, crolla la fiducia in Conte. Ora è...",https://www.ilprimatonazionale.it/wp-content/u...,"Roma, 5 mar – Per mesi e mesi Giuseppe Conte è...",,0,"Roma, 5 mar – Per mesi e mesi Giuseppe Conte è..."
2,https://www.ilprimatonazionale.it/cronaca/coro...,https://www.ilprimatonazionale.it,2020-03-05,[],"Coronavirus, i media esteri all’attacco dell’I...",https://www.ilprimatonazionale.it/wp-content/u...,"Roma, 5 mar – Con l’aggravarsi dell’emergenza ...",,0,"Roma, 5 mar – Con l’aggravarsi dell’emergenza ..."
3,https://www.ilprimatonazionale.it/cultura/coro...,https://www.ilprimatonazionale.it,2020-03-05,[],Il coronavirus un’arma da guerra biologica? Ip...,https://www.ilprimatonazionale.it/wp-content/u...,"Roma, 5 mar – La situazione sta, con tutta evi...",,0,"Roma, 5 mar – La situazione sta, con tutta evi..."
5,https://it.sputniknews.com/italia/202003088836...,https://it.sputniknews.com,,[],"Coronavirus, Italia primo Paese al mondo per t...",https://cdnit2.img.sputniknews.com/images/883/...,Al fine di migliorare il funzionamento del sit...,,0,Al fine di migliorare il funzionamento del sit...


In [24]:
# Split the articles by their reliability label (1 vs 0) and report the
# size of each group: label 0 first, then label 1.
reliability = df['reliability']
df_high = df[reliability == 1]
df_low = df[reliability == 0]
for subset in (df_low, df_high):
    print(subset.shape)

(447, 10)
(2192, 10)


Gulpease Readability Italian Index

In [30]:
# Gulpease index of every preprocessed body, in row order.
df['gulpease'] = [gulpease(body) for body in df['preprocess_text']]

n_stopwords

In [31]:
# Italian stopword list from NLTK (the NLTK 'stopwords' corpus must be available locally).
stop = stopwords.words('italian')

In [32]:
# A set gives constant-time membership tests; scanning the NLTK list for
# every token of every article is needlessly slow.
stop_set = set(stop)


def count_stopwords(text):
    """Return how many whitespace-separated tokens of `text` are stopwords.

    Matching is exact: tokens are compared as-is, so capitalised words or
    tokens with attached punctuation ("Il", "che,") are not counted.
    """
    return sum(1 for token in text.split() if token in stop_set)


df['fea_title_stop_nums'] = df['title'].apply(count_stopwords)
df['fea_body_stop_nums'] = df['preprocess_text'].apply(count_stopwords)

n_words

In [33]:
def count_space_separated(text):
    """Number of pieces obtained by splitting `text` on single space characters."""
    return len(text.split(' '))


df['fea_title_word_nums'] = df['title'].apply(count_space_separated)
df['fea_body_word_nums'] = df['preprocess_text'].apply(count_space_separated)

n_nums

In [34]:
def count_digits(text):
    """Number of characters in `text` for which str.isdigit() is true."""
    return sum(1 for ch in text if ch.isdigit())


df['fea_title_num_nums'] = df['title'].apply(count_digits)
df['fea_body_num_nums'] = df['preprocess_text'].apply(count_digits)

n_low_chars

In [35]:
def count_lowercase(text):
    """Number of characters in `text` for which str.islower() is true."""
    return sum(1 for ch in text if ch.islower())


df['fea_title_chars_low_nums'] = df['title'].apply(count_lowercase)
df['fea_body_chars_low_nums'] = df['preprocess_text'].apply(count_lowercase)

n_upp_chars

In [36]:
def count_uppercase(text):
    """Number of characters in `text` for which str.isupper() is true."""
    return sum(1 for ch in text if ch.isupper())


df['fea_title_chars_upp_nums'] = df['title'].apply(count_uppercase)
df['fea_body_chars_upp_nums'] = df['preprocess_text'].apply(count_uppercase)

n_punct_chars

In [37]:
# Only the ASCII punctuation listed in string.punctuation is counted;
# typographic characters such as curly quotes or dashes are not part of it.
punctuation_chars = set(string.punctuation)


def count_punctuation(text):
    """Number of characters in `text` that belong to string.punctuation."""
    return sum(1 for ch in text if ch in punctuation_chars)


df['fea_title_chars_punct_nums'] = df['title'].apply(count_punctuation)
df['fea_body_chars_punct_nums'] = df['preprocess_text'].apply(count_punctuation)

n_words title+body

In [38]:
# Total word count of an article: title words plus body words.
title_words = df['fea_title_word_nums']
body_words = df['fea_body_word_nums']
df['fea_word_nums'] = title_words + body_words

density word

In [39]:
# Cased characters (lower + upper) per word; the +1 in the denominator
# keeps the ratio defined even for a zero word count.
body_cased_chars = df['fea_body_chars_low_nums'] + df['fea_body_chars_upp_nums']
title_cased_chars = df['fea_title_chars_low_nums'] + df['fea_title_chars_upp_nums']

df['fea_body_word_density'] = body_cased_chars / (df['fea_body_word_nums'] + 1)
df['fea_title_word_density'] = title_cased_chars / (df['fea_title_word_nums'] + 1)

pos tagging

In [40]:
def check_pos_tag(x, flag):
    """Count the tokens of `x` whose part-of-speech tag equals `flag`.

    The text `x` is run through the module-level spaCy pipeline `nlp`, and
    each token's `pos_` attribute is compared with `flag` (e.g. 'NOUN').
    Returns the number of matching tokens as an int.
    """
    return sum(1 for token in nlp(x) if token.pos_ == flag)

In [41]:
# Parse every article once and tally all of its POS tags, instead of running
# the spaCy pipeline five separate times per article (once per tag).
body_pos_counts = [
    Counter(token.pos_ for token in nlp(text))
    for text in df['preprocess_text']
]

# A Counter yields 0 for tags that never occur, which matches what
# check_pos_tag returns for an absent tag.
for column, pos_tag in [
    ('body_noun_count', 'NOUN'),
    ('body_verb_count', 'VERB'),
    ('body_adj_count', 'ADJ'),
    ('body_adv_count', 'ADV'),
    ('body_pron_count', 'PRON'),
]:
    df[column] = [counts[pos_tag] for counts in body_pos_counts]

total length

In [47]:
# Character length of the preprocessed body and of the title.
length_sources = {
    'fea_body_total_length': 'preprocess_text',
    'fea_title_total_length': 'title',
}
for feature_name, source_column in length_sources.items():
    df[feature_name] = df[source_column].apply(len)

upp vs length

In [48]:
# Share of uppercase characters in the text. Column-wise division replaces
# the row-by-row apply(axis=1): it is much faster, yields the same float
# ratios, and a zero-length text produces NaN/inf instead of aborting the
# cell with ZeroDivisionError.
df['fea_body_upp_vs_length'] = df['fea_body_chars_upp_nums'] / df['fea_body_total_length']
df['fea_title_upp_vs_length'] = df['fea_title_chars_upp_nums'] / df['fea_title_total_length']

exclamation and question

In [49]:
def occurrences_of(mark):
    """Build a function that counts how many times `mark` occurs in a text."""
    def counter(text):
        return text.count(mark)
    return counter


df['fea_body_num_exclamation_marks'] = df['preprocess_text'].apply(occurrences_of('!'))
df['fea_body_num_question_marks'] = df['preprocess_text'].apply(occurrences_of('?'))

unique words

In [50]:
def count_unique_tokens(text):
    """Number of distinct whitespace-separated tokens in `text`."""
    return len(set(text.split()))


unique_word_counts = df['preprocess_text'].apply(count_unique_tokens)
body_word_counts = df['fea_body_word_nums']

df['fea_body_num_unique_words'] = unique_word_counts
# Ratio of unique tokens to total words, as a fraction and as a percentage.
df['fea_body_words_vs_unique'] = unique_word_counts / body_word_counts
df['fea_body_word_unique_percent'] = unique_word_counts * 100 / body_word_counts

Save Feature Matrix to File

In [54]:
# Columns exported to the feature matrix: title features, body features,
# combined/derived features, POS counts, the readability index and finally
# the reliability label.
feature_columns = [
    'fea_title_word_nums',
    'fea_title_num_nums',
    'fea_title_chars_low_nums',
    'fea_title_chars_upp_nums',
    'fea_title_chars_punct_nums',
    'fea_title_stop_nums',
    'fea_body_word_nums',
    'fea_body_num_nums',
    'fea_body_chars_low_nums',
    'fea_body_chars_upp_nums',
    'fea_body_chars_punct_nums',
    'fea_body_stop_nums',
    'fea_word_nums',
    'fea_body_word_density',
    'fea_title_word_density',
    'body_noun_count',
    'body_verb_count',
    'body_adj_count',
    'body_adv_count',
    'body_pron_count',
    'fea_body_total_length',
    'fea_title_total_length',
    'fea_body_upp_vs_length',
    'fea_title_upp_vs_length',
    'fea_body_num_exclamation_marks',
    'fea_body_num_question_marks',
    'fea_body_num_unique_words',
    'fea_body_words_vs_unique',
    'fea_body_word_unique_percent',
    'gulpease',
    'reliability',
]
content_features = df[feature_columns]

# Persist the selected columns as a pickled DataFrame next to the notebook.
content_features.to_pickle('./content-features_pandas.pkl')