## Setting up

In [16]:
import pandas as pd
import numpy as np
import re
import pickle
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#text preprocessing
import nltk
from nltk.tokenize import word_tokenize

from statistics import mean
import math

#from textblob import TextBlob
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger

# enable multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [76]:
def setup(fileName):
    df = pd.read_csv(f'./datasets/data_cleaned_{fileName}.csv'
                     , sep=',', encoding='utf-8')

    ref = pickle.load(open('./news_data/reference.csv', 'rb'))

    ref_df = pd.DataFrame()

    statement_list = []
    statement_sub = []

    #print(ref['coronavirus'].head(5))
    #print(ref['coronavirus'].iloc[:])

    topics = ref.keys()

    for sub in topics:
        a = ref[sub].values.tolist()
        for statements in a:
            for s in statements:
                statement_list.append(s)
        for rows in range(len(ref[sub])):
            statement_sub.append(sub)

    #print(type(statement_list[1]))
    #print(len(statement_sub))

    ref_df['statement'] = statement_list
    ref_df['subject'] = statement_sub
    ref_df = ref_df.dropna(how = 'any')
    a = np.ones(len(ref_df))
    ref_df['label'] = a
    #print(type(ref_df))
    return df, ref_df

## defining functions for feature extraction

### punctuation and symbols

In [18]:
def count_symbol(statement, symbol):
    return len(statement) - len(statement.replace(symbol, ''))

### text processing

In [19]:
# Input: String('str')
# Description: Count the number of characters in input
# Return: Character count - int ('count')

def count_char(str):
    no_space = str.replace(" ", "")
    count = len(no_space)
    return count

In [20]:
# Input: String('str')
# Description: Count the number of words in input
# Return: Word count - int ('count')

def count_word(str):
    count = len(str.split())
    return count

In [21]:
# Input: String ('str')
# Description: Count the number of sentences by counting number of period(.)
# Return: Sentence count - int ('sentence')

def count_sent(str):
    sentence = len(str.split('.'))
    return sentence

In [59]:
# Input: String ('str')
# Description: Count the number of characters in each word in input and average the number of characters per word
# Return: Average number of characters: float ('avg')

def count_char_per_word(str):
    avg = 0
    word = []
    word.append(str.split())
    char_per_word = list()
    for elements in word:
        for char in elements:
            c_in_w_count = len(char)
            char_per_word.append(c_in_w_count)
    try:
        # char_per_word_list.append()
        avg = sum(char_per_word) / len(char_per_word)
        char_per_word.clear()
    except:
        print(str)
        
    return avg

In [23]:
# Input: String ('str')
# Description: Count the words that introduced only once in input
# Return: Count of unique words - int ('unique_count')

def count_unique(str):
    words = str.split(' ')
    c = Counter(words)
    unique = [w for w in words if c[w] == 1]
    unique_counter = len(unique)
    return unique_counter

In [24]:
# Input: String ('str')
# Description: Count the number of uppercase letters
# Return: Count of uppercase letters - int ('uppercase_count')

def count_uppercase(str):
    uppercase_count = sum(1 for c in str if c.isupper())
    return uppercase_count

In [25]:
# Input: String ('str')
# Description: Count the number of month name mentioned
# Return: Count of month name - int ('month_count')

def count_month(str):
    month_count = 0
    month_list = ["January", "February", "March", "April", "May", "June"
                  , "July", "August", "September", "October", "November"
                  , "December"];
    words = str.split()
    for word in words:
        if word in month_list:
            month_count+=1
        
    return month_count

In [26]:
# sentiment score calculation
# sentiment score, polarity, subjectivity and intensity can be calculated
def sentiment_score(str):
    sentiment = TextBlob(str)
    return sentiment.sentiment.polarity

In [27]:
# Removing punctuation
import string
def remove_symbol(str):
    no_symbol = re.sub(r'[^\w\s]','',str)
    return no_symbol

### generating feature values

In [74]:
def text_feature(df):
    # counting symbols
    symbol_list = list('-?!%;:"($,.')
    for symbol in symbol_list:
        df['num_'+symbol] = df.statement.apply(lambda x: count_symbol(x, symbol))
        # remove symbols to prepare to text processing
        df_raw = df.copy() # make a copy before transforming just in case
        df.statement = df.statement.apply(lambda x: remove_symbol(x))
        
        #ref_df['num_' + symbol] = df.statement.apply(lambda x: count_symbol(x, symbol))
        #ref_df.statement = ref_df.statement.apply(lambda x: remove_symbol(x))

    # text processing that takes in statements
    feature_func = [count_char, count_word, count_sent, count_char_per_word
                , count_unique, count_uppercase, count_month
                , sentiment_score]

    for func in feature_func:
        df[func.__name__] = df.statement.apply(lambda x: func(x))
        #ref_df[func.__name__] = ref_df.statement.apply(lambda x: func(x))

    return df

### LIWC

In [29]:
def check_common(str, list):
    count = 0;
    words = str.split()
    for word in words:
        if word in list:
            count+=1
    return count

In [30]:
def tagging_univ(str):
    text = nltk.word_tokenize(str)
    tagged = nltk.pos_tag(text, tagset = 'universal')
    return tagged

In [31]:
def tagging_nuniv(str):
    text = nltk.word_tokenize(str)
    tagged = nltk.pos_tag(text)
    return tagged

In [73]:
def feature_extraction(df, fileName):
    liwc_headers = ['function','pronoun','ppron','i','we','you','shehe'
              ,'they','ipron','article','prep','auxverb','adverb'
              ,'conj','negate','verb','adj','compare','interrog'
              ,'number','quant','affect','posemo','negemo','anx','anger'
              ,'sad','social','family','friend','female','male','cogproc'
              ,'insight','cause','discrep','tentat','certain','differ'
              ,'percept','see','hear','feel','bio','body','health','sexual'
              ,'ingest','drives','affiliation','achieve','power','reward'
              ,'risk','focuspast','focuspresent','focusfuture','relativ'
              ,'motion','space','time','work','leisure','home','money'
              ,'relig','death','informal','swear','netspeak','assent'
              ,'nonflu','filler']

    liwc_dict = pd.read_csv('./datasets/LIWC_dict/LIWC_dictionary.csv'
                      , delimiter = ',', names = liwc_headers
                      , encoding = 'utf-8-sig')
    liwc_dict = liwc_dict.dropna()


    for header in liwc_headers:
        df['count_'+header] = df.statement.apply(
            lambda x: check_common(x, header))

        #ref_df['count_'+header] = ref_df.statement.apply(
            #lambda x: check_common(x, header))

    df.to_csv(f'./datasets/feature_extracted_{fileName}.csv', sep=',', encoding='utf-8', index=False)

    #ref_df.to_csv(f'./datasets/ref_feature_extracted_{fileName}.csv', sep=',', encoding='utf-8', index=False)


In [25]:
# Bag of Words
#cv = CountVectorizer(stop_words='english')
#text_cv = cv.fit_transform(df.statement.values.astype('str'))

In [26]:
# tf-idf, better than bag of words
#tfidf = TfidfVectorizer(norm=None)
#text_tfidf = tfidf.fit_transform(df.statement).toarray()

In [78]:
train_df, ref_df_train = setup('train')
test_df, ref_df_test = setup('test')
valid_df, ref_df_valid = setup('valid')

train_df = text_feature(train_df)
test_df = text_feature(test_df)
valid_df = text_feature(valid_df)

feature_extraction(train_df, 'train')
feature_extraction(test_df, 'test')
feature_extraction(valid_df, 'valid')


# Multisource features - not done yet, no need to run after this part for now

In [28]:
# Subtract each feature values for each data in the same row (one-to-one). List contains
# the extracted feature values related to texts. Ex. feature_list[0] = all feature values of first statement
# In case of there is null in the reference data, use two next reference data

def one_to_one_dif(text, ref, subject_list):
    feature_list = []
    feature = 0
    for i in range (len(text)):
        topics = subject_list.iloc[i]
        topics = topics.split(',')

        try:
            feature = (float(text[i]) - float(ref[topic[0]][i]))
        except:
            try:
                feature = (float(text[i]) - float(ref[i+2]))
            except:
                pass

        feature_list.append(round(feature, 2))
    return feature_list
    
# Subtract each feature values for each data in the same row and repeat it with different references.(one-to-many)
# Then, average the difference. List contains the extracted feature values related to texts.
# Ex. final_list[0] = all feature values of first statement
# In case of there is null in the reference data, use two next reference data

def one_to_many_dif(text, ref, subject_list):
    avg_list = []
    final_list = []
    reference = pd.DataFrame()
    for i in range(len(text)):
        topics = subject_list.iloc[i]
        topics = topics.split(',')
        for topic in topics:
            reference = pd.concat([reference, ref[topic]], ignoreIndex = True)
        for j in range(len(ref)):
            try:
                feature_val = float(text[i]) - float(ref[j])
            except:
                try:
                    feature_val = float(text[i]) - float(ref[j+2])
                except:
                    pass
            avg_list.append(feature_val)
        avg = round(mean(avg_list), 2)
        final_list.append(avg)
    return final_list
    
    
# Input: news1 (String), news2 (String), outFile (String), startFColumn (integer)
# Description: Read the data from news1 (reliable) and news2 (unreliable), and put the data in two lists. Then, subtract the news1 values in each column from news2 values.
#              The subtraction starts from startFColumn until the end of the .csv file. Next, the results will be saved in the file with given name or path from user (outFile)
#
# Return: New DataFrame ('MS_features')

def multi_source_FE(text, ref):

    # Saving original headers in header
    header = list(text)
    
    subjects = set(ref.subject)
    topic = dict()
    
    for subject in subjects:
        topic[subject] = ref.loc[ref['subject'] == subject]
        
    # Only feature values
    cols = [col for col in text.columns if col not in ['statement', 'label', 'subject']]
    
    state_text = text['statement']
    state_ref = ref['statement']
    label_text = text['label']
    subject_text = text['subject']
    data_text = text[cols]
    data_ref = ref[cols]

    feature_num = len(data_text.keys())


    # Create local lists, dataframe that are used later in this definition.
    df = pd.DataFrame()
    sub = pd.DataFrame()
    avg_sub = pd.DataFrame()
    text = []
    ref = []
    total_list = []
    new_total_list = []
    
    
    # Loop through the data for subtraction
    for i in data_text.keys():
        sub[i + "- sub"] = one_to_one_dif(data_text[i], data_ref[i], subject_text)
        avg_sub[i + "- avg sub"] = one_to_many_dif(data_text[i], data_ref[i], subject_text)
    
    # Creating dataframe with multi-sourced features
    df['statement'] = state_text
    df['label'] = label_text
    df['subject'] = subject_text
    
    # Concatenate the text, reference and features, then drop null values
    df1 = pd.concat([df, data_text, sub, avg_sub], axis = 1)
    df1 = df1.dropna(how = 'any')
    
    return df1



In [29]:
multi_source_FE(df, ref_df)

KeyError: '[\'count_unique\', \'sentiment_score\', \'count_uppercase\', \'num_"\', \'num_;\', \'count_sent\', \'count_word\', \'count_char_per_word\', \'num_(\', \'num_$\', \'num_?\', \'num_-\', \'count_month\', \'num_,\', \'num_:\', \'num_%\', \'count_char\', \'num_.\', \'num_!\'] not in index'