## Setting up

In [30]:
import pandas as pd
import numpy as np
import re
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#text preprocessing
import nltk
from nltk.tokenize import word_tokenize

from textblob import TextBlob
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger

# Enable multiple outputs per cell: with ast_node_interactivity set to "all",
# IPython displays the result of every expression statement in a cell rather
# than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [31]:
# Load the cleaned train / test / validation splits (comma-separated, UTF-8).
df_train, df_test, df_valid = [
    pd.read_csv(csv_path, sep=',', encoding='utf-8')
    for csv_path in (
        './datasets/train_cleaned.csv',
        './datasets/test_cleaned.csv',
        './datasets/valid_cleaned.csv',
    )
]

## defining functions for feature extraction

### punctuation and symbols

In [32]:
def count_symbol(statement, symbol):
    """Return how many characters of ``statement`` are taken up by ``symbol``.

    For a one-character ``symbol`` this is the number of times it occurs.
    For a longer ``symbol`` it is the number of non-overlapping occurrences
    multiplied by the symbol's length.
    """
    occurrences = statement.count(symbol)
    return occurrences * len(symbol)

### text processing

In [33]:
def count_char(str):
    """Return the number of characters in ``str``, not counting spaces.

    Only the literal space character is excluded; tabs and newlines are
    still counted.
    """
    return len(str) - str.count(" ")

In [34]:
def count_word(str):
    """Return the number of whitespace-separated words in ``str``."""
    return sum(1 for _ in str.split())

In [35]:
def count_sent(str):
    """Return the number of sentences in ``str``, using '.' as the delimiter.

    The text is split on periods and only the pieces that contain something
    other than whitespace are counted. This avoids the off-by-one of counting
    raw split pieces, where a trailing period ("Hello.") or an empty string
    would each contribute a phantom sentence.

    Input: str - the text to analyse.
    Return: int - number of non-blank period-delimited segments.
    """
    return sum(1 for segment in str.split('.') if segment.strip())

In [36]:
def count_char_per_word(str):
    """Return the average number of characters per word in ``str``.

    Words are the whitespace-separated tokens of the input.

    Input: str - the text to analyse.
    Return: float - mean word length, or 0.0 when the input has no words.
    """
    words = str.split()
    # An empty or whitespace-only statement has no words; return 0.0 instead
    # of dividing by zero.
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [37]:
def count_unique(str):
    """Return the number of words that occur exactly once in ``str``.

    Input: str - the text to analyse.
    Return: int - count of words whose frequency in the input is 1.
    """
    # Split on any run of whitespace. Splitting on a single ' ' would yield
    # empty-string tokens for consecutive spaces (and for an empty input),
    # and those would be counted as words.
    frequencies = Counter(str.split())
    return sum(1 for occurrences in frequencies.values() if occurrences == 1)

In [38]:
def count_uppercase(str):
    """Return how many characters of ``str`` are uppercase letters."""
    capitals = [ch for ch in str if ch.isupper()]
    return len(capitals)

In [39]:
def count_month(str):
    """Return how many words of ``str`` are capitalised English month names.

    Matching is exact and case-sensitive on whitespace-separated tokens, so
    "may" or "May," do not count while "May" does.
    """
    months = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    return sum(1 for token in str.split() if token in months)

In [40]:
def sentiment_score(str):
    """Return the TextBlob sentiment polarity of ``str``.

    Only the polarity component of the TextBlob sentiment is returned.
    """
    return TextBlob(str).sentiment.polarity

In [41]:
# Removing punctuation
import string
def remove_symbol(str):
    """Return ``str`` with every character that is neither a word character
    nor whitespace removed.

    Underscores survive because the regex class ``\\w`` includes them.
    """
    return re.sub(r'[^\w\s]', '', str)

### generating feature values

In [42]:
# Count each punctuation symbol in every statement, producing one
# 'num_<symbol>' column per symbol on each of the three splits.
symbol_list = list('-?!%;:"($,.')
for symbol in symbol_list:
    column = 'num_' + symbol
    for frame in (df_train, df_test, df_valid):
        frame[column] = frame.statement.apply(lambda x: count_symbol(x, symbol))
    

In [43]:
# remove symbols to prepare to text processing
df_train.statement = df_train.statement.apply(lambda x: remove_symbol(x))
df_test.statement = df_test.statement.apply(lambda x: remove_symbol(x))
df_valid.statement = df_valid.statement.apply(lambda x: remove_symbol(x))

In [44]:
# text processing that takes in statements
feature_func = [count_char, count_word, count_sent, count_char_per_word
                , count_unique, count_uppercase, count_month
                , sentiment_score]
for func in feature_func:
    df_train[func.__name__] = df_train.statement.apply(lambda x: func(x))
    df_test[func.__name__] = df_test.statement.apply(lambda x: func(x))
    df_valid[func.__name__] = df_valid.statement.apply(lambda x: func(x))

### LIWC

In [45]:
def check_common(str, list):
    """Return how many whitespace-separated words of ``str`` are in ``list``.

    Membership is tested with ``in``, so repeated words are counted each time
    they appear.
    """
    return sum(1 for token in str.split() if token in list)

In [46]:
def tagging_univ(str):
    """Tokenize ``str`` with NLTK and return its part-of-speech tags,
    requested with ``tagset='universal'``.
    """
    return nltk.pos_tag(nltk.word_tokenize(str), tagset = 'universal')

In [47]:
def tagging_nuniv(str):
    """Tokenize ``str`` with NLTK and return its part-of-speech tags using
    ``nltk.pos_tag``'s default tagset.
    """
    return nltk.pos_tag(nltk.word_tokenize(str))

In [48]:
# Column names applied to the LIWC dictionary CSV, in file order.
liwc_headers = [
    'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe',
    'they', 'ipron', 'article', 'prep', 'auxverb', 'adverb',
    'conj', 'negate', 'verb', 'adj', 'compare', 'interrog',
    'number', 'quant', 'affect', 'posemo', 'negemo', 'anx', 'anger',
    'sad', 'social', 'family', 'friend', 'female', 'male', 'cogproc',
    'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ',
    'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual',
    'ingest', 'drives', 'affiliation', 'achieve', 'power', 'reward',
    'risk', 'focuspast', 'focuspresent', 'focusfuture', 'relativ',
    'motion', 'space', 'time', 'work', 'leisure', 'home', 'money',
    'relig', 'death', 'informal', 'swear', 'netspeak', 'assent',
    'nonflu', 'filler',
]

# Read the LIWC dictionary with liwc_headers as its column names, then drop
# incomplete rows.
# NOTE(review): dropna() with default arguments discards every row that has a
# missing value in any column - confirm this is intended if the category
# columns have different lengths.
liwc_dict = pd.read_csv(
    './datasets/LIWC_dict/LIWC_dictionary.csv',
    delimiter = ',',
    names = liwc_headers,
    encoding = 'utf-8-sig',
).dropna()

In [49]:
# For each LIWC category, count how many words of every statement appear in
# that category's word list, stored as a 'count_<category>' column.
for header in liwc_headers:
    # Look up the category's words in liwc_dict. Passing the bare header
    # string to check_common would make its `in` test a substring check
    # against the category name instead of a lookup in the word list.
    # A set keeps each membership test cheap.
    category_words = set(liwc_dict[header].dropna())
    # NOTE(review): matching is exact and case-sensitive - confirm whether
    # statements should be lowercased before counting.
    for frame in (df_train, df_test, df_valid):
        frame['count_'+header] = frame.statement.apply(
            lambda x: check_common(x, category_words))
    

# Bag of Words
# Fitted on the training split: this notebook defines df_train / df_test /
# df_valid, and no bare `df`, so referencing `df` raises NameError.
cv = CountVectorizer(stop_words='english')
text_cv = cv.fit_transform(df_train.statement.values.astype('str'))

# tf-idf weighting of the same training statements (norm=None leaves the
# vectors unnormalised); converted to a dense array
tfidf = TfidfVectorizer(norm=None)
text_tfidf = tfidf.fit_transform(
    df_train.statement.values.astype('str')).toarray()

In [50]:
# Write each split, now carrying the extracted feature columns, to its own
# CSV file without the index.
for frame, csv_path in (
    (df_train, './datasets/train_extracted.csv'),
    (df_test, './datasets/test_extracted.csv'),
    (df_valid, './datasets/valid_extracted.csv'),
):
    frame.to_csv(csv_path, sep=',', encoding='utf-8', index=False)