In [1]:
# This script extracts socio-linguistic features such as Part-of-Speech (POS), Named-Entity Recognition (NER), Empath and LIWC
# Extracted features are dumped into a pickle file 'Data_aug.pkl'

import pickle
import pandas as pd
import numpy as np
from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from nltk import word_tokenize
from empath import Empath

In [2]:
# Raw dataset: read without a header row, so the column names are supplied explicitly.
colnames = ['Snippet', 'Target']
df = pd.read_csv(
    'dataset.csv',
    header=None,
    names=colnames,
)
# Pre-processed dataset, decoded with the cp1252 codec.
ppd = pd.read_csv('pre_processed_dataset.csv', encoding='cp1252')

In [3]:
# Shared encoders used by the feature-extraction cells:
#   lb  - turns string tags into integer codes
#   ohe - turns a column of integer codes into one-hot rows
# Both are re-fitted every time a cell calls fit_transform on them, so they
# only remember the most recent fit.
lb = LabelEncoder()
ohe = OneHotEncoder()

In [4]:
# Using Stanford NER Tagger API
# Relative locations of the 3-class English CRF classifier and the NER jar.
model_n = 'stanford-ner-4.2.0/stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz'
jar_n = 'stanford-ner-4.2.0/stanford-ner-2020-11-17/stanford-ner-4.2.0.jar'
ner_tagger = StanfordNERTagger(
    model_filename=model_n,
    path_to_jar=jar_n,
    encoding='utf8',
)

In [5]:
# Using Stanford POS Tagger API
# Relative locations of the English left3words tagger model and the POS tagger jar.
model = 'stanford-tagger-4.2.0/stanford-postagger-full-2020-11-17/models/english-left3words-distsim.tagger'
jar = 'stanford-tagger-4.2.0/stanford-postagger-full-2020-11-17/stanford-postagger-4.2.0.jar'
pos_tagger = StanfordPOSTagger(
    model_filename=model,
    path_to_jar=jar,
    encoding='utf8',
)

In [6]:
# Extracting POS Features
# Tokenize and tag each snippet in turn, pooling every tagged token from every
# snippet into a single flat list (snippet boundaries are not kept).
# NOTE(review): tag() is called once per snippet; nltk also provides tag_sents()
# for batching - worth considering if this cell is slow.
POS_snippets = [
    tagged_token
    for row in range(len(df['Snippet']))
    for tagged_token in pos_tagger.tag(word_tokenize(df['Snippet'][row]))
]
# Keep element 1 of each tagged token, integer-encode those values, then
# one-hot encode the integer codes: the result has one row per tagged token.
POS_snippets_type = lb.fit_transform([tagged_token[1] for tagged_token in POS_snippets])
pos_codes_column = np.reshape(POS_snippets_type, (-1, 1))
pos_vec = ohe.fit_transform(pos_codes_column).todense()

In [7]:
# Extracting NER Features
# Tokenize and tag each snippet in turn, pooling every tagged token from every
# snippet into a single flat list (snippet boundaries are not kept).
# NOTE(review): tag() is called once per snippet; nltk also provides tag_sents()
# for batching - worth considering if this cell is slow.
ner_snippets = [
    tagged_token
    for row in range(len(df['Snippet']))
    for tagged_token in ner_tagger.tag(word_tokenize(df['Snippet'][row]))
]
# Keep element 1 of each tagged token, integer-encode those values, then
# one-hot encode the integer codes: the result has one row per tagged token.
# Re-fitting lb and ohe here replaces whatever they were fitted on before.
ner_snippets_type = lb.fit_transform([tagged_token[1] for tagged_token in ner_snippets])
ner_codes_column = np.reshape(ner_snippets_type, (-1, 1))
ner_vec = ohe.fit_transform(ner_codes_column).todense()

In [8]:
# Extracting Empath Features
# Builds one row of Empath category scores per entry of ppd['Candidate_words'].
lexicon = Empath()
empath_vec = []
for text in ppd['Candidate_words']:
    # Category scores divided by the number of tokens in `text`.
    scores = lexicon.analyze(text, normalize=True)
    if scores is None:
        # With normalize=True, Empath returns None instead of a dict when the
        # text yields no tokens; calling .values() on it would abort the cell.
        # The un-normalised counts for such text are all 0.0 over the same
        # categories, so use them to keep exactly one row per input entry.
        scores = lexicon.analyze(text, normalize=False)
    empath_vec.append(list(scores.values()))

In [9]:
# Dumping extracted features in a pickle file
# NOTE(review): pos_vec and ner_vec hold one row per tagged token, whereas
# empath_vec holds one row per 'Candidate_words' entry. zip() pairs rows by
# position and stops at the shortest input - confirm this alignment is intended.
#
# zip() is materialised into a list so the pickle stores concrete tuples rather
# than a lazy, single-use iterator (which is what zip() returns on Python 3).
# The context manager guarantees the file is flushed and closed, even if
# pickling raises.
with open(b"Data_aug.pkl", "wb") as f:
    pickle.dump(list(zip(pos_vec, ner_vec, empath_vec)), f, protocol=2)