In [50]:
import numpy as np
import pandas as pd
import sklearn
import os 
import re
from sklearn import preprocessing 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score
from sklearn.metrics import confusion_matrix
import nltk
from nltk import tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/embibe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
os.chdir("/home/embibe/Personal/ML/NUS/LIAR-PLUS-master")

Preparing a dictionary with word as key and its emotion as the value using the **NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt**

In [52]:
filepath = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
emolex_df = pd.read_csv(filepath,  names=["word", "emotion", "association"], skiprows=45, sep='\t')
emolex_df=emolex_df.loc[emolex_df['association']==1]
emolex_dict=emolex_df[['word','emotion']].set_index('word').T.to_dict('list')

  after removing the cwd from sys.path.


List of forbidden words has been made from the forbidden.txt file provided.

In [53]:
forbidden_words=[]
with open("forbidden_words.txt","r") as f:
    for line in f:
        forbidden_words.append(line.replace('\n',''))

The given datasets have been loaded for preprocessing.

In [54]:
headers=['id','label_multiclass','statement','subject','speaker','speaker_job','state',
        'party','barely_true_counts','false_counts','half_true_counts',
        'mostly_true_counts','pants_on_fire_counts','venue','justification']

train=pd.read_csv("train2.tsv",names=headers,sep='\t')
val=pd.read_csv("val2.tsv",names=headers,sep='\t')
test=pd.read_csv("test2.tsv",names=headers,sep='\t')

The preprocess function converts to lowercase and removes any non alphabetical character from the category names. This is done because in the original dataframe same category is named in different ways. E.g. **U.S.President and us-president**

In [74]:
def preprocess(df):
    for column in categorical_columns:
        df[column]=df[column].str.replace('-', " ")
        df[column]=df[column].str.replace('.', "")
        df[column]=df[column].str.lower()
    return df

Clean Function removes the forbidden word containing statements from justifications to remove any bias.

In [57]:
def clean(x):
    sentences=tokenize.sent_tokenize(x)
    for index,sentence in enumerate(sentences):
        if any(word in sentence for word in forbidden_words):
            sentences.pop(index)
    x=' '.join(sentences)
    return x.lower()

In calculate emotion function, the emolex dictionary is used to find the emotion of a statement to be identified as true, barely_true and so on. The words which didn't find an emotion in the dictionary are given a tag "can't say" emotion. For the entire statement, a list of emotions is made corresponding to each word in the statement. The emotion that repeats the most is taken as the emotion for that statement.

In [58]:
def calculate_emotion(news_statement):
    emotion=[]
    news_statement=re.sub('[^A-Za-z0-9\s]+',"",news_statement).split()
    for word in news_statement:
        if word in emolex_dict.keys():
            emotion.append(emolex_dict[word][0])
        else:
            continue
    if(len(emotion))==0:
        return ("cant_say")
    else:
        return max(emotion,key=emotion.count)

**The multiclass labels for a statement are grouped into two i.e. False and True to give each statement a binary label along with the multiclass labels.** 

In [59]:
def convert_to_binary(df):
    df_binary=pd.DataFrame()
    df_binary['label']=df['label_multiclass']
    df_binary.loc[(df_binary['label']=='false') | (df_binary['label']=='pants-fire') | (df_binary['label']=='barely-true')]='false'
    df_binary.loc[(df_binary['label']=='half-true') | (df_binary['label']=='mostly-true') | (df_binary['label']=='true')]='true'
    return df_binary['label']

**The below two functions deal with the categorical columns. Top 98.5 percentile of the categories are taken and the rest of them are replaced with "other". Separate functions are made for train and val/test to ensure same categories in all the dataframes. Only the categories present in the train data are considered in val and test data. Any new category encountered has been replaced with "other".**

In [60]:
def replace_with_other_1(df):
    for x in categorical_columns:
        value = df[x].value_counts()
        df[x] = df[x].replace({x: 'other' for x in value[value < np.percentile(df[x].value_counts().values,98.5)].index})
    return df

def replace_with_other_2(df):
    for x in categorical_columns:
        df[x] = df[x].apply(lambda a: a if a in train_categorical[x].unique().tolist() else 'other')
    return df

In [61]:
train['justification']=train['justification'].apply(lambda x: clean(str(x)))
val['justification']=val['justification'].apply(lambda x: clean(str(x)))
test['justification']=test['justification'].apply(lambda x: clean(str(x)))

Column values with empty justification have been replaced with "unavailable" to avoid NA condition.

In [62]:
train.loc[train['justification']==""]['justification']="unavailable"
val.loc[val['justification']==""]['justification']="unavailable"
test.loc[test['justification']==""]['justification']="unavailable"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
train[train['justification'] == ''].index

Int64Index([1425, 8400, 8493], dtype='int64')

In [64]:
train.drop(['id'],axis=1,inplace=True)
val.drop(['id'],axis=1,inplace=True)
test.drop(['id'],axis=1,inplace=True)

In [65]:
train['emotion']=train['statement'].apply(lambda x: calculate_emotion(x))
val['emotion']=val['statement'].apply(lambda x: calculate_emotion(x))
test['emotion']=test['statement'].apply(lambda x: calculate_emotion(x))

In [66]:
train['label_binary']=convert_to_binary(train)
val['label_binary']=convert_to_binary(val)
test['label_binary']=convert_to_binary(test)

In [67]:
train.shape,val.shape,test.shape

((10240, 16), (1284, 16), (1267, 16))

The differnt columns in the data has been divided into numerical, categorical and text to deal with separately.

In [70]:
numerical_columns = list(train.select_dtypes(include=['float64','int64']).columns)
categorical_columns = list(train.select_dtypes(include=['object']).columns)
text_columns=['statement','justification']
categorical_columns = [column for column in categorical_columns if column not in text_columns]
categorical_columns.remove("label_multiclass")
categorical_columns.remove("label_binary")
label_multiclass=['label_multiclass']
label_binary=['label_binary']

In [75]:
train=preprocess(train)
val=preprocess(val)
test=preprocess(test)

In [76]:
train_categorical = replace_with_other_1(train[categorical_columns])
val_categorical = replace_with_other_2(val[categorical_columns])
test_categorical = replace_with_other_2(test[categorical_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [77]:
train['label_multiclass'].unique(),train['label_binary'].unique()

(array(['false', 'half-true', 'mostly-true', 'true', 'barely-true',
        'pants-fire'], dtype=object), array(['false', 'true'], dtype=object))

In [78]:
categorical_columns

['subject', 'speaker', 'speaker_job', 'state', 'party', 'venue', 'emotion']

Empty categorical columns have been replaced with "unknown" to avoid NA condition.

In [79]:
train_categorical.fillna("unknown",inplace=True)
val_categorical.fillna("unknown",inplace=True)
test_categorical.fillna("unknown",inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


All the categorical columns have been one hot encoded to convert them to 0-1 format for right training. **Didn't do label encoding as all the categories are independent and there is no inter relation.**

In [80]:
train_one_hot=pd.get_dummies(train_categorical, columns=categorical_columns, prefix=categorical_columns)
val_one_hot=pd.get_dummies(val_categorical, columns=categorical_columns, prefix=categorical_columns)
test_one_hot=pd.get_dummies(test_categorical, columns=categorical_columns, prefix=categorical_columns)

In [81]:
set(train_one_hot.columns)-set(test_one_hot.columns),set(train_one_hot.columns)-set(val_one_hot.columns)

({'party_unknown',
  'speaker_bob mcdonnell',
  'speaker_unknown',
  'subject_medicare',
  'subject_unknown',
  'venue_a video ad',
  'venue_an email',
  'venue_press release'},
 {'party_unknown',
  'speaker_rush limbaugh',
  'speaker_unknown',
  'subject_unknown',
  'venue_a newspaper article',
  'venue_a video ad',
  'venue_an article',
  'venue_an email blast'})

Get missing columns in the validation and test dataset w.r.t the train dataset. Add missing columns in test and val set with default value equal to 0. Ensure the order of column in the test and val set is in the same order as train set.

In [82]:
missing_cols = set( train_one_hot.columns ) - set( test_one_hot.columns )
for c in missing_cols:
    test_one_hot[c] = 0

test_one_hot = test_one_hot[train_one_hot.columns]

missing_cols = set( train_one_hot.columns ) - set( val_one_hot.columns )

for c in missing_cols:
    val_one_hot[c] = 0

val_one_hot = val_one_hot[train_one_hot.columns]

The multiclass labels have been label encoded as {0,1,2,3,4,5} to prepare the preprocessed dataset.
The binary labels have been label encoded as {0,1}.

In [83]:
le=preprocessing.LabelEncoder()
train['label_multiclass']=le.fit_transform(train['label_multiclass'])
val['label_multiclass']=le.transform(val['label_multiclass'])
test['label_multiclass']=le.transform(test['label_multiclass'])
train['label_binary']=le.fit_transform(train['label_binary'])
val['label_binary']=le.transform(val['label_binary'])
test['label_binary']=le.transform(test['label_binary'])

All the prepared features are concatenated together to make the final train, validation and test dataset. 

In [84]:
train_final = pd.concat([train[text_columns],train_one_hot,train[numerical_columns],train[label_multiclass],train[label_binary]],axis=1)
val_final = pd.concat([val[text_columns],val_one_hot,val[numerical_columns],val[label_multiclass],val[label_binary]],axis=1)
test_final = pd.concat([test[text_columns],test_one_hot,test[numerical_columns],test[label_multiclass],test[label_binary]],axis=1)

In [85]:
train_final.shape,val_final.shape,test_final.shape

((10240, 207), (1284, 207), (1267, 207))

It is ensured that all the Na values if any left are replaced with 0.

In [86]:
train_final.fillna(0,inplace=True)
val_final.fillna(0,inplace=True)
test_final.fillna(0,inplace=True)

Preprocessed csv's have been made to for direct access while modelling.

In [87]:
train_final.to_csv("train_preprocessed.csv",index=False)
val_final.to_csv("val_preprocessed.csv",index=False)
test_final.to_csv("test_preprocessed.csv",index=False)

In [89]:
!pip freeze > requirements_preprocessing.txt