In [None]:
# Important libraries
import pandas as pd
import seaborn as sns
import nltk
from nltk.tokenize import WhitespaceTokenizer as w_tokenizer
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer; the statement, context and subject cells
# below all call stemmer.stem on their tokens.
stemmer=SnowballStemmer("english")

In [None]:
# Load the LIAR training split. The TSV has no header row (header=None), so the
# column names are assigned by position on the next statement.
# NOTE(review): statements may contain literal double-quote characters; confirm
# that pandas' default quoting does not merge rows (quoting=csv.QUOTE_NONE may
# be required).
df = pd.read_table("/kaggle/input/LIARbyYang/train.tsv", header = None)
# Columns 9-13 hold the speaker's credit-history counts. The dataset README
# lists them in this order: barely true, false, half true, mostly true, pants on
# fire. The pants-on-fire count is therefore the LAST of the five, not the first.
df.columns = ["ID", "label", "statement", "subject", "speaker", "speaker_job", "state", "party", "barelytrue_count", "false_count", "halftrue_count", "mostlytrue_count", "pof_count", "context"]

In [None]:
# Preview the first ten rows of the freshly loaded frame.
df.head(10)


In [None]:
# (rows, columns) of the loaded frame.
df.shape

# Merge label

In [None]:
# Collapse the six truthfulness ratings into a binary target:
# 'mostly-true' and 'true' become 1, every other rating becomes 0.
merger = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 0,
    'mostly-true': 1,
    'true': 1,
}
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the df["label"] selection. That chained in-place
# call triggers a FutureWarning in pandas 2.x and, with Copy-on-Write enabled
# (the pandas 3 default), operates on a temporary object and leaves df unchanged.
df["label"] = df["label"].replace(merger)
df.head(10)

# Clean data

In [None]:
# Strip periods and commas and normalise typographic quotes in the two free-text
# columns. Each entry is (regex pattern, replacement); they are applied in order.
punctuation_fixes = [
    (r'\.', ''),   # periods (raw string: '\.' is an invalid escape in a plain literal)
    (',', ''),     # commas
    ('’', '\''),   # right single quote -> apostrophe
    ('‘', '\''),   # left single quote  -> apostrophe
    ('“', '"'),    # left double quote  -> straight double quote
    ('”', '"'),    # right double quote -> straight double quote
    ('``', '"'),   # double backtick    -> straight double quote
]

for column in ('statement', 'context'):
    for pattern, replacement in punctuation_fixes:
        # Assign back rather than using replace(..., inplace=True) on the
        # df[column] selection: the chained in-place form warns in pandas 2.x
        # and does not modify df under Copy-on-Write.
        df[column] = df[column].replace(pattern, replacement, regex=True)

In [None]:
# Preview the first ten rows again to eyeball the label and text columns.
df.head(10)

# Bag of Words Process on Statements

In [None]:
# Tokenize every statement with nltk.word_tokenize; each cell of the new column
# holds the list of tokens for that row's statement.
df["statement_token"] = df["statement"].apply(nltk.word_tokenize) # tokenize statement

In [None]:
# Show the token lists of the first five statements.
df['statement_token'].head(5)

In [None]:
# Replace each statement token with its Snowball stem, keeping one list per row.
df['stemmed_statement_token'] = df['statement_token'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

Inspect the stemmed tokens (they are vectorized further below):

In [None]:
# Display the stemmed token lists for the statement column.
df['stemmed_statement_token']

See the most common (and potentially useless) words for the dataset

In [None]:
# bag of words for the whole dataset
def countwords(x):
    """Count how often each token occurs across a collection of token lists.

    Parameters
    ----------
    x : iterable of iterables
        One inner iterable of hashable tokens per document, e.g. a pandas
        Series whose values are lists of strings.

    Returns
    -------
    list of (token, count) tuples
        Sorted by descending count. Tokens with equal counts keep the order in
        which they were first encountered.
    """
    # Local imports keep the function usable even if its cell is run on its own.
    from collections import Counter
    from itertools import chain

    # Flatten all documents into one token stream and tally it in a single pass.
    word2count = Counter(chain.from_iterable(x))
    # most_common() with no argument returns every (token, count) pair ordered
    # from most to least frequent, using a stable sort.
    return word2count.most_common()
            
# Frequency table of stemmed statement tokens, most frequent first.
words_in_statements = countwords(df["stemmed_statement_token"])




Check for redundant words:

In [None]:
# The twenty most frequent stemmed statement tokens with their counts.
words_in_statements[0:20]

Apply CountVectorizer, ignoring the most frequent (redundant) words and words that appear in fewer than two statements.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# The column already holds token lists, so the analyzer is an identity function
# that hands each list to the vectorizer unchanged.
# max_df / min_df are integers here, i.e. absolute document counts: terms found
# in more than 1400 statements or in fewer than 2 statements are discarded.
vectorizer = CountVectorizer(
    analyzer=lambda tokens: tokens,
    max_df=1400,
    min_df=2,
)
X = vectorizer.fit_transform(df['stemmed_statement_token'])

# Densify the term-count matrix into a frame with one column per retained term.
new_features = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# Prefix the term columns so their origin (the statement text) stays visible.
new_features.columns = 'statement_' + new_features.columns


Merge new features

In [None]:
# Append the statement term-count columns to the original frame, side by side.
# NOTE(review): the feature columns are named 'statement_' + stem, so if the
# vocabulary contains the stem 'token' the generated column 'statement_token'
# would duplicate the existing token-list column of that name -- confirm.
df2 = pd.concat([df,new_features],axis=1)

Do the same for context. There are 102 null values in this column, so drop those rows first.

In [None]:
# Number of rows whose context is missing.
sum(df2['context'].isnull())

In [None]:
# Discard every row without a context and renumber the remaining rows from 0,
# so the frame lines up with the positional feature frames built afterwards.
df2 = df2.dropna(subset=['context'], ignore_index=True)

In [None]:
# Tokenize each context string, then reduce every token to its Snowball stem.
df2["context_token"] = df2["context"].apply(nltk.word_tokenize)
df2['stemmed_context_token'] = df2['context_token'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

In [None]:
# (rows, columns) after dropping null contexts and adding the token columns.
df2.shape

Check for redundant words:

In [None]:
# Frequency table of stemmed context tokens; print its twenty leading entries.
words_in_context = countwords(df2["stemmed_context_token"])
print(words_in_context[:20])

In [None]:
# Number of distinct stemmed tokens found in the context column.
len(words_in_context)

Apply vectorizer:

In [None]:
# Identity analyzer: the column already contains token lists, so each list is
# passed to the vectorizer as-is.
# Integer thresholds are absolute document counts: terms found in more than
# 1800 contexts or in fewer than 4 contexts are discarded.
cv = CountVectorizer(
    analyzer=lambda tokens: tokens,
    max_df=1800,
    min_df=4,
)
X2 = cv.fit_transform(df2['stemmed_context_token'])

# Densify the term-count matrix into a frame with one column per retained term.
new_features_context = pd.DataFrame(
    X2.toarray(),
    columns=cv.get_feature_names_out(),
)
# Prefix the term columns so their origin (the context text) stays visible.
new_features_context.columns = 'context_' + new_features_context.columns


In [None]:
# Display the context term-count frame.
new_features_context

In [None]:
# Append the context term-count columns to df2, side by side.
df3 = pd.concat([df2,new_features_context],axis=1)

Drop original columns:

In [None]:
# The raw text and intermediate token columns are no longer needed once their
# term counts have been added as features.
df3 = df3.drop(
    columns=[
        'statement',
        'statement_token',
        'stemmed_statement_token',
        'context',
        'context_token',
        'stemmed_context_token',
    ]
)

Do the same for subject. Now, the tokens are separated by commas in this case. Thus, change them to spaces before tokenizing them.

In [None]:
# Subjects are stored as comma-separated tags; turn the commas into spaces so
# that the tokenizer yields one token per tag word.
# Assign the result back instead of calling replace(..., inplace=True) on the
# df3['subject'] selection: that chained in-place call warns in pandas 2.x and,
# with Copy-on-Write enabled, leaves df3 unchanged.
df3['subject'] = df3['subject'].replace(',', ' ', regex=True)
# Tokenize the subject tags.
df3["subject_token"] = df3["subject"].apply(nltk.word_tokenize)
# Stem every subject token.
df3['stemmed_subject_token'] = df3['subject_token'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

Check for redundant words:

In [None]:
# Frequency table of stemmed subject tokens; print its twenty leading entries.
words_in_subject = countwords(df3["stemmed_subject_token"])
print(words_in_subject[:20])

Apply CountVectorizer:

In [None]:
# Identity analyzer: the column already contains token lists, so each list is
# passed to the vectorizer as-is.
# max_df=1.0 and min_df=1 apply no frequency filtering, so every subject token
# is kept as a feature.
cvv = CountVectorizer(
    analyzer=lambda tokens: tokens,
    max_df=1.0,
    min_df=1,
)
X3 = cvv.fit_transform(df3['stemmed_subject_token'])

# Densify the term-count matrix into a frame with one column per subject token.
new_features_subject = pd.DataFrame(
    X3.toarray(),
    columns=cvv.get_feature_names_out(),
)
# Prefix the term columns so their origin (the subject tags) stays visible.
new_features_subject.columns = 'subject_' + new_features_subject.columns


In [None]:
# Display the subject term-count frame.
new_features_subject

Get the final DataFrame:

In [None]:
# Final feature frame: df3 plus the subject term counts, without the raw subject
# text and its intermediate token columns.
df4 = pd.concat([df3, new_features_subject], axis=1).drop(
    columns=['subject', 'subject_token', 'stemmed_subject_token']
)

In [None]:
# (rows, columns) of the final feature frame.
df4.shape

# Test

In [None]:
# Number of distinct values in the subject column of df.
df.subject.unique().shape

In [None]:
# Count of rows for each subject value in df3, with bars split by label.
sns.catplot(data=df3, x='subject', kind="count", palette="ch:.25",hue="label")

In [None]:
from copy import deepcopy

# Flag rows whose subject value occurs fewer than 60 times in df.
subject_is_rare = df.groupby('subject')['subject'].transform('size').lt(60)

# Work on an independent copy so df itself stays untouched.
df_test = deepcopy(df)
# Relabel the rare subjects as 'Others', then remove those rows entirely.
df_test['subject'] = df['subject'].mask(subject_is_rare, 'Others')
others_index = df_test[df_test['subject'] == 'Others'].index
df_test.drop(others_index, inplace=True)

In [None]:
# Row count for each subject value that remains in df_test.
df_test['subject'].value_counts()

In [None]:
# Count of rows for each remaining subject value in df_test, split by label.
sns.catplot(data=df_test, x='subject', kind="count", palette="ch:.25",hue="label")

Show unique values for "context"

Show all rows that contain: