# Importing needed libraries

In [267]:
import os
import pandas as pd
import nltk
import string
from nltk.stem.porter import *
from nltk.corpus import stopwords

# Read files

In [268]:
path = os.getcwd()
rows = []
columns = ['statement', 'fileName', 'class']
for roots, dirs, files in os.walk(path+ '/bestfriend.deception.training'):
    for file in files:
        #rows.append()
        if file.split()[0].startswith('lie'):
            doc = open(roots+'/'+file, 'r')
            doc_str = doc.read()
            rows.append([doc_str, file, 'lie'])
            doc.close()
        elif file.split()[0].startswith('true'):
            doc = open(roots+'/'+file, 'r')
            doc_str = doc.read()
            rows.append([doc_str, file,'true'])
            doc.close()

In [269]:
training_data = pd.DataFrame(rows, columns = columns)
print(training_data)

                                             statement    fileName class
0    He's a really nice person. He is always willin...    lie7.txt   lie
1    "This person is quite accomplished.  She has d...   lie12.txt   lie
2    My best friend is always there when I need her...  true60.txt  true
3    My best friend is an amazing person. He is nev...    lie8.txt   lie
4    This person is my best friend. Was always ther...   lie11.txt   lie
..                                                 ...         ...   ...
191  Cassandra is a great friend. We have grown los...   lie58.txt   lie
192  I've known my best friend for 38 years. We gre...  true76.txt  true
193  Jean is my best friend.  I have known her for ...  true83.txt  true
194  I met my best friend when I was a freshman on ...   lie72.txt   lie
195  "My best friend is very funny. He's always mak...   lie20.txt   lie

[196 rows x 3 columns]


In [270]:
def trainNaiveBayes(train_data, stop_word_rm = False, stem_rm = False):
    
    
    word_in_class_count = {}
    all_true = []
    all_lie = []
    word_in_class_probability = {}
    
    # remove punctuation, tokenize
    tokenized_list = []
    exclude = set(string.punctuation)
    for index, row in train_data.iterrows():
        punct_removed = ''.join(ch for ch in row['statement'] if ch not in exclude)
        tokenList = nltk.word_tokenize(punct_removed)
        tokenList = [word.lower() for word in tokenList]
        if stem_rm:
            p_stemmer = PorterStemmer()
            tokenList = [p_stemmer.stem(word) for word in tokenList]
            
        if stop_word_rm:
            nltk_stop_words = set(stopwords.words("english"))
            tokenList = [w for w in tokenList if w not in nltk_stop_words]
        tokenized_list.append(tokenList)
    if 'tokenizedList' not in train_data:
        train_data.insert(1, "tokenizedList", tokenized_list, False)
        
    # creat a dictionary in which the keys are our vocabulary and values are the count of the words in each 
    # class
    for index, row in train_data.iterrows():
        for word in row['tokenizedList']:
            if word not in word_in_class_count:
                if row['class'] == 'lie':
                    word_in_class_count[word] = {'lie': 1, 'true': 0}
                elif row['class'] == 'true':
                    word_in_class_count[word] = {'lie': 0, 'true': 1}
            else:
                if row['class'] == 'lie':
                    word_in_class_count[word]['lie'] += 1
                elif row['class'] == 'true':
                    word_in_class_count[word]['true'] += 1
    # number of words in class true
    for index, row in train_data.iterrows():
        for word in row['tokenizedList']:
            if row['class'] == 'lie':
                if word not in all_lie:
                    all_lie.append(word)
            elif row['class'] == 'true':
                if word not in all_true:
                    all_true.append(word)
    
    # word in class probability, add-1 smoothing
    V = len(word_in_class_count)
    for word in word_in_class_count:
        word_in_class_probability[word] = {'lie': (word_in_class_count[word]['lie'] + 1)/(len(all_lie)+V), 
                                          'true': (word_in_class_count[word]['true'] + 1)/(len(all_true)+V)}
    
    # probability of each class
    lie_count = 0
    true_count = 0
    for index, row in train_data.iterrows():
        if row['class'] == 'lie':
            lie_count+=1
        elif row['class'] == 'true':
            true_count+=1
    lie_prob = lie_count / (lie_count + true_count)
    true_prob = true_count / (lie_count + true_count)
    
    return word_in_class_probability, lie_prob, true_prob
    
            
print(len(trainNaiveBayes(training_data.copy())[0]))

2071


In [271]:
def testNaiveBayes(nbClassifier, test_statement, stop_word_rm = False, stem_rm = False):
    exclude = set(string.punctuation)
    punct_removed = ''.join(ch for ch in test_statement if ch not in exclude)
    tokenList = nltk.word_tokenize(punct_removed)
    tokenList = [word.lower() for word in tokenList]
    if stem_rm:
        p_stemmer = PorterStemmer()
        tokenList = [p_stemmer.stem(word) for word in tokenList]

    if stop_word_rm:
        nltk_stop_words = set(stopwords.words("english"))
        tokenList = [w for w in tokenList if w not in nltk_stop_words]
        
    # probability for lie class
    p_lie = 1
    for word in tokenList:
        p_lie *= nbClassifier[word]['lie']
    
    p_true = 1
    for word in tokenList:
        p_true *= nbClassifier[word]['true']
    
    if p_lie > p_true:
        return False
    else:
        return True
    

In [272]:
#len(training_data)
tp = 0
tn = 0
fp = 0
fn = 0
fileName_class = []

for i in range(len(training_data)):
    train = training_data.iloc[0:i]
    train2 = training_data.iloc[i+1:len(training_data)]
    train = pd.concat([train,train2])
    test = training_data.iloc[i]['statement']
    check = training_data.iloc[i]['class']
    fname = training_data.iloc[i]['fileName']
    classifier = trainNaiveBayes(training_data.copy())[0]
    pred_class = testNaiveBayes(classifier, test)
    fileName_class.append((fname, pred_class))
    if pred_class == True and check == 'true':
        tp += 1
    elif pred_class == True and check == 'lie':
        fp += 1
    elif pred_class == False and check == 'lie':
        tn += 1
    elif pred_class == False and check == 'true':
        fn += 1

answers = open(os.getcwd() + '/answers.txt', 'w')
print(fileName_class)
print(f'accuracy: {(tp + tn)/ (tp+fp+fn+tn)}')
answers.write(f'accuracy (stopword_removal = false, stemming = false): {(tp + tn)/ (tp+fp+fn+tn)}\n')
answers.close()

[('lie7.txt', False), ('lie12.txt', False), ('true60.txt', True), ('lie8.txt', False), ('lie11.txt', False), ('true20.txt', True), ('true31.txt', True), ('lie28.txt', False), ('lie15.txt', False), ('lie42.txt', False), ('lie26.txt', False), ('true72.txt', True), ('true33.txt', True), ('true56.txt', True), ('lie81.txt', True), ('lie23.txt', False), ('lie71.txt', True), ('true9.txt', True), ('lie24.txt', False), ('true24.txt', True), ('true85.txt', True), ('lie69.txt', False), ('lie45.txt', False), ('lie95.txt', False), ('lie91.txt', True), ('lie62.txt', True), ('true43.txt', True), ('true10.txt', True), ('true67.txt', True), ('true37.txt', True), ('lie27.txt', False), ('true14.txt', True), ('true52.txt', True), ('true66.txt', True), ('true51.txt', True), ('true79.txt', True), ('true70.txt', True), ('true21.txt', True), ('lie33.txt', False), ('lie89.txt', False), ('true11.txt', True), ('lie48.txt', False), ('lie39.txt', False), ('lie73.txt', False), ('true15.txt', True), ('true74.txt', T

# Write-up guidelines:

## Accuracy when stopwords are removed

In [273]:
#len(training_data)
tp = 0
tn = 0
fp = 0
fn = 0
fileName_class = []

for i in range(len(training_data)):
    train = training_data.iloc[0:i]
    train2 = training_data.iloc[i+1:len(training_data)]
    train = pd.concat([train,train2])
    test = training_data.iloc[i]['statement']
    check = training_data.iloc[i]['class']
    fname = training_data.iloc[i]['fileName']
    classifier = trainNaiveBayes(training_data.copy(), True, False)[0]
    pred_class = testNaiveBayes(classifier, test, True, False)
    fileName_class.append((fname, pred_class))
    if pred_class == True and check == 'true':
        tp += 1
    elif pred_class == True and check == 'lie':
        fp += 1
    elif pred_class == False and check == 'lie':
        tn += 1
    elif pred_class == False and check == 'true':
        fn += 1

answers = open(os.getcwd() + '/answers.txt', 'a')
print(f'accuracy: {(tp + tn)/ (tp+fp+fn+tn)}')
answers.write(f'accuracy (stopword_removal = True, stemming = false): {(tp + tn)/ (tp+fp+fn+tn)}\n')
answers.close()

accuracy: 0.9591836734693877


## Accuracy when stemming is applied

In [274]:
#len(training_data)
tp = 0
tn = 0
fp = 0
fn = 0
fileName_class = []

for i in range(len(training_data)):
    train = training_data.iloc[0:i]
    train2 = training_data.iloc[i+1:len(training_data)]
    train = pd.concat([train,train2])
    test = training_data.iloc[i]['statement']
    check = training_data.iloc[i]['class']
    fname = training_data.iloc[i]['fileName']
    classifier = trainNaiveBayes(training_data.copy(), False, True)[0]
    pred_class = testNaiveBayes(classifier, test, False, True)
    fileName_class.append((fname, pred_class))
    if pred_class == True and check == 'true':
        tp += 1
    elif pred_class == True and check == 'lie':
        fp += 1
    elif pred_class == False and check == 'lie':
        tn += 1
    elif pred_class == False and check == 'true':
        fn += 1

answers = open(os.getcwd() + '/answers.txt', 'a')
print(f'accuracy: {(tp + tn)/ (tp+fp+fn+tn)}')
answers.write(f'accuracy (stopword_removal = false, stemming = true): {(tp + tn)/ (tp+fp+fn+tn)}\n')
answers.close()

accuracy: 0.8061224489795918


## Accuracy when stopwords are removed and stemming is applied

In [275]:
#len(training_data)
tp = 0
tn = 0
fp = 0
fn = 0
fileName_class = []

for i in range(len(training_data)):
    train = training_data.iloc[0:i]
    train2 = training_data.iloc[i+1:len(training_data)]
    train = pd.concat([train,train2])
    test = training_data.iloc[i]['statement']
    check = training_data.iloc[i]['class']
    fname = training_data.iloc[i]['fileName']
    classifier = trainNaiveBayes(training_data.copy(), True, True)[0]
    pred_class = testNaiveBayes(classifier, test, True, True)
    fileName_class.append((fname, pred_class))
    if pred_class == True and check == 'true':
        tp += 1
    elif pred_class == True and check == 'lie':
        fp += 1
    elif pred_class == False and check == 'lie':
        tn += 1
    elif pred_class == False and check == 'true':
        fn += 1

answers = open(os.getcwd() + '/answers.txt', 'a')
print(f'accuracy: {(tp + tn)/ (tp+fp+fn+tn)}')
answers.write(f'accuracy (stopword_removal = true, stemming = true): {(tp + tn)/ (tp+fp+fn+tn)}\n')
answers.close()

accuracy: 0.9489795918367347


# Sorting probabilities
Using the implementation that does not remove stopwords and does not stem words, list the top 10 words that have the highest conditional probability (i.e., P(w|c)) in each of the two classes considered (truth, lie). Under each class, list the words in reversed order of their conditional probability.

In [294]:
probs = trainNaiveBayes(training_data.copy())[0]
print('Top 10 from class true:')
sorted_true = sorted(probs, key=lambda x:probs[x]['true'], reverse = True)
print(sorted_true[:10])
sorted_true = sorted(probs, key=lambda x:probs[x]['true'], reverse = False)
print(sorted_true[:10])

Top 10 from class true:
['and', 'i', 'we', 'to', 'the', 'is', 'my', 'she', 'me', 'a']
['nice', 'encourages', 'imagine', 'accomplished', 'career', 'attracts', 'attention', 'attractive', 'enviable', 'style']


In [296]:
print('Top 10 from class lie:')
sorted_lie = sorted(probs, key=lambda x:probs[x]['lie'], reverse = True)
print(sorted_lie[:10])
sorted_lie = sorted(probs, key=lambda x:probs[x]['lie'], reverse = False)
print(sorted_lie[:10])

Top 10 from class lie:
['and', 'to', 'she', 'is', 'i', 'a', 'the', 'he', 'me', 'her']
['comfort', 'joy', 'judges', 'read', 'shame', 'anyones', 'mad', 'judged', 'fight', 'agreement']
