In [131]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.svm import LinearSVC, OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import eli5
import nltk
from sklearn.pipeline import Pipeline, FeatureUnion
# Fix the seed of NumPy's global random state before any modelling runs.
np.random.seed(500)
# Fetch the NLTK 'punkt' package: tokenizer data that word_tokenize (imported
# above and applied to the premises further down) relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/borisdejong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [132]:
# Load the pre-processed premise tables: `df` is later split into train/dev,
# `df_test` is used as the test set.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle('../data/v2.0-processed/df_premises.pickle')
df_test = pd.read_pickle('../data/v2.0-processed/df_premises_test.pickle')

# A notebook cell renders only its final bare expression, so a plain
# `df.head()` followed by `df_test.head()` silently drops the first preview.
# display() (provided by IPython in notebook sessions) shows both.
display(df.head(), df_test.head())

Unnamed: 0,document,file_path,split,premise,type,logos,pathos,ethos
0,89,v2.0/positive/89.xml,positive,I'm Unitarian Universalist,ethos,False,False,True
1,89,v2.0/positive/89.xml,positive,Had a group of religions identical in all resp...,logos,True,False,False
2,89,v2.0/positive/89.xml,positive,Whatever other religion had ended up developin...,logos,True,False,False
3,89,v2.0/positive/89.xml,positive,a large part of this view came from something ...,ethos,False,False,True
4,89,v2.0/positive/89.xml,positive,Christianity was primarily spread by the Roman...,logos,True,False,False


In [133]:
def _prepare_premises(frame):
    """Return a prepared copy of a premises frame.

    The preparation is the same for the training and the test data:

    * the boolean ``logos`` / ``pathos`` / ``ethos`` columns are mapped to the
      string labels ``'<Name>'`` / ``'Not <Name>'``;
    * ``premise`` is lower-cased and tokenized with ``word_tokenize`` (the
      column then holds lists of tokens);
    * ``text`` is added, holding the tokens re-joined with single spaces.

    The input frame is not modified. A frame that already has a ``text``
    column is assumed to have been prepared by an earlier run of this cell and
    is returned unchanged; without this guard a re-run would turn the labels
    and premises into NaN.
    """
    if 'text' in frame.columns:
        return frame

    prepared = frame.copy()

    for column, positive in (('logos', 'Logos'), ('pathos', 'Pathos'), ('ethos', 'Ethos')):
        prepared[column] = prepared[column].map({True: positive, False: f'Not {positive}'})

    # Lowercase first, then tokenize.
    prepared['premise'] = prepared['premise'].str.lower().map(word_tokenize)
    prepared['text'] = prepared['premise'].map(' '.join)
    return prepared


df = _prepare_premises(df)
df_test = _prepare_premises(df_test)

In [134]:
# Class balance of the 'logos' label in both frames. Two bare expressions in
# one cell would only render the second, so both are passed to display()
# (provided by IPython in notebook sessions).
display(df['logos'].value_counts(), df_test['logos'].value_counts())

Logos        538
Not Logos    136
Name: logos, dtype: int64

In [135]:
# train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df['type'], shuffle=True)

In [136]:
# sem_type = 'pathos'
# train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df[sem_type])
# print(train[sem_type].value_counts())
# print(test[sem_type].value_counts())
# Train_X, Test_X, Train_Y, Test_Y = train['text'], test['text'], train[sem_type], test[sem_type]
# Encoder = LabelEncoder()
# Train_Y = Encoder.fit_transform(Train_Y)
# Test_Y = Encoder.fit_transform(Test_Y)
# print(np.count_nonzero(Train_Y == 1), np.count_nonzero(Train_Y == 0))
# print(np.count_nonzero(Test_Y == 1), np.count_nonzero(Test_Y == 0))
# Tfidf_vect = TfidfVectorizer(max_features=5000)
# Tfidf_vect.fit(df['text'])
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)
# # Classifier - Algorithm - SVM
# # fit the training dataset on the classifier
# SVM = svm.LinearSVC(C=1.0)
# SVM.fit(Train_X_Tfidf, Train_Y) # predict the labels on validation dataset
# predictions_SVM = SVM.predict(Test_X_Tfidf) # Use accuracy_score function to get the accuracy
# # y = Encoder.inverse_transform(Test_Y)
# # pred = Encoder.inverse_transform(predictions_SVM)
# # print(f"SVM Accuracy Score for {sem_type} -> \n", classification_report(pred, y))
# print(f"SVM Accuracy Score for {sem_type} -> \n", classification_report(Test_Y, predictions_SVM))

In [137]:
# Hold out 20% of `df` as a dev set, stratified on the 'type' column; the
# separately loaded `df_test` frame serves as the test set.
train, dev = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=df['type'],
)
test = df_test

# Label distribution of the test set, one label column at a time.
for label_column in ('pathos', 'logos', 'ethos'):
    print(test[label_column].value_counts())

def train_svm(sem_type):
    """Fit a TF-IDF + LinearSVC pipeline for one label column and report on it.

    Reads the notebook-level ``train``, ``dev`` and ``test`` frames: the
    ``text`` column is the model input and the ``sem_type`` column the target.
    Prints a classification report for the dev set and for the test set, then
    displays the eli5 weight table of the fitted pipeline.

    Parameters
    ----------
    sem_type : str
        Name of the label column to predict, e.g. ``'logos'``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, so callers can inspect or reuse it.
    """
    train_x, train_y = train['text'], train[sem_type]
    dev_x, dev_y = dev['text'], dev[sem_type]
    test_x, test_y = test['text'], test[sem_type]

    # Construct pipeline
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('svc', LinearSVC()),
    ])
    pipe.fit(train_x, train_y)

    dev_predictions = pipe.predict(dev_x)
    test_predictions = pipe.predict(test_x)

    print(f"{sem_type} (dev set) -> \n", classification_report(dev_y, dev_predictions))

    print(f"{sem_type} (test set) -> \n", classification_report(test_y, test_predictions))

    # show_weights() only *returns* a renderable object; inside a function the
    # notebook never auto-renders it, so it has to be displayed explicitly
    # (display() is provided by IPython in notebook sessions).
    display(eli5.show_weights(pipe))

    return pipe

Not Pathos    358
Pathos        316
Name: pathos, dtype: int64
Logos        538
Not Logos    136
Name: logos, dtype: int64
Not Ethos    617
Ethos         57
Name: ethos, dtype: int64


In [138]:
# Train and evaluate one classifier per label column.
# (The loop variable keeps its name because it stays in the notebook
# namespace after the loop finishes.)
for sem_type in ('logos', 'ethos', 'pathos'):
    train_svm(sem_type=sem_type)

logos (dev set) -> 
               precision    recall  f1-score   support

       Logos       0.90      0.98      0.94       244
   Not Logos       0.71      0.28      0.40        36

    accuracy                           0.89       280
   macro avg       0.81      0.63      0.67       280
weighted avg       0.88      0.89      0.87       280

logos (test set) -> 
               precision    recall  f1-score   support

       Logos       0.84      1.00      0.92       538
   Not Logos       1.00      0.27      0.43       136

    accuracy                           0.85       674
   macro avg       0.92      0.64      0.67       674
weighted avg       0.88      0.85      0.82       674

ethos (dev set) -> 
               precision    recall  f1-score   support

       Ethos       1.00      0.12      0.21        17
   Not Ethos       0.95      1.00      0.97       263

    accuracy                           0.95       280
   macro avg       0.97      0.56      0.59       280
weighted a

