In [7]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.svm import LinearSVC, OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import eli5
import nltk
from sklearn.pipeline import Pipeline, FeatureUnion
np.random.seed(500)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/borisdejong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
df = pd.read_pickle('../data/v2.0-processed/df_premises.pickle')
df.head()

df = df.astype({'document':'int'})

df_train = df[df.document > 88]
df_test = df[df.document <= 88]

In [9]:
# Lowercase
df['premise'] = df['premise'].str.lower()
df['logos'] = df['logos'].map({True: 'Logos', False: 'Not Logos'})
df['pathos'] = df['pathos'].map({True: 'Pathos', False: 'Not Pathos'})
df['ethos'] = df['ethos'].map({True: 'Ethos', False: 'Not Ethos'})
df['premise'] = [word_tokenize(entry) for entry in df['premise']]
df['text'] = [' '.join(entry) for entry in df['premise']]

In [10]:
df['logos'].value_counts()

Logos        1757
Not Logos     315
Name: logos, dtype: int64

In [11]:
# train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df['type'], shuffle=True)

In [12]:
# sem_type = 'pathos'
# train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df[sem_type])
# print(train[sem_type].value_counts())
# print(test[sem_type].value_counts())
# Train_X, Test_X, Train_Y, Test_Y = train['text'], test['text'], train[sem_type], test[sem_type]
# Encoder = LabelEncoder()
# Train_Y = Encoder.fit_transform(Train_Y)
# Test_Y = Encoder.fit_transform(Test_Y)
# print(np.count_nonzero(Train_Y == 1), np.count_nonzero(Train_Y == 0))
# print(np.count_nonzero(Test_Y == 1), np.count_nonzero(Test_Y == 0))
# Tfidf_vect = TfidfVectorizer(max_features=5000)
# Tfidf_vect.fit(df['text'])
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)
# # Classifier - Algorithm - SVM
# # fit the training dataset on the classifier
# SVM = svm.LinearSVC(C=1.0)
# SVM.fit(Train_X_Tfidf, Train_Y) # predict the labels on validation dataset
# predictions_SVM = SVM.predict(Test_X_Tfidf) # Use accuracy_score function to get the accuracy
# # y = Encoder.inverse_transform(Test_Y)
# # pred = Encoder.inverse_transform(predictions_SVM)
# # print(f"SVM Accuracy Score for {sem_type} -> \n", classification_report(pred, y))
# print(f"SVM Accuracy Score for {sem_type} -> \n", classification_report(Test_Y, predictions_SVM))

In [15]:
train, val = train_test_split(df_train, test_size=0.2, random_state=0, stratify=df['type'], shuffle=True)

print(test['pathos'].value_counts())
print(test['logos'].value_counts())
print(test['ethos'].value_counts())
def train_svm(sem_type):
    Train_X, Test_X, Train_Y, Test_Y = train['text'], val['text'], train[sem_type], val[sem_type]

    # Construct pipeline
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('svc', LinearSVC()),
    ])

    pipe.fit(Train_X, Train_Y)

    predictions_SVM = pipe.predict(Test_X) # Use accuracy_score function to get the accuracy
    print(f"{sem_type} -> \n", classification_report(Test_Y, predictions_SVM))

    eli5.show_weights(pipe)

      document              file_path     split  \
732         80   v2.0/positive/80.xml  positive   
251         73   v2.0/positive/73.xml  positive   
683          5    v2.0/positive/5.xml  positive   
495         27   v2.0/positive/27.xml  positive   
107         89   v2.0/positive/89.xml  positive   
...        ...                    ...       ...   
1915        79   v2.0/negative/79.xml  negative   
590         31   v2.0/positive/31.xml  positive   
1951        79   v2.0/negative/79.xml  negative   
1612       318  v2.0/negative/318.xml  negative   
1377       289  v2.0/negative/289.xml  negative   

                                                premise                type  \
732   [., you, 'll, lose, money, by, taking, hpsp, a...               logos   
251   [most, of, us, would, n't, describe, them, as,...        logos_pathos   
683   [the, incidents, that, stand, out, most, in, o...        logos_pathos   
495                      [that, is, not, my, motive, .]               l

In [14]:
for sem_type in ['logos', 'ethos', 'pathos']:
    train_svm(sem_type=sem_type)

logos -> 
               precision    recall  f1-score   support

       Logos       0.89      0.99      0.94       352
   Not Logos       0.83      0.32      0.46        63

    accuracy                           0.89       415
   macro avg       0.86      0.65      0.70       415
weighted avg       0.88      0.89      0.86       415

ethos -> 
               precision    recall  f1-score   support

       Ethos       0.82      0.31      0.45        29
   Not Ethos       0.95      0.99      0.97       386

    accuracy                           0.95       415
   macro avg       0.88      0.65      0.71       415
weighted avg       0.94      0.95      0.94       415

pathos -> 
               precision    recall  f1-score   support

  Not Pathos       0.68      0.55      0.61       176
      Pathos       0.71      0.81      0.76       239

    accuracy                           0.70       415
   macro avg       0.69      0.68      0.68       415
weighted avg       0.70      0.70      0

