In [250]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
np.random.seed(500)

In [251]:
df = pd.read_pickle('../data/v2.0-processed/df_premises.pickle')
df.head()

Unnamed: 0,document,file_path,split,premise,type,logos,pathos,ethos
0,88,v2.0/positive/88.xml,positive,[47% of all jobs are at risk of being automate...,logos,True,False,False
1,88,v2.0/positive/88.xml,positive,This number will grow grow until the vast majo...,logos_pathos,True,True,False
2,88,v2.0/positive/88.xml,positive,Since some of this automation will inevitably ...,logos_pathos,True,True,False
3,88,v2.0/positive/88.xml,positive,By fortifying themselves in their gated commun...,logos_pathos,True,True,False
4,88,v2.0/positive/88.xml,positive,"Once everyone is dead, they can simply be wipe...",logos_pathos,True,True,False


In [252]:
# Lowercase
df['premise'] = df['premise'].str.lower()
df['logos'] = df['logos'].map({True: 'Logos', False: 'Not Logos'})
df['pathos'] = df['pathos'].map({True: 'Pathos', False: 'Not Pathos'})
df['ethos'] = df['ethos'].map({True: 'Ethos', False: 'Not Ethos'})
df['premise'] = [word_tokenize(entry) for entry in df['premise']]
df['text'] = [' '.join(entry) for entry in df['premise']]

In [253]:
df['logos'].value_counts()

Logos        1757
Not Logos     315
Name: logos, dtype: int64

In [254]:
# train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df['type'], shuffle=True)

In [255]:
# sem_type = 'pathos'
# train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df[sem_type])
# print(train[sem_type].value_counts())
# print(test[sem_type].value_counts())
# Train_X, Test_X, Train_Y, Test_Y = train['text'], test['text'], train[sem_type], test[sem_type]
# Encoder = LabelEncoder()
# Train_Y = Encoder.fit_transform(Train_Y)
# Test_Y = Encoder.fit_transform(Test_Y)
# print(np.count_nonzero(Train_Y == 1), np.count_nonzero(Train_Y == 0))
# print(np.count_nonzero(Test_Y == 1), np.count_nonzero(Test_Y == 0))
# Tfidf_vect = TfidfVectorizer(max_features=5000)
# Tfidf_vect.fit(df['text'])
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)
# # Classifier - Algorithm - SVM
# # fit the training dataset on the classifier
# SVM = svm.LinearSVC(C=1.0)
# SVM.fit(Train_X_Tfidf, Train_Y) # predict the labels on validation dataset
# predictions_SVM = SVM.predict(Test_X_Tfidf) # Use accuracy_score function to get the accuracy
# # y = Encoder.inverse_transform(Test_Y)
# # pred = Encoder.inverse_transform(predictions_SVM)
# # print(f"SVM Accuracy Score for {sem_type} -> \n", classification_report(pred, y))
# print(f"SVM Accuracy Score for {sem_type} -> \n", classification_report(Test_Y, predictions_SVM))

In [262]:
train, test = train_test_split(df, test_size=0.2, random_state=0, stratify=df['type'], shuffle=True)
print(test['pathos'].value_counts())
print(test['logos'].value_counts())
print(test['ethos'].value_counts())
def train_svm(sem_type):
    Train_X, Test_X, Train_Y, Test_Y = train['text'], test['text'], train[sem_type], test[sem_type]
    Encoder = LabelEncoder()
    Train_Y = Encoder.fit_transform(Train_Y)
    Test_Y = Encoder.fit_transform(Test_Y)
    Tfidf_vect = TfidfVectorizer(max_features=5000)
    Tfidf_vect.fit(df['text'])
    Train_X_Tfidf = Tfidf_vect.transform(Train_X)
    Test_X_Tfidf = Tfidf_vect.transform(Test_X)
    # Classifier - Algorithm - SVM
    # fit the training dataset on the classifier
    SVM = svm.LinearSVC(C=1.0)
    SVM.fit(Train_X_Tfidf, Train_Y) # predict the labels on validation dataset
    predictions_SVM = SVM.predict(Test_X_Tfidf) # Use accuracy_score function to get the accuracy
    print(f"{sem_type} -> \n", classification_report(Test_Y, predictions_SVM, target_names=Encoder.classes_))

Pathos        239
Not Pathos    176
Name: pathos, dtype: int64
Logos        352
Not Logos     63
Name: logos, dtype: int64
Not Ethos    386
Ethos         29
Name: ethos, dtype: int64


In [261]:
for sem_type in ['logos', 'ethos', 'pathos']:
    train_svm(sem_type=sem_type)

logos -> 
               precision    recall  f1-score   support

       Logos       0.89      0.99      0.94       352
   Not Logos       0.83      0.32      0.46        63

    accuracy                           0.89       415
   macro avg       0.86      0.65      0.70       415
weighted avg       0.88      0.89      0.86       415

ethos -> 
               precision    recall  f1-score   support

       Ethos       0.90      0.31      0.46        29
   Not Ethos       0.95      1.00      0.97       386

    accuracy                           0.95       415
   macro avg       0.93      0.65      0.72       415
weighted avg       0.95      0.95      0.94       415

pathos -> 
               precision    recall  f1-score   support

  Not Pathos       0.70      0.55      0.61       176
      Pathos       0.71      0.83      0.77       239

    accuracy                           0.71       415
   macro avg       0.71      0.69      0.69       415
weighted avg       0.71      0.71      0