In [1]:
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

# Construct empty DataFrame
df_sentences = pd.DataFrame(columns=[
    'document', 'file_path', 'split', 'text', 'type'
])

df_sentences_test = df_sentences

def parse_xml(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()

    documents = []

    for elem in root.iter():
        if elem.tag == "OP" or elem.tag == "reply":
            all_descendants = [b for b in elem.iter() if b is not elem]
            for desc in all_descendants:
                documents.append((desc.text, desc.tag))
    return documents

# Retrieve premises from all XML files
for split in ['positive', 'negative']:
    for path in glob.glob(os.path.join(f'v2.0/{split}', '*.xml')):
        try:
            for premise, argument in parse_xml(path):
                document = int(path.split('/')[2].split('.')[0])
                if document > 95:
                    df_sentences_test = pd.concat([df_sentences_test, pd.DataFrame({
                        'document': [document],
                        'file_path': [path],
                        'split': [path.split("/")[1]],
                        'text': [premise],
                        'type': [argument],
                    })], axis=0, ignore_index=True)
                elif document <= 95:
                    df_sentences = pd.concat([df_sentences, pd.DataFrame({
                        'document': [document],
                        'file_path': [path],
                        'split': [path.split("/")[1]],
                        'text': [premise],
                        'type': [argument],
                    })], axis=0, ignore_index=True)
        except ET.ParseError:
            pass

#pickle.dump( df_sentences, open( "./v2.0-processed/df_sentences.pickle", "wb" ) )
df_sentences.to_pickle('./v2.0-processed/df_sentences.pickle')
df_sentences_test.to_pickle('./v2.0-processed/df_sentences_test.pickle')

In [3]:
df = df_sentences

# Preprocess the train data
documents = df["text"].tolist()
labels = df["type"].tolist()


X_train, X_test, y_train, y_test = train_test_split(documents, labels, test_size=0.2, random_state=1, stratify=labels)

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1, stratify=y_train) # 0.25 x 0.8 = 0.2

random_state = 44

# Construct pipeline
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', LinearSVC(random_state=random_state)),
])

# Convert labels to numerical values
#encoder = LabelBinarizer()
#y_train_bin = encoder.fit_transform(y_train)
#y_train = np.argmax(y_train_bin, axis=1)

pipe.fit(X_train, y_train)

# Predict dev
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))


# Predict test
#y_pred = pipe.predict(X_test)
#print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       claim       0.66      0.60      0.63       248
     premise       0.71      0.76      0.74       325

    accuracy                           0.69       573
   macro avg       0.69      0.68      0.68       573
weighted avg       0.69      0.69      0.69       573



In [4]:
# Get the names of each feature
feature_names = pipe.named_steps["tfidf"].get_feature_names()

# Get the coefficients of each feature
coefs = pipe.named_steps["svc"].coef_.flatten()

import pandas as pd
# Zip coefficients and names together and make a DataFrame
zipped = zip(feature_names, coefs)
df = pd.DataFrame(zipped, columns=["feature", "value"])
# Sort the features by the absolute value of their coefficient
df["abs_value"] = df["value"].apply(lambda x: abs(x))
df["colors"] = df["value"].apply(lambda x: "green" if x > 0 else "red")
df = df.sort_values("abs_value", ascending=False)

import seaborn as sns
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
sns.barplot(x="feature",
            y="value",
            data=df.head(20),
           palette=df.head(20)["colors"])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)
ax.set_title("Top 20 Features", fontsize=25)
ax.set_ylabel("Coef", fontsize=22)
ax.set_xlabel("Feature Name", fontsize=22)



NameError: name 'plt' is not defined

In [None]:
# Get the names of each feature
feature_names = pipe.named_steps["tfidf"].get_feature_names()

# Get the coefficients of each feature
coefs = pipe.named_steps["svc"].coef_.flatten()

import pandas as pd
# Zip coefficients and names together and make a DataFrame
zipped = zip(feature_names, coefs)
df = pd.DataFrame(zipped, columns=["feature", "value"])
# Sort the features by the absolute value of their coefficient
df["abs_value"] = df["value"].apply(lambda x: abs(x))
df["colors"] = df["value"].apply(lambda x: "green" if x > 0 else "red")
df = df.sort_values("abs_value", ascending=False)

import seaborn as sns
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
sns.barplot(x="feature",
            y="value",
            data=df.head(20),
           palette=df.head(20)["colors"])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)
ax.set_title("Top 20 Features", fontsize=25)
ax.set_ylabel("Coef", fontsize=22)
ax.set_xlabel("Feature Name", fontsize=22)