In [6]:
import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Load the dataset
url = 'https://raw.githubusercontent.com/synle/machine-learning-sample-dataset/master/liar_dataset/train.tsv'
df = pd.read_csv(url, sep='\t', header=None)

# Define column names
columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']

# Assign column names to the dataframe
df.columns = columns

# Preprocess the text
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub('[^a-zA-Z]+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Join the tokens back into a string
    text = ' '.join(tokens)
    return text

df['statement'] = df['statement'].apply(preprocess_text)

# Split the dataset into train and test sets
X = df['statement']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
tfidf = TfidfVectorizer()

# Train a One-vs-All SVM model for each label separately
labels = df['label'].unique()
models = {}
for label in labels:
    # Convert the label to a binary label (1 for the given label, 0 for all other labels)
    y_train_binary = y_train.apply(lambda x: 1 if x == label else 0)
    # Create an SVM model and fit it to the training data
    svm = OneVsRestClassifier(LinearSVC())
    svm.fit(tfidf.fit_transform(X_train), y_train_binary)
    # Store the trained model for this label
    models[label] = svm

# Evaluate the accuracy of each model on the test set and print the confusion matrix for each label
accs = []
for label, model in models.items():
    # Convert the label to a binary label (1 for the given label, 0 for all other labels)
    y_test_binary = y_test.apply(lambda x: 1 if x == label else 0)
    # Predict the labels for the test set
    y_pred = model.predict(tfidf.transform(X_test))
    # Compute the accuracy score
    acc = accuracy_score(y_test_binary, y_pred)
    accs.append(acc)
    # Print the accuracy score
    print(f'Accuracy for {label}: {acc}')


# Compute the final accuracy by aggregating the accuracies of all models
final_acc = np.mean(accs)
print(f'Final accuracy: {final_acc}')

# Compute the confusion matrix for the final accuracy
y_pred_final = {}
for label, model in models.items():
    y_pred_final[label] = model.predict(tfidf.transform(X_test))
y_pred_final = pd.DataFrame(y_pred_final)

y_pred_final['prediction'] = y_pred_final.idxmax(axis=1)
y_pred_final['actual'] = y_test.values

final_cm = confusion_matrix(y_pred_final['actual'], y_pred_final['prediction'], labels=labels)
print('Confusion matrix for final accuracy:')
print(final_cm)



Accuracy for false: 0.76220703125
Accuracy for half-true: 0.73876953125
Accuracy for mostly-true: 0.775390625
Accuracy for true: 0.81494140625
Accuracy for barely-true: 0.810546875
Accuracy for pants-fire: 0.9228515625
Final accuracy: 0.8041178385416666
Confusion matrix for final accuracy:
[[320  25  20  14  13   9]
 [317  48  37  15  18   3]
 [262  44  38  21  16   1]
 [243  33  28  24  12   0]
 [253  33  24   8  18   3]
 [119   6   5   1  10   7]]
