In [2]:
import nltk

# Fetch the NLTK resources the next cell relies on: the English stop-word
# corpus and the Punkt tokenizer models (tokenization is done with
# nltk.word_tokenize).
# NOTE(review): recent NLTK releases may look up 'punkt_tab' rather than
# 'punkt' for word_tokenize — confirm against the installed version.
nltk.download('stopwords')
# Kept as the last expression so the cell still displays its return value.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Fetch train.tsv (tab-separated, read without a header row, so columns are
# addressed by integer position)
url = 'https://raw.githubusercontent.com/synle/machine-learning-sample-dataset/master/liar_dataset/train.tsv'
df = pd.read_csv(url, sep='\t', header=None)

# Collapse the six string labels in column 1 into three integer classes.
# An unexpected label raises KeyError rather than being silently dropped.
mapping = {
    'pants-fire': 0, 'false': 0,
    'barely-true': 1,
    'half-true': 2, 'mostly-true': 2, 'true': 2,
}
df[1] = [mapping[label] for label in df[1]]

# Shared text-normalisation resources: a Porter stemmer and NLTK's English
# stop-word list (both are read by tokenize_and_preprocess)
stemmer = nltk.stem.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

# Snapshot of the stop-word list as a set: membership tests are then constant
# time instead of a linear scan of the list for every token.
_STOPWORD_SET = frozenset(stopwords)


def tokenize_and_preprocess(text):
    """Normalise one piece of text for vectorisation.

    The text is lower-cased, split with ``nltk.word_tokenize``, filtered
    against the English stop-word list, and each remaining token is stemmed
    with the module-level Porter stemmer.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(text.lower())

    # Filter first, then stem, so stop words are matched in their raw form
    kept = [stemmer.stem(token) for token in tokens if token not in _STOPWORD_SET]

    return ' '.join(kept)

# Normalise every text in column 2 (overwrites the raw text in place)
df[2] = df[2].map(tokenize_and_preprocess)

# Labels come from column 1
y = df[1]

# TF-IDF features are fitted on, and computed from, the cleaned text column
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df[2])

# Fit a logistic-regression classifier on the full feature matrix
# (fit() returns the estimator itself, so construction and fitting chain)
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Score the model on the very same rows it was fitted on, so the figures
# below are in-sample metrics, not an estimate of held-out performance
y_pred = clf.predict(X)
class_report = classification_report(y, y_pred)
conf_mat = confusion_matrix(y, y_pred)

# Report both summaries
print('Confusion Matrix:\n', conf_mat)
print('\nClassification Report:\n', class_report)


Confusion Matrix:
 [[1434   27 1373]
 [ 232  251 1171]
 [ 208   24 5520]]

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.51      0.61      2834
           1       0.83      0.15      0.26      1654
           2       0.68      0.96      0.80      5752

    accuracy                           0.70     10240
   macro avg       0.76      0.54      0.55     10240
weighted avg       0.73      0.70      0.66     10240

