# Question 4: Optimising pre-processing and feature extraction (50 marks)

**Note:** it is advisable to implement question 4 in a separate notebook where you further develop the pre-processing and feature extraction functions you implemented above.

In [213]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from random import Random

In [214]:
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))


def split_and_preprocess_data(percentage):
    num_samples = len(raw_data)
    n_train = int((percentage * num_samples))
    for (text, label) in raw_data[:n_train]:
        train_data.append((text, label))
    for (text, label) in raw_data[n_train:]:
        test_data.append((text, label))

def parse_data_line(data_line):
    """Return a tuple of the label as just FAKE or REAL and the statement"""
    return (data_line[1], data_line[2])

def pre_process(text):
    """Return a list of tokens"""
    return text.split()

In [215]:
#solution
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import Counter

labels_hm = ['positive', 'negative']


def cross_validate(dataset, folds, classifier_fn):
    """
      Manual K-fold cross-validation over the *training* data.

      Parameters
      ----------
      dataset : list[tuple[dict, str]]
          List of (feature_dict, label) pairs.
      folds : int
          Number of folds (e.g., 10).

      Returns
      -------
      tuple(float, float, float, float)
          Average (precision, recall, F1, accuracy) across folds.
      """

    results = []
    fold_size = int(len(dataset)/folds) + 1
    # Leave-out-chunk CV
    for i in range(0,len(dataset),int(fold_size)):
        # insert code here that trains and tests on the 10 folds of data in the dataset

        print("Fold start on items %d - %d" % (i, i+fold_size))
        # FILL IN THE METHOD HERE
        # Split current chunk as validation; rest as training
        test_fold = dataset[i:i+fold_size]
        train_fold = dataset[:i] + dataset[i+fold_size:]

        classifier = classifier_fn(train_fold)

        # Prepare features/labels for validation and predict
        test_features = [features for (features, label) in test_fold]
        true_labels = [label for (features, label) in test_fold]

        predicted_labels = predict_labels(test_features, classifier)

        # Compute standard metrics (weighted to handle any class imbalance)
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predicted_labels, average='weighted', zero_division=0
        )
        accuracy = accuracy_score(true_labels, predicted_labels)

        # Store this fold's metrics
        results.append((precision, recall, f1, accuracy))

        # Save the first fold's outputs for Q3 analysis/plots

    # Average metrics across all folds
    cv_results = tuple(float(np.mean([r[k] for r in results])) for k in range(4))
    print("\nAverage Precision: %.4f\nAverage Recall: %.4f\nAverage F1: %.4f\nAverage Accuracy: %.4f" %
          tuple(cv_results))

    return cv_results

In [216]:
# MAIN

# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'sentiment-dataset.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path)

# from collections import Counter

# # raw_data = [(text1, label1), (text2, label2), ...]

# labels = [label for (_, label) in raw_data]
# label_counts = Counter(labels)

# Data shuffling to avoid data imbalance among the folds during cross validation
rng = Random(42)
shuffled = raw_data[:]
rng.shuffle(shuffled)
raw_data = shuffled

# print(label_counts)
# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# train_texts = [t for (t, _) in train_data]
# vec_probe = CountVectorizer(
#             tokenizer=tokenize_lemmatize,
#             lowercase=False,
#             ngram_range=(1,2),
#             binary=True,
#         )
# _ = vec_probe.fit_transform(train_texts)
# vocab_size = len(vec_probe.get_feature_names_out())

# We print the number of training samples and the number of features after the split
print(
    f"After split, {len(raw_data)} rawData, {len(train_data)} trainData, {len(test_data)} testData",
    f"Training Samples: {len(train_data)}",
    # f"Vectorizer vocab (train-only): {vocab_size}",
    sep='\n'
)


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 33540 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 33540 rawData, 26832 trainData, 6708 testData
Training Samples: 26832


# Step 1 & 2 of Processing - Improve the preprocessing. Which tokens might you want to throw out or preserve? What about punctuation? Do not forget normalisation, lemmatising, stop word removal - what aspects of this might be useful?

In [217]:

# === Tokenizer & preprocessing helpers ===
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from functools import lru_cache

# Download required WordNet data for lemmatization
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Keep negations in stopwords (they matter for sentiment)
NEGATIONS = {"no", "not", "nor", "never", "n't"}
STOP_SK = set(ENGLISH_STOP_WORDS) - NEGATIONS

# Regular expressions for common text patterns
URL_RE       = re.compile(r'(https?://\S+|www\.\S+)', re.IGNORECASE)
USER_RE      = re.compile(r'@\w+')
HASHTAG_RE   = re.compile(r'#(\w+)')
NUM_RE       = re.compile(r'(?<!\w)(\d+([.,]\d+)*)(?!\w)')
ELONG_RE     = re.compile(r'(.)\1{2,}', re.IGNORECASE)  # e.g., "soooo" -> "soo"
DOTS_RE      = re.compile(r'\.{3,}')                   # ellipsis "..."
RETWEET_RE   = re.compile(r'^RT\s')
PUNCT_PATTERN = re.compile(r'[^\w\s!?\.<>]')            # keep "!", "?" for sentiment
EMOTICON_RE  = r'[:;=8][\-^]?[)D(\]/\\OpP]'             # :) :( :D ;-) etc.

# Tokenizer splits words, emojis, tags, and punctuation
TOKEN_SPLIT = re.compile(
    r"<[^<>]+>|"                # <URL>, <USER>, etc.
    r"(?:%s)|" % EMOTICON_RE +  # emoticons
    r"\w+[:']\w+|\w+|"          # words (incl. "it's")
    r"[^\w\s]"                  # leftover punctuation
)

# Emoticon set for quick lookup
EMOTICONS = {":)",":-)",":D",":-D",":(",":-(",";)",";-)",":P",":-P",":'(","XD","xD",":-|",":/",":-/"}


@lru_cache(maxsize=200_000)
def lemma_cached(token: str) -> str:
    """Lemmatize a token with caching for speed."""
    t = lemmatizer.lemmatize(token, 'v')  # try verb
    if t == token:
        t = lemmatizer.lemmatize(token, 'n')  # fallback to noun
    return t


def tokenize_lemmatize(text: str) -> list[str]:
    """
    Custom tweet tokenizer:
    - Replaces URLs, mentions, numbers, retweets with markers
    - Handles hashtags, repeated characters, ellipses
    - Keeps key punctuation and emoticons
    - Lowercases, strips punctuation, lemmatizes long alphabetic tokens
    """

    # Replace common patterns with markers
    #1st step pf pre-processing
    text = RETWEET_RE.sub(' <RT> ', text)
    text = URL_RE.sub(' <URL> ', text)
    text = USER_RE.sub(' <USER> ', text)
    text = NUM_RE.sub(' <NUM> ', text)
    text = HASHTAG_RE.sub(lambda m: ' ' + m.group(1) + ' ', text)

    #2nd step pf pre-processing
    text = ELONG_RE.sub(r"\1\1", text)   # shorten long repeated chars
    text = DOTS_RE.sub(' ... ', text)    # normalize ellipsis

    raw = TOKEN_SPLIT.findall(text)
    tokens, add_allcaps = [], False

    for tok in raw:
        if not tok.strip():
            continue

        # Keep emoticons and certain punctuation as-is
        if tok in EMOTICONS or tok in {"!", "?", "..."}:
            tokens.append(tok)
            continue

        # Remove punctuation (except inside markers)
        if not (tok.startswith('<') and tok.endswith('>')):
            tok = PUNCT_PATTERN.sub('', tok)
            if not tok:
                continue

        # Flag for all-caps emphasis
        if tok.isalpha() and tok.isupper() and len(tok) >= 2:
            add_allcaps = True

        # Lowercase and lemmatize normal words
        if tok not in {"<URL>", "<USER>", "<NUM>", "<RT>"}:
            tok = tok.lower()
        #Remove stopwords
        if tok in STOP_SK:
          continue
        if tok.isalpha() and len(tok) > 2:
            tok = lemma_cached(tok)

        tokens.append(tok)

    # Append special token if tweet had ALL CAPS words
    if add_allcaps:
        tokens.append("__ALLCAPS__")

    return tokens



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [218]:
# test
text = "RT @colonelkickhead: Another ;-) HAPPY bloody instant restaurant week?!?! Seriously! They just jumped the shark riding two other sharks powered by sh…"
tokenize_lemmatize(text)

['<RT>',
 '<USER>',
 ';-)',
 'happy',
 'bloody',
 'instant',
 'restaurant',
 'week',
 '?',
 '!',
 '?',
 '!',
 'seriously',
 '!',
 'just',
 'jump',
 'shark',
 'rid',
 'shark',
 'power',
 'sh',
 '__ALLCAPS__']

# Step 1, 2 & 3 of Processing Combined

In [219]:
#Step 3 Processing
# TRAINING AND VALIDATING OUR CLASSIFIER
def train_classifier(data):
    print("Training Classifier...")
    X = [x for (x, _) in data]
    y = [y for (_, y) in data]
    model = Pipeline([
        ('vect', CountVectorizer(
            # tokenizer=pre_process,
            tokenizer=tokenize_lemmatize,
            lowercase=False,
            # ngram_range=(1, 1),
            ngram_range=(1,2),
            binary=True,
        )),
        ('svc', LinearSVC(random_state=42, class_weight='balanced')),
    ])
    model.fit(X, y)
    return model

def predict_labels(samples, classifier):
    return classifier.predict(samples)

In [220]:
# cross_validate(train_data, folds=10, classifier_fn=train_classifier) # will work and output overall performance of p, r, f-score when cv implemented



# Step 3 of Processing - Think about the features: what could you use other than unigram tokens? It may be useful to look beyond single words to combinations of words or characters. Also the feature weighting scheme: what could you do other than using binary values

In [221]:
#Step 3 Processing (non binary)
from sklearn.feature_extraction.text import TfidfVectorizer

def train_classifier_tfidf(data):
    X = [t for (t, _) in data]
    y = [y for (_, y) in data]

    model = Pipeline([
        ('vect', TfidfVectorizer(
            tokenizer=pre_process,
            # tokenizer=tokenize_lemmatize,
            lowercase=False,
            # ngram_range=(1, 2),
            ngram_range=(1, 1),       # still unigrams here
            sublinear_tf=True         # log scale TF
        )),
        ('svc', LinearSVC(random_state=42, class_weight='balanced'))
    ])

    model.fit(X, y)
    return model
# cross_validate(train_data, folds=10, classifier_fn=train_classifier_tfidf) # will work and output overall performance of p, r, f-score when cv implemented


In [222]:
#Step 3 Processing (words+char both)
from sklearn.pipeline import FeatureUnion

def train_classifier_char_and_words(data):
    X = [t for (t, _) in data]
    y = [y for (_, y) in data]

    word_vect = TfidfVectorizer(
        tokenizer=tokenize_lemmatize,
        lowercase=False,
        ngram_range=(1, 2),
        sublinear_tf=True
    )

    char_vect = TfidfVectorizer(
        analyzer='char',
        ngram_range=(3, 5),     # 3-5 character n-grams
        min_df=3
    )

    model = Pipeline([
        ('features', FeatureUnion([
            ('word', word_vect),
            ('char', char_vect)
        ])),
        ('svc', LinearSVC(random_state=42, class_weight='balanced'))
    ])

    model.fit(X, y)
    return model
# cv_results = cross_validate(train_data, folds=10, classifier_fn=train_classifier_char_and_words)

# Step 4 of Processing - You could add extra stylistic features like the number of words per sentence

In [223]:
#Step 4 Processing (Extra Stylistic Features)
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix
import numpy as np

class StylisticFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Extracts stylistic features from raw text X."""
        feature_values = []
        for text in X:
            sentences = text.split('.')  # naive sentence split
            num_sentences = max(1, len(sentences))
            num_words = len(text.split())
            avg_words_per_sentence = num_words / num_sentences

            feature_values.append([num_words, num_sentences, avg_words_per_sentence])

        # convert into sparse matrix so it can be used in pipeline
        return csr_matrix(np.array(feature_values))

def train_classifier_with_style_feats(data):
    X = [t for (t, _) in data]
    y = [y for (_, y) in data]

    word_vectorizer = TfidfVectorizer(
        tokenizer=tokenize_lemmatize,
        lowercase=False,
        ngram_range=(1, 2),
        sublinear_tf=True
    )

    # combine text features + stylistic features
    combined_feats = FeatureUnion([
        ('tfidf', word_vectorizer),
        ('style', StylisticFeatures())
    ])

    model = Pipeline([
        ('features', combined_feats),
        ('svc', LinearSVC(random_state=42, class_weight= 'balanced'))
    ])

    model.fit(X, y)
    return model


In [224]:
# cv_results = cross_validate(train_data, folds=10, classifier_fn=train_classifier_with_style_feats)

# Step 5 of Processing - You could consider playing with the parameters of the SVM cost parameter? per-class weighting?

In [225]:
#Step 5 Processing (SVM Parameters)
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

def train_classifier_svm_tuned(data, C=0.5):
    X = [t for (t, _) in data]
    y = [y for (_, y) in data]

    model = Pipeline([
        ('vect', TfidfVectorizer(
            tokenizer=tokenize_lemmatize,
            lowercase=False,
            ngram_range=(1, 2),
            sublinear_tf=True
        )),
        ('svc', LinearSVC(
            C=C,                                 # cost parameter
            class_weight='balanced',
            random_state=42
        ))
    ])

    model.fit(X, y)
    return model


In [226]:
# for C in [0.1, 0.5, 1.0, 2.0]:
#     print(f"\nSVM cross-validation with C = {C}, class_weight = 'balanced'")
#     model_fn = lambda train_fold, C=C: train_classifier_svm_tuned(
#         train_fold, C=C
#     )
#     scores = cross_validate(train_data, folds=10, classifier_fn=model_fn)
#     print("F1 Score:", scores[2])

# Step 6 of Processing - You could do some feature selection, limiting the numbers of features through different controls on e.g. the vocabulary

In [227]:
# Step 6 Processing (Limiting no of features to 10k)
def train_classifier_limited_vocab(data):
    X = [t for (t, _) in data]
    y = [y for (_, y) in data]

    model = Pipeline([
        ('vect', TfidfVectorizer(
            tokenizer=tokenize_lemmatize,
            lowercase=False,
            ngram_range=(1, 2),
            max_features=10000,       # limit features to top 10k
            min_df=3,                 # minimum docs per term
            max_df=0.9                # ignore too common terms
        )),
        ('svc', LinearSVC(random_state=42, class_weight='balanced'))
    ])

    model.fit(X, y)
    return model
# cross_validate(train_data, folds=10, classifier_fn=train_classifier_limited_vocab)

In [228]:
# Step 6 Processing (Keeping only k best features)
from sklearn.feature_selection import SelectKBest, chi2

def train_classifier_with_chi2(data, k):
    X = [t for (t, _) in data]
    y = [y for (_, y) in data]

    model = Pipeline([
        ('vect', TfidfVectorizer(
            tokenizer=tokenize_lemmatize,
            lowercase=False,
            ngram_range=(1, 2)
        )),
        ('chi2', SelectKBest(chi2, k=k)),  # Keep only top k features
        ('svc', LinearSVC(random_state=42, class_weight = 'balanced'))
    ])

    model.fit(X, y)
    return model


In [229]:
# for k in [4000, 8000, 12000]:
#     print(f"\nTesting Chi2 feature selection with k={k}")
#     model_fn = lambda train_fold, k=k: train_classifier_with_chi2(train_fold, k=k)
#     cross_validate(train_data, folds=10, classifier_fn=model_fn)

# Step 7 of Processing - You could use external resources like the opinion lexicon available

In [230]:
# Step 7 Processing (external resource opinion lexicon usage)
#Please upload the positive-word and negative word text to the folder before running

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix
import numpy as np

class OpinionLexiconFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, pos_path, neg_path, tokenizer):
        self.pos_path = pos_path
        self.neg_path = neg_path
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        # load lexicons
        self.pos_words = self._load_lexicon(self.pos_path)
        self.neg_words = self._load_lexicon(self.neg_path)
        return self

    def _load_lexicon(self, path):
        lex = set()
        with open(path, 'r', encoding='latin-1') as f:
            for line in f:
                if not line.startswith(';') and line.strip():
                    lex.add(line.strip())
        return lex

    def transform(self, X):
        features = []
        for text in X:
            tokens = self.tokenizer(text)
            pos_count = sum(1 for t in tokens if t in self.pos_words)
            neg_count = sum(1 for t in tokens if t in self.neg_words)
            sentiment_score = pos_count - neg_count
            features.append([pos_count, neg_count, sentiment_score])

        return csr_matrix(np.array(features))

def train_classifier_with_lexicon(data):
    X = [t for (t, _) in data]
    y = [y for (_, y) in data]

    word_tfidf = TfidfVectorizer(
        tokenizer=tokenize_lemmatize,
        lowercase=False,
        ngram_range=(1, 2)
    )

    lexicon_feats = OpinionLexiconFeatures(
        pos_path='positive-words.txt',
        neg_path='negative-words.txt',
        tokenizer=tokenize_lemmatize,
    )

    combined_feats = FeatureUnion([
        ('tfidf', word_tfidf),
        ('lexicon', lexicon_feats)
    ])

    model = Pipeline([
        ('features', combined_feats),
        ('svc', LinearSVC(random_state=42, class_weight='balanced'))
    ])

    model.fit(X, y)
    return model


In [231]:
# results = cross_validate(train_data, folds=10, classifier_fn=train_classifier_with_lexicon)
# print(results)


# Testing the most effective combinations from the above to achieve the best model

In [232]:
# -----------------------------
# Best classifier (word + char TF-IDF + lexicon + chi2 + LinearSVC)
# -----------------------------
import os
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC

# -----------------------------
# Safe chi2 selector (won't crash if k > #features)
# -----------------------------
class SafeSelectKBest(BaseEstimator, TransformerMixin):
    def __init__(self, score_func=chi2, k=12000):
        self.score_func = score_func
        self.k = k
        self.selector_ = None

    def fit(self, X, y=None):
        k_use = self.k
        if isinstance(k_use, int):
            k_use = min(k_use, X.shape[1])  # clamp to available features
        self.selector_ = SelectKBest(self.score_func, k=k_use).fit(X, y)
        return self

    def transform(self, X):
        return self.selector_.transform(X)

# -----------------------------
# Opinion Lexicon transformer (always non-negative outputs for chi2)
# features: [pos_count, neg_count, pos_plus_neg]
# -----------------------------
class OpinionLexiconFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, pos_path='positive-words.txt', neg_path='negative-words.txt', tokenizer=None):
        self.pos_path = pos_path
        self.neg_path = neg_path
        self.tokenizer = tokenizer
        self.pos_words = None
        self.neg_words = None

    def _load(self, path):
        if not os.path.exists(path):
            return None
        words = []
        with open(path, encoding='latin-1') as f:
            for line in f:
                w = line.strip()
                if not w or w.startswith(';'):
                    continue
                words.append(w.lower())
        return set(words)

    def fit(self, X, y=None):
        self.pos_words = self._load(self.pos_path)
        self.neg_words = self._load(self.neg_path)
        return self

    def transform(self, X):
        # if files missing or no tokenizer, return zeros (3 cols)
        if self.pos_words is None or self.neg_words is None or self.tokenizer is None:
            Z = np.zeros((len(X), 3), dtype=np.float32)
            return csr_matrix(Z)

        rows = []
        for text in X:
            toks = self.tokenizer(text)
            pc = sum(1 for t in toks if t in self.pos_words)
            nc = sum(1 for t in toks if t in self.neg_words)
            rows.append([pc, nc, pc + nc])   # <-- all non-negative
        return csr_matrix(np.asarray(rows, dtype=np.float32))

# -----------------------------
# Best classifier (word + char TF-IDF + lexicon + chi2 + LinearSVC)
# -----------------------------
def best_classifier(train_fold, k=12000, C=0.92):
    X = [t for (t, _) in train_fold]
    y = [y for (_, y) in train_fold]

    word_vect = TfidfVectorizer(
        tokenizer=tokenize_lemmatize,
        lowercase=False,
        ngram_range=(1, 2),
        min_df=2,
        max_df=1.0,
        sublinear_tf=True,
        dtype=np.float32
    )

    char_vect = TfidfVectorizer(
        analyzer='char',
        ngram_range=(3, 5),
        min_df=4,
        sublinear_tf=True,
        dtype=np.float32
    )

    lexicon_feats = OpinionLexiconFeatures(
        pos_path='positive-words.txt',
        neg_path='negative-words.txt',
        tokenizer=tokenize_lemmatize
    )

    combined_feats = FeatureUnion([
        ('word', word_vect),
        ('char', char_vect),
        ('lexicon', lexicon_feats),
    ])

    model = Pipeline([
        ('features', combined_feats),
        ('chi2', SelectKBest(chi2, k=k)),                 # supervised selection (non-negative only)
        ('svc', LinearSVC(C=C, random_state=42, class_weight='balanced'))
    ])

    model.fit(X, y)
    return model

# scores = cross_validate(train_data, folds=10, classifier_fn=best_classifier)

In [233]:
# Finally, check the accuracy of your classifier by training on all the traning data
# and testing on the test set
# Will only work once all functions are complete
from sklearn import metrics
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = best_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

    # # Confusion matrix + classification report
    # print("\nConfusion Matrix:")
    # print(metrics.confusion_matrix(test_true, test_pred))
    # print("\nClassification Report:")
    # print(classification_report(test_true, test_pred))

    # # Identify false positives and false negatives for positive label
    # false_positives = []
    # false_negatives = []

    # for text, true_label, pred_label in zip(test_data, test_true, test_pred):
    #     if true_label == "negative" and pred_label == "positive":
    #         false_positives.append(text)
    #     elif true_label == "positive" and pred_label == "negative":
    #         false_negatives.append(text)

    # Print a few examples
    # print(f"\nFalse Positives ({len(false_positives)}):")
    # for fp in false_positives[:10]:
    #     print("→", fp)

    # print(f"\nFalse Negatives ({len(false_negatives)}):")
    # for fn in false_negatives[:10]:
    #     print("→", fn)

('@chaddockr @LexStarwalker Your conversation on race/species in the #CypherSystem gave me much to think about. I may make a post on Google+.', 'positive')




Done training!
Precision: 0.890512
Recall: 0.888939
F Score:0.889495
