In [11]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import numpy as np
import codecs
import re
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline


def preprocess_text(text):
    # Add your preprocessing steps here
    # Example: Tokenization, removing stop words, etc.
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetic characters
    return text

# Load and preprocess text data
def load_pres(fname):
    alltxts = []
    alllabs = []
    s = open(fname, 'r', encoding='utf-8')
    for line in s:
        line = line.strip()  # Remove leading/trailing whitespace
        if not line:
            continue
        match = re.match(r"<\d+:\d+:(.)>", line)
        if match:
            label = match.group(1)
            text = re.sub(r"<\d+:\d+:.>", "", line)
            text = preprocess_text(text)
            alltxts.append(text)
            alllabs.append(-1 if 'M' in label else 1)
    return alltxts, alllabs

fname = "./datasets/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts, alllabs = load_pres(fname)

In [12]:
# Vectorization
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(alltxts)

# Train-test Split
X_train, X_test, y_train, y_test = train_test_split(X, alllabs, test_size=0.2, random_state=42)

# Oversampling
over_sampler = RandomOverSampler(sampling_strategy='minority')
X_over_train, y_over_train = over_sampler.fit_resample(X_train, y_train)

# Define pipeline with oversampling
pipeline = Pipeline(steps=[('model', DecisionTreeClassifier())])

# Train pipeline
pipeline.fit(X_over_train, y_over_train)

# Predict probabilities on test set
probas = pipeline.predict_proba(X_test)

# Save predicted probabilities to a file
np.savetxt("predicted_probabilities.txt", probas)

# Evaluate pipeline
y_pred = pipeline.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

F1 Score: 0.8864726901480348


In [17]:
def load_test_data(fname):
    test_texts = []
    with open(fname, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # Remove leading/trailing whitespace
            if not line:
                continue
            text = re.sub(r"<\d+:\d+:.>", "", line)  # Remove tags
            test_texts.append(text)
    return test_texts

test_fname = "./datasets/AFDpresidentutf8/corpus.tache1.test.utf8.txt"
test_texts = load_test_data(test_fname)

# Check if there are any test texts
if not test_texts:
    print("No test data found.")
else:
    # Vectorize test data using the same vectorizer
    X_test_data = vectorizer.transform(test_texts)

    # Check if there are any test samples
    if X_test_data.shape[0] == 0:
        print("No test samples found after preprocessing.")
    else:
        # Make predictions on test data
        test_probas = pipeline.predict_proba(X_test_data)

        # Save predicted probabilities on test data to a file
        np.savetxt("test_predicted_probabilities.txt", test_probas[:,0])

print("Class order:", pipeline.classes_)


Class order: [-1  1]
