In [56]:
import numpy as np
import nltk
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score

# Base directory of this experiment. The script reads training_set.csv, testing_set.csv and
# testing_val_set.csv from here and writes its results into the "n5" subfolder.
# The value ends with a path separator because file names are appended by plain concatenation.
source_path = "E:\\research\\experiments\\n-p\\n-gram baseline\\min_len\\bert-base_745_all\\"
# Label count passed to tdp.evaluation when the predictions are scored.
number_of_labels = 745

# Download the NLTK data used by the tokenizer (only needed once)
#nltk.download('punkt')

# Tokenizer handed to CountVectorizer
def tokenize(text):
    """Lower-case *text* and return the result of ``nltk.word_tokenize`` on it."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

# Read comments and labels from a tab-separated file
def load_data(file_path):
    """Load labelled comments from a tab-separated file.

    Each usable row has exactly two fields: the comment text and an integer
    label. Rows with any other number of fields (including blank lines) are
    skipped.

    Args:
        file_path: Path of the UTF-8 encoded, tab-separated input file.

    Returns:
        A tuple ``(comments, labels)`` of two equally long lists, the first
        holding the comment strings and the second the labels as ``int``.

    Raises:
        ValueError: If the second field of a two-field row is not an integer.
    """
    comments = []
    labels = []
    # newline='' lets the csv module do its own line handling, so line breaks
    # embedded in quoted fields reach the caller unmodified.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        for row in csv.reader(file, delimiter='\t'):
            if len(row) != 2:
                continue
            text, label = row
            comments.append(text)
            labels.append(int(label))
    return comments, labels

# Read the test comments from a tab-separated file
def load_test_comments(file_path):
    """Load comment ids and texts from a tab-separated file.

    The first field of a row is taken as the id and the second as the comment
    text; any further fields are ignored. Rows with fewer than two fields
    (including blank lines) are skipped.

    Args:
        file_path: Path of the UTF-8 encoded, tab-separated input file.

    Returns:
        A tuple ``(ids, comments)`` of two equally long lists of strings.
    """
    ids = []
    comments = []
    # newline='' lets the csv module do its own line handling, so line breaks
    # embedded in quoted fields reach the caller unmodified.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        for row in csv.reader(file, delimiter='\t'):
            if len(row) < 2:
                continue
            ids.append(row[0])
            comments.append(row[1])
    return ids, comments

# Write the comments and their predictions to a tab-separated file
def save_predictions(ids, comments, predictions, output_file_path):
    """Write one ``id<TAB>comment<TAB>prediction`` row per test comment.

    The three sequences are walked in lockstep; output stops at the end of the
    shortest one. An existing file at *output_file_path* is overwritten.

    Args:
        ids: Identifiers of the comments.
        comments: Comment texts.
        predictions: Predicted labels, one per comment.
        output_file_path: Destination path of the UTF-8 encoded file.
    """
    rows = zip(ids, comments, predictions)
    with open(output_file_path, 'w', encoding='utf-8', newline='') as out_file:
        csv.writer(out_file, delimiter='\t').writerows(rows)

# Build and train the n-gram model
def train_ngram_model(comments, labels, n=2):
    """Train a multinomial Naive Bayes classifier on word n-gram counts.

    The vectorizer is fitted on all *comments*; the classifier is fitted on a
    fixed 80% split (``random_state=42``) and scored on the remaining 20%.
    Log loss and accuracy of that held-out part are printed.

    Args:
        comments: Training texts.
        labels: Class label of each text, aligned with *comments*.
        n: Largest n-gram size; features cover 1-grams up to n-grams.

    Returns:
        A tuple ``(model, vectorizer)``. The model is the one fitted on the
        80% split, not on the full data.
    """
    vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1, n))
    X = vectorizer.fit_transform(comments)
    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=0.2, random_state=42
    )
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)

    # predict_proba has one column per class seen in y_train, while log_loss
    # would otherwise infer the classes from y_test. With many classes the two
    # sets rarely coincide after a random split, which makes log_loss raise.
    # Spread the probabilities over the full label set (probability 0 for
    # classes missing from the training split) and pass that set explicitly.
    all_labels = np.unique(labels)
    aligned_prob = np.zeros((y_pred_prob.shape[0], all_labels.size))
    aligned_prob[:, np.searchsorted(all_labels, model.classes_)] = y_pred_prob
    loss = log_loss(y_test, aligned_prob, labels=all_labels)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'Validation Log Loss: {loss}')
    print(f'Validation Accuracy: {accuracy}')
    return model, vectorizer

# Classify new comments
def classify_comments(comments, model, vectorizer):
    """Return the model's predictions for *comments*.

    Args:
        comments: Texts to classify.
        model: Fitted classifier; its ``predict`` method is called.
        vectorizer: Fitted vectorizer; its ``transform`` method is called.

    Returns:
        Whatever ``model.predict`` returns for the vectorized comments.
    """
    return model.predict(vectorizer.transform(comments))

# Script entry point: train on the training set, predict the test set, save the result
if __name__ == "__main__":
    # Load the training data (adjust source_path to your setup)
    comments, labels = load_data(source_path + "training_set.csv")

    # Train the model; n is the largest n-gram size, so 5 means 1- to 5-grams
    n = 5
    model, vectorizer = train_ngram_model(comments, labels, n)

    # Load the test comments (adjust source_path to your setup)
    ids, test_comments = load_test_comments(source_path + 'testing_set.csv')

    # Classify the test comments
    predictions = classify_comments(test_comments, model, vectorizer)

    # Save the results; keep the path in one place so the message below
    # always names the file that was actually written
    output_path = source_path + 'n5\\prediction.tsv'
    save_predictions(ids, test_comments, predictions, output_path)

    print(f'Predictions saved to {output_path}')

In [None]:
from trainingdata_preparation import tdp
# Append this experiment's evaluation to the shared comparison log.
comparison_log_path = source_path + "n5\\comparative_data.txt"
with open(comparison_log_path, 'a', encoding='utf-8') as cf:
    # The header is written before the evaluation runs.
    header = "==== Experiment: {}-gram-baseliner ====\n".format(n)
    cf.write(header)
    # tdp.evaluation receives the reference file, the prediction file and the label count;
    # its return value is written to the log as-is.
    report = tdp.evaluation(
        source_path + 'testing_val_set.csv',
        source_path + "n5\\prediction.tsv",
        number_of_labels,
    )
    cf.write(report)