# In [None]:
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import matthews_corrcoef
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_data(train_file, test_file):
    """Load the train and test CSVs and clean the training rows.

    Exact duplicate rows and rows containing any missing value are removed
    from the training frame only; the test frame is returned exactly as read.

    Args:
        train_file: Path or file-like object accepted by ``pd.read_csv``.
        test_file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames. The training frame
        keeps the original row labels of the rows that survive cleaning.
    """
    raw_train = pd.read_csv(train_file)
    held_out = pd.read_csv(test_file)

    # Deduplicate first, then discard incomplete rows.
    cleaned_train = raw_train.drop_duplicates().dropna()

    return cleaned_train, held_out

def feature_engineering(train_data, test_data, kmer_size=10):
    """Turn the ``Sequence`` column of both frames into k-mer count features.

    Each sequence is broken into overlapping, lower-cased substrings of
    ``kmer_size`` characters, and a ``CountVectorizer`` counts how often each
    distinct k-mer occurs in each sequence.

    Args:
        train_data: DataFrame with a ``Sequence`` column of strings.
        test_data: DataFrame with a ``Sequence`` column of strings.
        kmer_size: Length of each k-mer. Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        A ``(X_train, X_test)`` tuple of count matrices sharing one column
        layout; rows follow the order of ``train_data`` and ``test_data``.

    Raises:
        ValueError: If ``kmer_size`` is smaller than 1.
    """
    if kmer_size < 1:
        raise ValueError("kmer_size must be a positive integer")

    def get_kmers(sequence, size=kmer_size):
        # Sequences shorter than ``size`` give an empty range, hence no k-mers.
        return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

    # NOTE: the vocabulary is learned from train and test sequences together,
    # so k-mers that occur only in the test set still receive a column.
    all_sequences = pd.concat([train_data['Sequence'], test_data['Sequence']], ignore_index=True)
    vectorizer = CountVectorizer(analyzer=get_kmers)
    X_all = vectorizer.fit_transform(all_sequences)

    # The training rows were stacked first, so a positional split recovers them.
    n_train = len(train_data)
    X_train = X_all[:n_train]
    X_test = X_all[n_train:]

    return X_train, X_test

def train_and_evaluate(X_train_full, y_train_full):
    """Report hold-out and training MCC, then return a model fitted on all data.

    A 20% hold-out split (fixed ``random_state=42``) is scored by a model that
    was fitted only on the remaining 80%, so the printed validation MCC
    reflects rows the model has not seen. The returned model is fitted on the
    complete training set, and the printed training MCC is measured for that
    final model on the data it was fitted on.

    Args:
        X_train_full: Feature matrix for every labelled sample.
        y_train_full: Encoded labels aligned with ``X_train_full``.

    Returns:
        The ``MultinomialNB`` model fitted on the full training data.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.2, random_state=42
    )

    # Fit on the training split only: fitting on the full set before scoring
    # the validation rows would leak them into training and inflate the score.
    holdout_model = MultinomialNB()
    holdout_model.fit(X_train, y_train)

    val_predictions = holdout_model.predict(X_val)
    mcc_val = matthews_corrcoef(y_val, val_predictions)
    print("Validation MCC:", mcc_val)

    # The model handed back to the caller uses every labelled row.
    nb_model = MultinomialNB()
    nb_model.fit(X_train_full, y_train_full)

    train_predictions = nb_model.predict(X_train_full)
    mcc_train = matthews_corrcoef(y_train_full, train_predictions)
    print("Training MCC:", mcc_train)

    return nb_model

def predict_and_save(nb_model, X_test, test_data, output_file, label_encoder=None):
    """Predict labels for the test features and write them to a CSV file.

    Args:
        nb_model: Fitted classifier exposing ``predict``.
        X_test: Feature matrix for the test samples.
        test_data: DataFrame with an ``ID`` column aligned with ``X_test``.
        output_file: Destination path for the CSV (columns ``ID``, ``Label``).
        label_encoder: The encoder that produced the training labels; its
            ``inverse_transform`` maps predictions back to the original label
            values. When omitted, the legacy behaviour is used: a new encoder
            is fitted on ``test_data["Label"]``. That requires the column to
            exist and only reproduces the training mapping if it holds the
            same set of classes.

    Raises:
        KeyError: If ``label_encoder`` is omitted and ``test_data`` has no
            ``Label`` column.
    """
    test_predictions = nb_model.predict(X_test)

    if label_encoder is None:
        # Legacy fallback kept so existing four-argument callers still work.
        label_encoder = LabelEncoder()
        label_encoder.fit(test_data["Label"])
    test_labels = label_encoder.inverse_transform(test_predictions)

    submission_df = pd.DataFrame({"ID": test_data["ID"], "Label": test_labels})
    submission_df.to_csv(output_file, index=False)

def main(train_file, test_file, output_file):
    """Run the pipeline: load, featurise, train, evaluate, predict, save.

    Args:
        train_file: Path to the training CSV (needs ``Sequence`` and ``Label``).
        test_file: Path to the test CSV (needs ``ID`` and ``Sequence``).
        output_file: Path where the submission CSV is written.
    """
    train_data, test_data = preprocess_data(train_file, test_file)
    X_train_full, X_test = feature_engineering(train_data, test_data)

    label_encoder = LabelEncoder()
    y_train_full = label_encoder.fit_transform(train_data["Label"])

    nb_model = train_and_evaluate(X_train_full, y_train_full)
    # Decode predictions with the encoder fitted on the training labels, so
    # the integer-to-label mapping is the one the model was trained with.
    predict_and_save(nb_model, X_test, test_data, output_file, label_encoder=label_encoder)

if __name__ == "__main__":
    # Command-line entry point: all three options are mandatory string paths,
    # so they are registered through one shared loop.
    cli = argparse.ArgumentParser(description="Naive Bayes Classifier for RNA Sequences")
    for flag, help_text in (
        ("--train_file", "Path to the training data file"),
        ("--test_file", "Path to the test data file"),
        ("--output_file", "Path to save the submission file"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    options = cli.parse_args()
    main(options.train_file, options.test_file, options.output_file)
