# In [None]:  (Jupyter notebook cell marker left over from export)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import glob
from itertools import product

# Enumerate every DNA trimer (3-mer) over the alphabet A, C, G, T
def generate_trimers():
    """Return all 64 three-letter strings over A, C, G, T.

    The list is ordered with the first letter varying slowest and the last
    letter varying fastest ('AAA', 'AAC', 'AAG', 'AAT', 'ACA', ..., 'TTT').
    """
    alphabet = 'ACGT'
    return [
        first + second + third
        for first in alphabet
        for second in alphabet
        for third in alphabet
    ]

# k-mer encoding function (k defaults to 3, i.e. trimers)
def transform_sequence_kmer(sequence, k=3):
    """Encode a sequence as a vector of overlapping k-mer counts.

    Args:
        sequence: String to encode. Only windows made up entirely of the
            upper-case letters A, C, G, T are counted; any other window
            (e.g. one containing 'N' or lower-case letters) is ignored.
        k: Length of the k-mers to count. Must be at least 1.

    Returns:
        A list of 4**k integer counts, one per possible k-mer, ordered with
        the first letter varying slowest and the last letter fastest
        ('AAA', 'AAC', ..., 'TTT' for k=3).

    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Build the vocabulary for the requested k rather than always using
    # trimers; otherwise any k != 3 would never match and yield all zeros.
    kmer_counts = {''.join(letters): 0 for letters in product('ACGT', repeat=k)}

    # Slide a window of width k across the sequence; sequences shorter than
    # k produce an empty range and therefore an all-zero vector.
    for start in range(len(sequence) - k + 1):
        kmer = sequence[start:start + k]
        if kmer in kmer_counts:
            kmer_counts[kmer] += 1

    # Dicts preserve insertion order, so values follow the vocabulary order.
    return list(kmer_counts.values())

# Build a labelled k-mer feature table from two regions of every sequence
def load_and_transform_data(file_path, pos_range, neg_range):
    """Load sequences from a file and encode two regions of each as k-mer counts.

    The file is read whole and split on newlines; each resulting line is
    treated as one sequence. Ranges are (start, end) pairs interpreted as
    1-based and inclusive: the extracted region is ``line[start - 1:end]``.
    Lines shorter than a range's end position are skipped for that range.

    Args:
        file_path: Path of the text file holding the sequences.
        pos_range: (start, end) of the region labelled 1.
        neg_range: (start, end) of the region labelled 0.

    Returns:
        A DataFrame with one row per extracted region (all positive rows
        first, then all negative rows, re-indexed from 0), the k-mer count
        columns produced by ``transform_sequence_kmer`` and a 'label' column.
    """
    with open(file_path, 'r') as handle:
        lines = handle.read().split('\n')

    def encode_region(bounds, label):
        # Encode the given region of every sufficiently long line.
        start, end = bounds[0], bounds[1]
        rows = []
        for line in lines:
            if len(line) < end:
                continue
            rows.append(transform_sequence_kmer(line[start - 1:end]))
        frame = pd.DataFrame(rows)
        frame['label'] = label
        return frame

    positives = encode_region(pos_range, 1)
    negatives = encode_region(neg_range, 0)
    return pd.concat([positives, negatives], ignore_index=True)

# Classifier definitions
# Maps the display name reported in the 'Classifier' results column to the
# estimator instance that is trained and scored under that name.
# Both estimators use random_state=101, the same seed passed to
# train_test_split in evaluate_combinations.
# NOTE: these are module-level instances shared by every evaluation;
# evaluate_combinations calls fit() on them directly, so each one is refit
# in place for every combination.
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=101),
    'Decision Tree': DecisionTreeClassifier(random_state=101)
}

# Region pairs to evaluate, each written as (positive_range, negative_range).
# Every range is a (start, end) pair; load_and_transform_data slices it as
# seq[start - 1:end], i.e. 1-based and inclusive.
combinations = [((5000, 5100), (1, 100))]

# Train and score every classifier on every (positive, negative) region pair
def evaluate_combinations(file_path, combinations):
    """Fit each classifier on each region combination and tabulate its metrics.

    For every (pos_range, neg_range) pair the data is loaded with
    ``load_and_transform_data``, split 90/10 into train and test sets with a
    fixed seed, and each estimator in the module-level ``classifiers`` dict
    is fitted on the training part and scored on the held-out part.

    Args:
        file_path: Path of the sequence file to evaluate.
        combinations: Iterable of (pos_range, neg_range) pairs, each range
            being a (start, end) pair.

    Returns:
        A DataFrame with one row per (combination, classifier) holding the
        columns 'Classifier', 'Pos_Range', 'Neg_Range', 'Accuracy',
        'Precision', 'Recall' and 'F1 Score'.
    """
    rows = []
    for pos_range, neg_range in combinations:
        dataset = load_and_transform_data(file_path, pos_range, neg_range)
        features = dataset.drop(columns='label')
        labels = dataset['label']
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.1, random_state=101
        )

        # Range labels depend only on the combination, not on the classifier.
        pos_text = f"{pos_range[0]}-{pos_range[1]}"
        neg_text = f"{neg_range[0]}-{neg_range[1]}"

        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)
            predicted = clf.predict(X_test)

            # zero_division=0 scores a metric as 0 instead of warning when
            # its denominator is empty (e.g. no positive predictions).
            row = {
                'Classifier': name,
                'Pos_Range': pos_text,
                'Neg_Range': neg_text,
                'Accuracy': accuracy_score(y_test, predicted),
                'Precision': precision_score(y_test, predicted, zero_division=0),
                'Recall': recall_score(y_test, predicted, zero_division=0),
                'F1 Score': f1_score(y_test, predicted, zero_division=0),
            }
            rows.append(row)

    return pd.DataFrame(rows)

# Evaluate and collect results for the specific file
# These statements run at module level (i.e. whenever the file/cell is
# executed). The path is relative, so the file is looked up in the current
# working directory.
file_path = 'bacillus_amyloliquifaciens.txt'
results_df = evaluate_combinations(file_path, combinations)
print(f"Results for file: {file_path}")
# Print the full results table without the DataFrame's row index.
print(results_df.to_string(index=False))
