# One-hot encoding 

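One-hot encoding is a **binary representation** (1 or 0) of amino acids: each residue becomes a vector of 20 values with a single 1 marking its amino acid. The protein sequence is thus transformed into an L × 20 binary matrix (for a sequence of length L, assuming 20 standard amino acids).<p>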
Data can be one-hot encoded using scikit-learn's **OneHotEncoder** class. <br>
The documentation can be found here: [sklearn.preprocessing.OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) 

In [None]:
# Import dependencies 

import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from numpy import array

In [None]:
# The 20 standard amino acids, laid out as a (20, 1) column because
# scikit-learn encoders expect 2-D input (samples x features)
AMINO_ACIDS = np.array([[residue] for residue in "ACDEFGHIKLMNPQRSTVWY"])

# Dense-output encoder fitted on the standard residues only; with
# handle_unknown='ignore' any other character is encoded as an all-zero row
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(AMINO_ACIDS)


def one_hot_encode_sequence(sequence):
    """One-hot encode a single protein sequence.

    Every residue is passed through the module-level ``encoder``, which is
    fitted on the 20 standard amino acids.

    Args:
        sequence: Protein sequence as a string (or other iterable) of
            one-letter residue codes.

    Returns:
        A 2-D NumPy array with one row per residue and one column per
        category known to ``encoder``. Residues the encoder was not fitted
        on yield an all-zero row, because the encoder is configured with
        ``handle_unknown='ignore'``. An empty sequence yields an array with
        zero rows.

    Raises:
        TypeError: If ``sequence`` is not iterable (for example a NaN float
            coming from an empty CSV cell).
    """
    residues = list(sequence)

    if not residues:
        # scikit-learn's input validation rejects arrays with zero samples,
        # so build the empty (0, n_categories) result directly.
        n_categories = len(encoder.categories_[0])
        return np.zeros((0, n_categories), dtype=encoder.dtype)

    # One residue per row: the encoder expects a 2-D (n_samples, 1) column
    residue_column = np.array(residues).reshape(-1, 1)
    return encoder.transform(residue_column)

In [None]:
# Load dataset; sequences are read from the column named in SEQUENCE_COLUMN
protein_sequences_file = 'Example_Data.csv'  # CSV file path
SEQUENCE_COLUMN = 'sequence_Swapped_50_55'  # Column holding the protein sequences
df = pd.read_csv(protein_sequences_file)

# Apply one-hot encoding to every sequence (one 2-D matrix per sequence)
per_sequence_matrices = [one_hot_encode_sequence(seq) for seq in df[SEQUENCE_COLUMN]]

# A regular 3-D array needs at least one sequence and a single common length;
# fail early with a clear message instead of an obscure NumPy shape error
if not per_sequence_matrices:
    raise ValueError(
        f"Column '{SEQUENCE_COLUMN}' of '{protein_sequences_file}' contains no sequences."
    )
sequence_lengths = {matrix.shape[0] for matrix in per_sequence_matrices}
if len(sequence_lengths) > 1:
    raise ValueError(
        f"All sequences in column '{SEQUENCE_COLUMN}' must have the same length "
        f"to be stacked; found lengths {sorted(sequence_lengths)}."
    )

# Stack into shape (num_sequences, L, num_categories)
encoded_sequences = np.stack(per_sequence_matrices)

# Reshape into a 2D array: one flattened row of L * num_categories values per sequence
num_sequences, seq_length, num_categories = encoded_sequences.shape
encoded_sequences_flat = encoded_sequences.reshape(num_sequences, seq_length * num_categories)

In [None]:
# Save the encoded array as a header-less CSV file (pass header=None when
# loading it back). One-hot values are exactly 0 or 1, so writing them as
# integers is lossless and avoids savetxt's default '%.18e' format, which
# spends about 25 characters on every value.
np.savetxt("one_hot_encoded.csv", encoded_sequences_flat, fmt="%d", delimiter=",")

In [None]:
# Sanity check: the expected shape is (num_sequences, L, 20)
print(np.shape(encoded_sequences))