**TEMPLATE BELOW**

In [None]:
"""
import numpy as np
import pandas as pd


# these are dummy models
class MLModel():
    def __init__(self) -> None:
        pass

    def train(self, X, y):
        NotImplemented

    def predict(self, X):
        NotImplemented

class TextSeqModel(MLModel):
    def __init__(self) -> None:
        pass

    def predict(self, X):# random predictions
        return np.random.randint(0,2,(len(X)))


class EmoticonModel(MLModel):
    def __init__(self) -> None:
        pass

    def predict(self, X):# random predictions
        return np.random.randint(0,2,(len(X)))

class FeatureModel(MLModel):
    def __init__(self) -> None:
        pass

    def predict(self, X): # random predictions
        return np.random.randint(0,2,(len(X)))

class CombinedModel(MLModel):
    def __init__(self) -> None:
        pass

    def predict(self, X1, X2, X3): # random predictions
        return np.random.randint(0,2,(len(X1)))


def save_predictions_to_file(predictions, filename):
    with open(filename, 'w') as f:
        for pred in predictions:
            f.write(f"{pred}\n")

if __name__ == '__main__':
    # read datasets
    test_feat_X = np.load("datasets/test/test_feature.npz", allow_pickle=True)['features']
    test_emoticon_X = pd.read_csv("datasets/test/test_emoticon.csv")['input_emoticon'].tolist()
    test_seq_X = pd.read_csv("datasets/test/test_text_seq.csv")['input_str'].tolist()

    # your trained models
    feature_model = FeatureModel()
    text_model = TextSeqModel()
    emoticon_model  = EmoticonModel()
    best_model = CombinedModel()

    # predictions from your trained models
    pred_feat = feature_model.predict(test_feat_X)
    pred_emoticons = emoticon_model.predict(test_emoticon_X)
    pred_text = text_model.predict(test_seq_X)
    pred_combined = best_model.predict(test_feat_X, test_emoticon_X, test_seq_X)

    # saving prediction to text files
    save_predictions_to_file(pred_feat, "pred_feat.txt")
    save_predictions_to_file(pred_emoticons, "pred_emoticon.txt")
    save_predictions_to_file(pred_text, "pred_text.txt")
    save_predictions_to_file(pred_combined, "pred_combined.txt")
  """

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the
# /content/drive/MyDrive/... dataset and model paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import joblib  # For saving the models


In [None]:
# Load datasets
def load_data(base_dir="/content/drive/MyDrive/datasets"):
    """Load the training and validation splits of all three datasets.

    Args:
        base_dir: Directory that contains the ``train/`` and ``valid/``
            sub-folders. Defaults to the Google Drive location used by this
            notebook, so existing ``load_data()`` calls keep working.

    Returns:
        Six ``(X, Y)`` tuples in this order: train emoticon, train text
        sequence, train feature, validation emoticon, validation text
        sequence, validation feature. The emoticon and text-sequence entries
        are Python lists (``Series.tolist()``); the feature entries are the
        arrays stored under ``features`` and ``label`` in the ``.npz`` files.
    """

    def read_csv_pair(split, name, input_column):
        # NOTE(review): no dtype is passed, so pandas infers the column type.
        # Confirm that `input_str` really arrives as strings if precision or
        # leading zeros matter downstream.
        frame = pd.read_csv(os.path.join(base_dir, split, f"{split}_{name}.csv"))
        return frame[input_column].tolist(), frame['label'].tolist()

    def read_feature_pair(split):
        path = os.path.join(base_dir, split, f"{split}_feature.npz")
        # NOTE(security): allow_pickle=True lets a crafted file run arbitrary
        # code on load; only use it with archives you trust.
        # The context manager closes the archive's file handle once both
        # arrays have been read into memory.
        with np.load(path, allow_pickle=True) as archive:
            return archive['features'], archive['label']

    train_emoticon = read_csv_pair("train", "emoticon", "input_emoticon")
    train_seq = read_csv_pair("train", "text_seq", "input_str")
    train_feat = read_feature_pair("train")

    val_emoticon = read_csv_pair("valid", "emoticon", "input_emoticon")
    val_seq = read_csv_pair("valid", "text_seq", "input_str")
    val_feat = read_feature_pair("valid")

    return train_emoticon, train_seq, train_feat, val_emoticon, val_seq, val_feat


INFERIOR MODEL BELOW

In [None]:
def extract_emoticon_features(emoticon_list, encoder=None):
    """One-hot encode emoticon strings, treating each whole string as one category.

    Args:
        emoticon_list: Sequence of emoticon strings.
        encoder: An already fitted encoder to reuse. When ``None`` a new
            ``OneHotEncoder`` is fitted on ``emoticon_list``.

    Returns:
        ``(encoded, encoder)``: the encoded matrix and the encoder that
        produced it.
    """
    # Single-column layout: one row per sample.
    column = np.array(emoticon_list).reshape(-1, 1)

    if encoder is not None:
        return encoder.transform(column), encoder

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # One-hot encoding
    return encoder.fit_transform(column), encoder

In [None]:
def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training data and return its accuracy on the validation data.

    ``model`` is fitted in place and must expose ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))

In [None]:
if __name__ == '__main__':
    # Baseline experiment: one classical model per dataset, trained on the
    # full training set and scored on the validation set.
    (train_emoticon_X, train_emoticon_Y), (train_seq_X, train_seq_Y), (train_feat_X, train_feat_Y), \
    (val_emoticon_X, val_emoticon_Y), (val_seq_X, val_seq_Y), (val_feat_X, val_feat_Y) = load_data()

    # One-hot encode the emoticon strings. The encoder is fitted on the
    # training data only and then reused for the validation data.
    train_emoticon_X_encoded, encoder = extract_emoticon_features(train_emoticon_X)
    val_emoticon_X_encoded, _ = extract_emoticon_features(val_emoticon_X, encoder)

    # Initialize models
    lr_model = LogisticRegression(max_iter=1000)
    svm_model = SVC()
    dt_model = DecisionTreeClassifier()

    # Train models and evaluate
    emoticon_accuracy = train_and_evaluate(lr_model, train_emoticon_X_encoded, train_emoticon_Y,
                                           val_emoticon_X_encoded, val_emoticon_Y)

    # Every text sequence is handed to the SVM as a single-column sample.
    seq_accuracy = train_and_evaluate(svm_model, np.array(train_seq_X).reshape(-1, 1), train_seq_Y,
                                       np.array(val_seq_X).reshape(-1, 1), val_seq_Y)

    # Flatten 3-D feature tensors to (samples, features) before fitting.
    if train_feat_X.ndim == 3:
        train_feat_X = train_feat_X.reshape(train_feat_X.shape[0], -1)

    if val_feat_X.ndim == 3:
        val_feat_X = val_feat_X.reshape(val_feat_X.shape[0], -1)

    feat_accuracy = train_and_evaluate(dt_model, train_feat_X, train_feat_Y,
                                       val_feat_X, val_feat_Y)

    # Define the directory where you want to save the models
    save_dir = '/content/drive/MyDrive/Task1 Models/Full/'

    # exist_ok=True avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)

    # Save models to the specified directory in Google Drive
    joblib.dump(lr_model, os.path.join(save_dir, 'emoticon_model.pkl'))
    # The emoticon model only accepts input produced by this fitted encoder,
    # so the encoder is persisted next to it.
    joblib.dump(encoder, os.path.join(save_dir, 'emoticon_encoder.pkl'))
    joblib.dump(svm_model, os.path.join(save_dir, 'text_seq_model.pkl'))
    joblib.dump(dt_model, os.path.join(save_dir, 'feature_model.pkl'))

    # Print the accuracies
    print(f"Emoticon Model (LR) Accuracy: {emoticon_accuracy}")
    print(f"Text Sequence Model (SVM) Accuracy: {seq_accuracy}")
    print(f"Feature Model (Decision Tree) Accuracy: {feat_accuracy}")

Emoticon Model (LR) Accuracy: 0.5153374233128835
Text Sequence Model (SVM) Accuracy: 0.5255623721881391
Feature Model (Decision Tree) Accuracy: 0.9631901840490797


ANOTHER INFERIOR APPROACH

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import joblib
import os

# Assuming you have load_data(), get_data_split(), preprocess_emoticon_data(), and train_and_evaluate() functions
def handle_invalid_values(data):
    """Return ``data`` as a new array with every NaN and +/-inf entry set to 0.

    The input is copied by ``np.array``, so the caller's object is not modified.
    """
    cleaned = np.array(data)
    invalid = np.isinf(cleaned) | np.isnan(cleaned)
    cleaned[invalid] = 0
    return cleaned

def preprocess_emoticon_data(data, encoder=None):
    """One-hot encode emoticon strings and return a dense matrix.

    Args:
        data: Sequence of emoticon strings; each whole string is one category.
        encoder: Fitted encoder to reuse, or ``None`` to fit a new
            ``OneHotEncoder`` on ``data``.

    Returns:
        ``(dense_matrix, encoder)``.
    """
    # One sample per row, single column.
    column = np.array(data).reshape(-1, 1)

    if encoder is None:
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoded = encoder.fit_transform(column)
    else:
        encoded = encoder.transform(column)

    # The encoder output is converted to a dense array for the caller.
    return encoded.toarray(), encoder

def scale_data(train_data, val_data):
    """Standardise both sets with a ``StandardScaler`` fitted on ``train_data`` only.

    Returns:
        ``(train_scaled, val_scaled)``. The fitted scaler itself is not returned.
    """
    scaler = StandardScaler().fit(train_data)
    return scaler.transform(train_data), scaler.transform(val_data)

if __name__ == '__main__':
    # Learning-curve experiment: train every classical model on 20%..100% of
    # each dataset and report validation accuracy.
    #
    # NOTE: this cell needs load_data() and the scikit-learn flavoured
    # train_and_evaluate(model, X_train, y_train, X_val, y_val) from earlier
    # cells, plus get_data_split(), which this notebook only defines in a
    # later cell. Later cells also redefine train_and_evaluate for Keras
    # models, which is not compatible with the estimators used here.

    # Load data
    (train_emoticon_X, train_emoticon_Y), (train_seq_X, train_seq_Y), (train_feat_X, train_feat_Y), \
    (val_emoticon_X, val_emoticon_Y), (val_seq_X, val_seq_Y), (val_feat_X, val_feat_Y) = load_data()

    # Define splits
    splits = [0.2, 0.4, 0.6, 0.8, 1.0]

    # Define models for all three datasets. The same estimator objects are
    # re-fitted for each dataset, so each one is saved immediately after it
    # has been trained on the dataset named in the file.
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "SVM": SVC(),
        "Decision Tree": DecisionTreeClassifier()
    }

    # ---- Work that does not depend on the split is done once up front ----
    # Reshape features if 3D
    if train_feat_X.ndim == 3:
        train_feat_X = train_feat_X.reshape(train_feat_X.shape[0], -1)
    if val_feat_X.ndim == 3:
        val_feat_X = val_feat_X.reshape(val_feat_X.shape[0], -1)

    # Handle invalid values in the validation data
    val_seq_X = handle_invalid_values(val_seq_X)
    val_feat_X = handle_invalid_values(val_feat_X)

    for split in splits:
        percent = int(split * 100)
        print(f"Training with {percent}% of the data...")

        # Directory for this split's models
        model_save_path = f'/content/Task1 Models/{percent}'
        os.makedirs(model_save_path, exist_ok=True)  # Create directory if it doesn't exist

        accuracies = {"Emoticon": {}, "Text Sequence": {}, "Feature": {}}

        # ------------------- Emoticon Data -------------------
        train_emoticon_X_split, train_emoticon_Y_split = get_data_split(train_emoticon_X, train_emoticon_Y, split)
        # Fit a fresh encoder for every split. Reusing the encoder from the
        # first (20%) split would hide categories that only appear in the
        # larger splits.
        train_emoticon_X_encoded, encoder = preprocess_emoticon_data(train_emoticon_X_split)
        val_emoticon_X_encoded, _ = preprocess_emoticon_data(val_emoticon_X, encoder)
        # The emoticon models only accept input produced by this encoder.
        joblib.dump(encoder, f'{model_save_path}/emoticon_encoder_{percent}.pkl')

        for model_name, model in models.items():
            accuracy = train_and_evaluate(model, train_emoticon_X_encoded, train_emoticon_Y_split,
                                          val_emoticon_X_encoded, val_emoticon_Y)
            accuracies["Emoticon"][model_name] = accuracy
            joblib.dump(model, f'{model_save_path}/emoticon_model_{model_name}_{percent}.pkl')

        # ------------------- Text Sequence Data -------------------
        train_seq_X_split, train_seq_Y_split = get_data_split(train_seq_X, train_seq_Y, split)
        train_seq_X_split = handle_invalid_values(train_seq_X_split)

        # Scale sequence data (each sequence is a single-column sample).
        # NOTE(review): scale_data does not return its scaler, so the saved
        # models cannot reproduce this preprocessing at inference time.
        train_seq_X_scaled, val_seq_X_scaled = scale_data(np.array(train_seq_X_split).reshape(-1, 1), np.array(val_seq_X).reshape(-1, 1))

        for model_name, model in models.items():
            accuracy = train_and_evaluate(model, train_seq_X_scaled, train_seq_Y_split, val_seq_X_scaled, val_seq_Y)
            accuracies["Text Sequence"][model_name] = accuracy
            joblib.dump(model, f'{model_save_path}/text_seq_model_{model_name}_{percent}.pkl')

        # ------------------- Feature Data -------------------
        train_feat_X_split, train_feat_Y_split = get_data_split(train_feat_X, train_feat_Y, split)
        train_feat_X_split = handle_invalid_values(train_feat_X_split)

        # Scale feature data
        train_feat_X_scaled, val_feat_X_scaled = scale_data(train_feat_X_split, val_feat_X)

        for model_name, model in models.items():
            accuracy = train_and_evaluate(model, train_feat_X_scaled, train_feat_Y_split, val_feat_X_scaled, val_feat_Y)
            accuracies["Feature"][model_name] = accuracy
            joblib.dump(model, f'{model_save_path}/feature_model_{model_name}_{percent}.pkl')

        # Print results
        print(f"Results with {percent}% of the data:")
        print(f"Emoticon Model Accuracies: {accuracies['Emoticon']}")
        print(f"Text Sequence Model Accuracies: {accuracies['Text Sequence']}")
        print(f"Feature Model Accuracies: {accuracies['Feature']}")


Training with 20% of the data...
Results with 20% of the data:
Emoticon Model Accuracies: {'Logistic Regression': 0.5153374233128835, 'SVM': 0.5153374233128835, 'Decision Tree': 0.5153374233128835}
Text Sequence Model Accuracies: {'Logistic Regression': 0.5173824130879345, 'SVM': 0.49897750511247446, 'Decision Tree': 0.49284253578732107}
Feature Model Accuracies: {'Logistic Regression': 0.9529652351738241, 'SVM': 0.967280163599182, 'Decision Tree': 0.9059304703476483}
Training with 40% of the data...
Results with 40% of the data:
Emoticon Model Accuracies: {'Logistic Regression': 0.48466257668711654, 'SVM': 0.48466257668711654, 'Decision Tree': 0.48466257668711654}
Text Sequence Model Accuracies: {'Logistic Regression': 0.5030674846625767, 'SVM': 0.5071574642126789, 'Decision Tree': 0.50920245398773}
Feature Model Accuracies: {'Logistic Regression': 0.9652351738241309, 'SVM': 0.9754601226993865, 'Decision Tree': 0.9406952965235174}
Training with 60% of the data...
Results with 60% of t

**Below is an LSTM neural net that feeds raw Unicode code points (called "utf-8 encoding" in the code) through an Embedding layer for the emoticon dataset; very slow.**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

# Convert emoji string into a list of Unicode (UTF-8) encoded integers
def utf8_encode_emoticon_data(emoticon_data, max_length=13):
    """Turn each emoticon string into a fixed-length row of integers.

    Despite the name, no UTF-8 byte encoding happens: every character is
    mapped to its Unicode code point with ``ord``. Rows are then passed to
    ``pad_sequences`` with ``maxlen=max_length`` and ``padding='post'``.

    Returns:
        The array produced by ``pad_sequences``.
    """
    code_point_rows = [list(map(ord, sequence)) for sequence in emoticon_data]
    return pad_sequences(code_point_rows, maxlen=max_length, padding='post')

def get_data_split(X, Y, split_ratio):
    """Return a reproducible random subset of the training data.

    Args:
        X: Training inputs.
        Y: Training labels, aligned with ``X``.
        split_ratio: Fraction of the data to keep. Values >= 1.0 keep
            everything.

    Returns:
        ``(train_X, train_Y)`` holding ``split_ratio`` of the samples,
        selected with ``random_state=42``.

    Raises:
        ValueError: If ``split_ratio`` is not positive.
    """
    if split_ratio <= 0:
        raise ValueError(f"split_ratio must be positive, got {split_ratio}")

    if split_ratio >= 1.0:
        # train_test_split cannot produce an empty hold-out set, so "100%" is
        # served by shuffling the full data set instead of discarding 10% of it.
        from sklearn.utils import shuffle  # local import: not part of this cell's imports
        train_X, train_Y = shuffle(X, Y, random_state=42)
        return train_X, train_Y

    train_X, _, train_Y, _ = train_test_split(X, Y, test_size=1 - split_ratio, random_state=42)
    return train_X, train_Y

def build_lstm_model(max_length, vocab_size):
    """Build and compile an Embedding + LSTM binary classifier.

    Args:
        max_length: Length of every (padded) input sequence.
        vocab_size: Number of rows in the embedding table; must be larger
            than the biggest integer that appears in the input.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    # Local import: this cell's import list does not bring in Input.
    from tensorflow.keras.layers import Input

    model = Sequential()
    # Declare the input explicitly; the Embedding `input_length` argument is
    # deprecated in current Keras.
    model.add(Input(shape=(max_length,), dtype="int32"))
    model.add(Embedding(input_dim=vocab_size, output_dim=128))  # Embedding Layer
    model.add(SpatialDropout1D(0.2))  # Dropout Layer to prevent overfitting
    model.add(LSTM(100))  # LSTM Layer
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile model
    return model

def train_and_evaluate(model, train_X, train_Y, val_X, val_Y):
    """Train a Keras binary classifier and return its validation accuracy.

    The model is fitted for 5 epochs with batch size 32. Predicted
    probabilities above 0.5 are counted as class 1.
    """
    print(f"train_X shape: {train_X.shape}, val_X shape: {val_X.shape}")

    labels = np.array(train_Y)
    model.fit(train_X, labels, epochs=5, batch_size=32, verbose=1)  # Train for a few epochs

    # Threshold the sigmoid outputs to hard 0/1 labels.
    probabilities = model.predict(val_X)
    predicted_labels = (probabilities > 0.5).astype("int32")
    return accuracy_score(val_Y, predicted_labels)

def check_for_invalid_values(data):
    """Print a warning when ``data`` contains any item that is not a ``str``.

    Nothing is returned and nothing is raised; the check is informational only.
    """
    has_non_string = any(not isinstance(item, str) for item in data)
    if has_non_string:
        print("Data contains non-string values.")

if __name__ == '__main__':
    # Learning-curve experiment for the Embedding + LSTM emoticon model.
    (train_emoticon_X, train_emoticon_Y), (train_seq_X, train_seq_Y), (train_feat_X, train_feat_Y), \
    (val_emoticon_X, val_emoticon_Y), (val_seq_X, val_seq_Y), (val_feat_X, val_feat_Y) = load_data()

    check_for_invalid_values(train_emoticon_X)
    check_for_invalid_values(val_emoticon_X)

    splits = [0.2, 0.4, 0.6, 0.8, 1.0]
    max_length = 13  # Adjust based on your data

    # The validation encoding does not depend on the split, so compute it once.
    val_emoticon_X_encoded = utf8_encode_emoticon_data(val_emoticon_X, max_length=max_length)

    for split in splits:
        print(f"\nTraining with {int(split * 100)}% of the data...\n")

        train_emoticon_X_split, train_emoticon_Y_split = get_data_split(train_emoticon_X, train_emoticon_Y, split)

        # Encode each emoticon as its Unicode code point
        train_emoticon_X_encoded = utf8_encode_emoticon_data(train_emoticon_X_split, max_length=max_length)

        # The embedding table must cover every code point the model will see,
        # including validation-only ones; sizing it from the training split
        # alone would make such a code point index outside the table.
        vocab_size = int(max(train_emoticon_X_encoded.max(), val_emoticon_X_encoded.max())) + 1

        # Build the LSTM model
        model = build_lstm_model(max_length, vocab_size)

        # Train the model and evaluate accuracy
        accuracy = train_and_evaluate(model, train_emoticon_X_encoded, train_emoticon_Y_split,
                                      val_emoticon_X_encoded, val_emoticon_Y)
        print(f"Accuracy with {int(split * 100)}% of the data: {accuracy:.2f}")



Training with 20% of the data...

train_X shape: (1416, 13), val_X shape: (489, 13)
Epoch 1/5




[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 291ms/step - accuracy: 0.5218 - loss: 0.6924
Epoch 2/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 291ms/step - accuracy: 0.7254 - loss: 0.6440
Epoch 3/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 267ms/step - accuracy: 0.8568 - loss: 0.3881
Epoch 4/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 303ms/step - accuracy: 0.9012 - loss: 0.2719
Epoch 5/5
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 276ms/step - accuracy: 0.9050 - loss: 0.2508
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
Accuracy with 20% of the data: 0.87

Training with 40% of the data...

train_X shape: (2832, 13), val_X shape: (489, 13)
Epoch 1/5




[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 282ms/step - accuracy: 0.5391 - loss: 0.6864
Epoch 2/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 295ms/step - accuracy: 0.8286 - loss: 0.3850
Epoch 3/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 285ms/step - accuracy: 0.8969 - loss: 0.2576
Epoch 4/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 277ms/step - accuracy: 0.9014 - loss: 0.2328
Epoch 5/5
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 275ms/step - accuracy: 0.9206 - loss: 0.1939
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step
Accuracy with 40% of the data: 0.90

Training with 60% of the data...

train_X shape: (4248, 13), val_X shape: (489, 13)
Epoch 1/5




[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 278ms/step - accuracy: 0.5594 - loss: 0.6695
Epoch 2/5
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 314ms/step - accuracy: 0.8629 - loss: 0.3302
Epoch 3/5
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 277ms/step - accuracy: 0.9138 - loss: 0.2100
Epoch 4/5
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 284ms/step - accuracy: 0.9303 - loss: 0.1732
Epoch 5/5
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 277ms/step - accuracy: 0.9390 - loss: 0.1441
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
Accuracy with 60% of the data: 0.90

Training with 80% of the data...

train_X shape: (5664, 13), val_X shape: (489, 13)
Epoch 1/5




[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 275ms/step - accuracy: 0.6107 - loss: 0.6268
Epoch 2/5
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 296ms/step - accuracy: 0.8835 - loss: 0.2799
Epoch 3/5
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 279ms/step - accuracy: 0.9193 - loss: 0.1881
Epoch 4/5
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 279ms/step - accuracy: 0.9377 - loss: 0.1394
Epoch 5/5
[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 278ms/step - accuracy: 0.9570 - loss: 0.1093
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Accuracy with 80% of the data: 0.91

Training with 100% of the data...

train_X shape: (6372, 13), val_X shape: (489, 13)
Epoch 1/5




[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 289ms/step - accuracy: 0.6077 - loss: 0.6149
Epoch 2/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 278ms/step - accuracy: 0.8966 - loss: 0.2466
Epoch 3/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 293ms/step - accuracy: 0.9313 - loss: 0.1702
Epoch 4/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 281ms/step - accuracy: 0.9418 - loss: 0.1342
Epoch 5/5
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 284ms/step - accuracy: 0.9526 - loss: 0.1123
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step
Accuracy with 100% of the data: 0.97


**Below is an LSTM neural net that converts emoticons to Unicode code points (called "utf-8" in the code) and then one-hot encodes them for the emoticon dataset; very fast.**

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Convert emoji string into a list of Unicode (UTF-8) encoded integers
def utf8_encode_emoticon_data(emoticon_data, max_length=13):
    """Encode emoticon strings as padded code-point rows and collect the vocabulary.

    Each character is mapped to its Unicode code point with ``ord`` (no UTF-8
    byte encoding is involved, despite the name).

    Returns:
        ``(padded, vocabulary)``: the ``pad_sequences`` output
        (``maxlen=max_length``, ``padding='post'``) and the sorted list of
        distinct code points seen in ``emoticon_data``.
    """
    encoded_rows = [[ord(symbol) for symbol in sequence] for sequence in emoticon_data]

    # Union of every code point that occurs in any row.
    seen_code_points = set().union(*encoded_rows)

    padded = pad_sequences(encoded_rows, maxlen=max_length, padding='post')
    return padded, sorted(seen_code_points)

# Create one-hot encoding for sequences based on unique Unicode values
def one_hot_encode_sequences(padded_data, unique_emojis, max_length):
    """One-hot encode integer sequences against a fixed vocabulary.

    Args:
        padded_data: 2-D collection of integer codes, one row per sample.
        unique_emojis: Ordered vocabulary; position ``k`` becomes one-hot column ``k``.
        max_length: Number of time steps in the output.

    Returns:
        ``float32`` array of shape ``(len(padded_data), max_length, len(unique_emojis))``.
        Codes that are not in ``unique_emojis`` are left as all-zero vectors.
    """
    column_of = {code: column for column, code in enumerate(unique_emojis)}
    tensor = np.zeros((len(padded_data), max_length, len(unique_emojis)), dtype='float32')

    for row, sequence in enumerate(padded_data):
        for step, code in enumerate(sequence):
            column = column_of.get(code)
            if column is not None:
                tensor[row, step, column] = 1.0

    return tensor

# Filter out sequences containing the missing data emoji (🛓) and filter the corresponding labels
def filter_sequences(emoticon_data, labels, placeholder_emoji='🛓'):
    """Drop every sequence that contains ``placeholder_emoji``, together with its label.

    Returns:
        ``(kept_sequences, kept_labels)`` as two lists in the original order.
    """
    kept_pairs = [
        (sequence, labels[index])
        for index, sequence in enumerate(emoticon_data)
        if placeholder_emoji not in sequence
    ]
    kept_sequences = [sequence for sequence, _ in kept_pairs]
    kept_labels = [label for _, label in kept_pairs]
    return kept_sequences, kept_labels

def get_data_split(X, Y, split_ratio):
    """Select a reproducible random fraction of the training data.

    Args:
        X: Training inputs.
        Y: Training labels, aligned with ``X``.
        split_ratio: Fraction of samples to keep; anything >= 1.0 keeps all of them.

    Returns:
        ``(train_X, train_Y)`` containing ``split_ratio`` of the samples,
        drawn with ``random_state=42``.

    Raises:
        ValueError: If ``split_ratio`` is not positive.
    """
    if split_ratio <= 0:
        raise ValueError(f"split_ratio must be positive, got {split_ratio}")

    if split_ratio >= 1.0:
        # train_test_split always holds some samples out, so the full data
        # set is shuffled and returned whole rather than trimmed to 90%.
        from sklearn.utils import shuffle  # local import: not part of this cell's imports
        train_X, train_Y = shuffle(X, Y, random_state=42)
        return train_X, train_Y

    train_X, _, train_Y, _ = train_test_split(X, Y, test_size=1 - split_ratio, random_state=42)
    return train_X, train_Y

def build_lstm_model(max_length, vocab_size):
    """Build and compile an LSTM binary classifier for one-hot encoded sequences.

    Args:
        max_length: Number of time steps per sample.
        vocab_size: Width of each one-hot vector.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    # Local import: this cell's import list does not bring in Input.
    from tensorflow.keras.layers import Input

    model = Sequential()
    # The input shape belongs on the first layer of the model. Passing
    # `input_shape` to the LSTM (the second layer) triggers a Keras warning.
    model.add(Input(shape=(max_length, vocab_size)))
    model.add(SpatialDropout1D(0.2))  # Dropout Layer to prevent overfitting
    model.add(LSTM(100))  # LSTM Layer
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile model
    return model

def train_and_evaluate(model, train_X, train_Y, val_X, val_Y):
    """Fit a Keras binary classifier (5 epochs, batch size 32) and score it.

    Returns:
        Validation accuracy, computed after thresholding the model's outputs at 0.5.
    """
    print(f"train_X shape: {train_X.shape}, val_X shape: {val_X.shape}")

    model.fit(train_X, np.array(train_Y), epochs=5, batch_size=32, verbose=1)  # Train for a few epochs

    # Convert sigmoid outputs to hard 0/1 labels before scoring.
    hard_labels = (model.predict(val_X) > 0.5).astype("int32")
    validation_accuracy = accuracy_score(val_Y, hard_labels)
    return validation_accuracy

if __name__ == '__main__':
    # Learning-curve experiment for the one-hot + LSTM emoticon model.
    # Relies on load_data() from an earlier cell.
    (train_emoticon_X, train_emoticon_Y), (train_seq_X, train_seq_Y), (train_feat_X, train_feat_Y), \
    (val_emoticon_X, val_emoticon_Y), (val_seq_X, val_seq_Y), (val_feat_X, val_feat_Y) = load_data()

    # Drop every sequence containing the 🛓 placeholder emoji, with its label.
    # NOTE: the validation set is filtered too (the cell output shows 475
    # validation rows here versus 489 in the previous experiment), so these
    # accuracies are measured on a slightly different set.
    train_emoticon_X_filtered, train_emoticon_Y_filtered = filter_sequences(train_emoticon_X, train_emoticon_Y, placeholder_emoji='🛓')
    val_emoticon_X_filtered, val_emoticon_Y_filtered = filter_sequences(val_emoticon_X, val_emoticon_Y, placeholder_emoji='🛓')

    # Fractions of the filtered training data to train on.
    splits = [0.2, 0.4, 0.6, 0.8, 1.0]
    max_length = 13  # Adjust based on your data

    for split in splits:
        print(f"\nTraining with {int(split * 100)}% of the data...\n")

        # Get the split of the filtered training data
        train_emoticon_X_split, train_emoticon_Y_split = get_data_split(train_emoticon_X_filtered, train_emoticon_Y_filtered, split)

        # Map emojis to code points. The vocabulary (unique_emojis) comes from
        # the training split only; the validation vocabulary is discarded.
        train_emoticon_X_encoded, unique_emojis = utf8_encode_emoticon_data(train_emoticon_X_split, max_length=max_length)
        val_emoticon_X_encoded, _ = utf8_encode_emoticon_data(val_emoticon_X_filtered, max_length=max_length)

        # One-hot encode both sets against the training vocabulary, so both
        # tensors share the same last dimension.
        train_emoticon_X_one_hot = one_hot_encode_sequences(train_emoticon_X_encoded, unique_emojis, max_length)
        val_emoticon_X_one_hot = one_hot_encode_sequences(val_emoticon_X_encoded, unique_emojis, max_length)

        # Build the LSTM model without an embedding layer (since we're using one-hot encoding)
        vocab_size = len(unique_emojis)
        model = build_lstm_model(max_length, vocab_size)

        # Train the model and evaluate accuracy
        accuracy = train_and_evaluate(model, train_emoticon_X_one_hot, train_emoticon_Y_split,
                                      val_emoticon_X_one_hot, val_emoticon_Y_filtered)
        print(f"Accuracy with {int(split * 100)}% of the data: {accuracy:.2f}")



Training with 20% of the data...

train_X shape: (1384, 13, 213), val_X shape: (475, 13, 213)
Epoch 1/5


  super().__init__(**kwargs)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.5152 - loss: 0.6925
Epoch 2/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5959 - loss: 0.6821
Epoch 3/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.6926 - loss: 0.6024
Epoch 4/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.7538 - loss: 0.4930
Epoch 5/5
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.8017 - loss: 0.4418
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
Accuracy with 20% of the data: 0.77

Training with 40% of the data...

train_X shape: (2768, 13, 213), val_X shape: (475, 13, 213)
Epoch 1/5


  super().__init__(**kwargs)


[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.5017 - loss: 0.6921
Epoch 2/5
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.6954 - loss: 0.6275
Epoch 3/5
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.7764 - loss: 0.4846
Epoch 4/5
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8201 - loss: 0.3894
Epoch 5/5
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.8416 - loss: 0.3672
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
Accuracy with 40% of the data: 0.87

Training with 60% of the data...

train_X shape: (4152, 13, 213), val_X shape: (475, 13, 213)
Epoch 1/5


  super().__init__(**kwargs)


[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.5412 - loss: 0.6870
Epoch 2/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.7249 - loss: 0.5304
Epoch 3/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.7970 - loss: 0.4296
Epoch 4/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.8273 - loss: 0.3739
Epoch 5/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - accuracy: 0.8525 - loss: 0.3452
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Accuracy with 60% of the data: 0.92

Training with 80% of the data...

train_X shape: (5536, 13, 213), val_X shape: (475, 13, 213)
Epoch 1/5


  super().__init__(**kwargs)


[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - accuracy: 0.5412 - loss: 0.6809
Epoch 2/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.7671 - loss: 0.4970
Epoch 3/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.8207 - loss: 0.3948
Epoch 4/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.8381 - loss: 0.3726
Epoch 5/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.8396 - loss: 0.3500
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
Accuracy with 80% of the data: 0.92

Training with 100% of the data...

train_X shape: (6228, 13, 213), val_X shape: (475, 13, 213)
Epoch 1/5


  super().__init__(**kwargs)


[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.5734 - loss: 0.6755
Epoch 2/5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.7922 - loss: 0.4656
Epoch 3/5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.8277 - loss: 0.3695
Epoch 4/5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.8521 - loss: 0.3448
Epoch 5/5
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.8531 - loss: 0.3277
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Accuracy with 100% of the data: 0.90


**Now to improve on the text sequence dataset.**

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Encode sequence data
def encode_sequence_data(sequence_data, max_length=50):
    """Turn digit strings into a fixed-width integer matrix.

    Every entry is converted with ``str()``; each decimal-digit character
    becomes one integer token and every other character is dropped.

    Args:
        sequence_data: Iterable of ``str`` or integer entries.
        max_length: Width of the returned matrix (``maxlen`` of
            ``pad_sequences``).

    Returns:
        ``(padded_data, vocab_size)`` where ``padded_data`` has exactly one row
        per input entry and ``vocab_size`` is ``max token + 1`` (``0`` when no
        digit was seen), i.e. the smallest ``Embedding`` ``input_dim`` that
        covers every token.

    Note:
        Padding is appended after the tokens and relies on the default pad
        value of ``pad_sequences`` (0), which is also a valid digit token.
    """
    encoded_data = []
    largest_token = -1

    for sequence in sequence_data:
        if isinstance(sequence, (str, int, np.integer)):
            # isdecimal() instead of isdigit(): isdigit() also accepts characters
            # such as superscript digits, for which int() raises ValueError.
            encoded_sequence = [int(char) for char in str(sequence) if char.isdecimal()]
        else:
            # Keep an (all-padding) row instead of dropping the entry so the
            # output stays aligned row-for-row with the label array.
            print(f"Invalid sequence replaced by an all-padding row: {sequence}")
            encoded_sequence = []

        encoded_data.append(encoded_sequence)
        if encoded_sequence:
            largest_token = max(largest_token, max(encoded_sequence))

    padded_data = pad_sequences(encoded_data, maxlen=max_length, padding='post')

    # max + 1 rather than "number of distinct digits": an Embedding must be able
    # to index the largest token even when some smaller digits never occur.
    return padded_data, largest_token + 1

# Build the RNN model with LSTM or GRU
def build_rnn_model(input_length, vocab_size, use_gru=False):
    """Build and compile a two-layer recurrent binary classifier.

    Architecture: Embedding(128) -> SpatialDropout1D(0.3) -> two stacked
    recurrent layers of 128 units -> Dropout(0.3) -> Dense(64, relu) ->
    Dense(1, sigmoid). Compiled with binary cross-entropy, Adam and the
    accuracy metric.

    Args:
        input_length: Number of time steps in each input sequence.
        vocab_size: Token vocabulary size; the embedding table is created with
            ``vocab_size + 1`` rows.
        use_gru: Use GRU layers when true, LSTM layers otherwise.

    Returns:
        The compiled ``Sequential`` model.
    """
    recurrent_layer = GRU if use_gru else LSTM

    model = Sequential()
    # The sequence length is declared with an explicit Input layer; the
    # ``input_length`` argument of Embedding is deprecated in Keras 3.
    model.add(Input(shape=(input_length,)))
    model.add(Embedding(input_dim=vocab_size + 1, output_dim=128))
    model.add(SpatialDropout1D(0.3))  # Dropout after embedding layer

    # First recurrent layer returns the full sequence so the second can consume it.
    model.add(recurrent_layer(128, return_sequences=True))
    model.add(recurrent_layer(128))

    model.add(Dropout(0.3))  # Dropout after LSTM/GRU layers
    model.add(Dense(64, activation='relu'))  # Dense layer before output
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Train and evaluate the model
def train_and_evaluate(model, train_X, train_Y, val_X, val_Y):
    """Fit ``model`` and report its accuracy on the validation data.

    Training runs for at most 20 epochs with batch size 32. The validation
    data is used both for the callbacks (early stopping with best-weight
    restore, learning-rate reduction on plateau) and for the returned score.

    Args:
        model: Compiled Keras model exposing ``fit`` and ``predict``.
        train_X: Training inputs (must expose ``.shape``).
        train_Y: Training labels; converted to a numpy array.
        val_X: Validation inputs (must expose ``.shape``).
        val_Y: Validation labels; converted to a numpy array.

    Returns:
        Accuracy of the 0.5-thresholded predictions on ``val_X``.
    """
    print(f"train_X shape: {train_X.shape}, val_X shape: {val_X.shape}")

    train_labels = np.array(train_Y)
    val_labels = np.array(val_Y)

    # Both callbacks watch the validation loss.
    training_callbacks = [
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001),
    ]

    model.fit(
        train_X,
        train_labels,
        epochs=20,
        batch_size=32,
        validation_data=(val_X, val_labels),
        callbacks=training_callbacks,
        verbose=1,
    )

    # Threshold the sigmoid outputs at 0.5 to obtain hard class labels.
    probabilities = model.predict(val_X)
    predicted_labels = (probabilities > 0.5).astype("int32")

    return accuracy_score(val_labels, predicted_labels)

# Main script
if __name__ == '__main__':
    # Load the datasets. load_data() is not defined in this cell, so it must
    # already exist in the kernel; it is unpacked as six (X, Y) pairs in the
    # order emoticon / text sequence / feature for train, then for validation.
    (train_emoticon_X, train_emoticon_Y), (train_seq_X, train_seq_Y), (train_feat_X, train_feat_Y), \
    (val_emoticon_X, val_emoticon_Y), (val_seq_X, val_seq_Y), (val_feat_X, val_feat_Y) = load_data()

    # Encode the sequence data. Only the vocabulary size computed from the
    # training split is kept; the one from the validation split is discarded.
    max_length = 50
    train_seq_X_encoded, vocab_size = encode_sequence_data(train_seq_X, max_length=max_length)
    val_seq_X_encoded, _ = encode_sequence_data(val_seq_X, max_length=max_length)

    # Build and compile the model (use_gru keeps its default, so LSTM layers are used)
    model = build_rnn_model(input_length=max_length, vocab_size=vocab_size)

    # Train and evaluate the model; the validation split is also what the
    # training callbacks monitor inside train_and_evaluate.
    accuracy = train_and_evaluate(model, train_seq_X_encoded, train_seq_Y,
                                  val_seq_X_encoded, val_seq_Y)

    print(f"Validation Accuracy: {accuracy:.2f}")


train_X shape: (7080, 50), val_X shape: (489, 50)
Epoch 1/20




[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 223ms/step - accuracy: 0.5152 - loss: 0.6905 - val_accuracy: 0.6094 - val_loss: 0.6456 - learning_rate: 0.0010
Epoch 2/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 214ms/step - accuracy: 0.6206 - loss: 0.6482 - val_accuracy: 0.6524 - val_loss: 0.6366 - learning_rate: 0.0010
Epoch 3/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 209ms/step - accuracy: 0.6188 - loss: 0.6458 - val_accuracy: 0.6503 - val_loss: 0.6190 - learning_rate: 0.0010
Epoch 4/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 214ms/step - accuracy: 0.6379 - loss: 0.6369 - val_accuracy: 0.6708 - val_loss: 0.6149 - learning_rate: 0.0010
Epoch 5/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 214ms/step - accuracy: 0.6398 - loss: 0.6229 - val_accuracy: 0.6789 - val_loss: 0.6101 - learning_rate: 0.0010
Epoch 6/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

**TASK 2**

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Embedding, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define your functions to encode the data
def utf8_encode_emoticon_data(emoticon_data, max_length=13):
    """Encode emoticon strings as padded rows of character code points.

    Despite the name, each character is converted with ``ord()`` (its Unicode
    code point), not into UTF-8 bytes.

    Args:
        emoticon_data: Iterable of strings, one per sample.
        max_length: Width of the returned matrix (``maxlen`` of
            ``pad_sequences``); padding is appended after the tokens.

    Returns:
        ``(padded_data, unique_code_points)`` where the second item is the
        ascending list of every code point seen in ``emoticon_data``.
    """
    code_point_rows = [[ord(symbol) for symbol in text] for text in emoticon_data]

    seen_code_points = set()
    for row in code_point_rows:
        seen_code_points.update(row)

    padded_rows = pad_sequences(code_point_rows, maxlen=max_length, padding='post')
    return padded_rows, sorted(seen_code_points)

def one_hot_encode_sequences(padded_data, unique_emojis, max_length):
    """One-hot encode rows of emoji codes.

    Args:
        padded_data: Sequence of rows, each holding at most ``max_length`` codes.
        unique_emojis: Vocabulary; a code's position in this sequence is its
            one-hot channel.
        max_length: Size of the second axis of the result.

    Returns:
        ``float32`` array of shape ``(len(padded_data), max_length,
        len(unique_emojis))``. Codes that are not in ``unique_emojis`` leave
        their position as an all-zero vector.
    """
    channel_of = dict(zip(unique_emojis, range(len(unique_emojis))))
    encoded = np.zeros((len(padded_data), max_length, len(unique_emojis)), dtype='float32')

    for row, codes in enumerate(padded_data):
        for col, code in enumerate(codes):
            channel = channel_of.get(code)
            if channel is not None:
                encoded[row, col, channel] = 1.0

    return encoded


def encode_sequence_data(sequence_data, max_length=50):
    """Turn digit strings into a fixed-width integer matrix.

    Every entry is converted with ``str()``; each decimal-digit character
    becomes one integer token and every other character is dropped.

    Args:
        sequence_data: Iterable of ``str`` or integer entries.
        max_length: Width of the returned matrix (``maxlen`` of
            ``pad_sequences``).

    Returns:
        ``(padded_data, vocab_size)`` where ``padded_data`` has exactly one row
        per input entry and ``vocab_size`` is ``max token + 1`` (``0`` when no
        digit was seen), i.e. the smallest ``Embedding`` ``input_dim`` that
        covers every token.

    Note:
        Padding is appended after the tokens and relies on the default pad
        value of ``pad_sequences`` (0), which is also a valid digit token.
    """
    encoded_data = []
    largest_token = -1

    for sequence in sequence_data:
        if isinstance(sequence, (str, int, np.integer)):
            # isdecimal() instead of isdigit(): isdigit() also accepts characters
            # such as superscript digits, for which int() raises ValueError.
            encoded_sequence = [int(char) for char in str(sequence) if char.isdecimal()]
        else:
            # Keep an (all-padding) row instead of dropping the entry so the
            # output stays aligned row-for-row with the label array.
            print(f"Invalid sequence replaced by an all-padding row: {sequence}")
            encoded_sequence = []

        encoded_data.append(encoded_sequence)
        if encoded_sequence:
            largest_token = max(largest_token, max(encoded_sequence))

    padded_data = pad_sequences(encoded_data, maxlen=max_length, padding='post')

    # max + 1 rather than "number of distinct digits": an Embedding must be able
    # to index the largest token even when some smaller digits never occur.
    return padded_data, largest_token + 1

# Adjust the emoticon vocab size after encoding
def map_emojis_to_indices(data, emoji_to_index):
    """Replace every code in ``data`` by its vocabulary index.

    Args:
        data: Iterable of rows of emoji codes.
        emoji_to_index: Mapping from code to integer index.

    Returns:
        Numpy array built from the mapped rows; codes missing from
        ``emoji_to_index`` are mapped to index 0.
    """
    lookup = emoji_to_index.get
    return np.array([[lookup(code, 0) for code in row] for row in data])

# Build the combined model
def build_combined_model(emoticon_vocab_size, emoticon_input_len, text_seq_input_len, feature_input_shape,
                         text_vocab_size=10000):
    """Build and compile a three-input binary classifier.

    Branches:
      * emoticon indices -> Embedding(128) -> SpatialDropout1D(0.2) -> LSTM(64)
      * text tokens      -> Embedding(128) -> SpatialDropout1D(0.2) -> LSTM(64)
      * feature vector   -> Dense(64, relu)
    The three branch outputs are concatenated and passed through
    Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid).

    Args:
        emoticon_vocab_size: Number of rows in the emoticon embedding table.
        emoticon_input_len: Length of each emoticon index sequence.
        text_seq_input_len: Length of each text token sequence.
        feature_input_shape: Shape tuple of one feature sample (without the
            batch axis).
        text_vocab_size: Number of rows in the text embedding table. Defaults
            to 10000, the value that was previously hard-coded; pass the real
            token vocabulary size to avoid an oversized table.

    Returns:
        The ``Model``, compiled with Adam, binary cross-entropy and the
        accuracy metric. Its inputs are ordered
        ``[emoticon, text, feature]``.
    """
    # Input 1: Emoticon input
    emoticon_input = Input(shape=(emoticon_input_len,))
    x1 = Embedding(input_dim=emoticon_vocab_size, output_dim=128)(emoticon_input)
    x1 = SpatialDropout1D(0.2)(x1)
    x1 = LSTM(64)(x1)

    # Input 2: Text sequence input
    text_input = Input(shape=(text_seq_input_len,))
    x2 = Embedding(input_dim=text_vocab_size, output_dim=128)(text_input)
    x2 = SpatialDropout1D(0.2)(x2)
    x2 = LSTM(64)(x2)

    # Input 3: Feature input
    feature_input = Input(shape=feature_input_shape)
    x3 = Dense(64, activation='relu')(feature_input)

    # Concatenate all inputs
    concatenated = Concatenate()([x1, x2, x3])

    # Dense layers after concatenation
    x = Dense(64, activation='relu')(concatenated)
    x = Dense(32, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)

    # Create the model
    model = Model(inputs=[emoticon_input, text_input, feature_input], outputs=output)

    # Compile the model
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    return model


# load_data() is not defined in this cell; it must already exist in the kernel.
(train_emoticon_X, train_emoticon_Y), (train_seq_X, train_seq_Y), (train_feat_X, train_feat_Y), \
(val_emoticon_X, val_emoticon_Y), (val_seq_X, val_seq_Y), (val_feat_X, val_feat_Y) = load_data()

# Encode emoticon data as code points and build one vocabulary over train + validation
train_emoticon_X, unique_emojis = utf8_encode_emoticon_data(train_emoticon_X)
val_emoticon_X, val_unique_emojis = utf8_encode_emoticon_data(val_emoticon_X)
unique_emojis = sorted(set(unique_emojis + val_unique_emojis))
# Real emojis are numbered from 1. Index 0 is what map_emojis_to_indices assigns
# to padding / unknown codes, so it must not also belong to an emoji.
emoji_to_index = {emoji: idx for idx, emoji in enumerate(unique_emojis, start=1)}
train_emoticon_X = map_emojis_to_indices(train_emoticon_X, emoji_to_index)
val_emoticon_X = map_emojis_to_indices(val_emoticon_X, emoji_to_index)

# Encode sequence data
train_seq_X, _ = encode_sequence_data(train_seq_X)
val_seq_X, _ = encode_sequence_data(val_seq_X)

# Flatten every feature sample to one vector, keeping one row per sample
# (presumably (n, 13, 768) -> (n, 13 * 768); verify against load_data()).
train_feat_X = np.array(train_feat_X)
train_feat_X = train_feat_X.reshape(train_feat_X.shape[0], -1)
val_feat_X = np.array(val_feat_X)
val_feat_X = val_feat_X.reshape(val_feat_X.shape[0], -1)

# Validation labels as an array, converted once.
# NOTE(review): the labels of the emoticon dataset are used for training and those
# of the feature dataset for evaluation; this assumes all three datasets share
# the same labels in the same order - confirm against load_data().
val_labels = np.array(val_feat_Y)

# Hyperparameters
emoticon_vocab_size = len(unique_emojis) + 1  # +1 for the reserved padding/unknown index 0
emoticon_input_len = train_emoticon_X.shape[1]
text_seq_input_len = train_seq_X.shape[1]
feature_input_shape = (train_feat_X.shape[1],)

# Add data splits and accuracies tracking
data_splits = [0.2, 0.4, 0.6, 0.8, 1.0]
accuracies = []

# Loop over the data splits
for split in data_splits:
    # Calculate number of samples for this split
    split_size = int(len(train_emoticon_X) * split)

    # Prepare the training data by slicing according to split size
    train_emoticon_X_split = train_emoticon_X[:split_size]
    train_seq_X_split = train_seq_X[:split_size]
    train_feat_X_split = train_feat_X[:split_size]
    train_emoticon_Y_split = np.array(train_emoticon_Y[:split_size])

    # Build a fresh model for every split. Re-using one model would carry the
    # weights learned on the smaller splits into the larger ones, so the reported
    # accuracies would not reflect the amount of training data alone.
    combined_model = build_combined_model(emoticon_vocab_size, emoticon_input_len,
                                          text_seq_input_len, feature_input_shape)

    # Fit the combined model with the split data
    combined_model.fit(
        [train_emoticon_X_split, train_seq_X_split, train_feat_X_split],
        train_emoticon_Y_split,
        epochs=10,
        batch_size=32,
        verbose=1
    )

    # Evaluate the model on the full validation set
    accuracy = combined_model.evaluate(
        [val_emoticon_X, val_seq_X, val_feat_X],
        val_labels,
        verbose=1
    )[1]  # [1] gets accuracy
    accuracies.append(accuracy)

    # Print the result for this split
    print(f"Accuracy for {int(split * 100)}% of the data: {accuracy}")


Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 82ms/step - accuracy: 0.6731 - loss: 0.6121
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 69ms/step - accuracy: 0.9113 - loss: 0.2288
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 94ms/step - accuracy: 0.9497 - loss: 0.1266
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 76ms/step - accuracy: 0.9336 - loss: 0.1513
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 71ms/step - accuracy: 0.9778 - loss: 0.0786
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 104ms/step - accuracy: 0.9751 - loss: 0.0599
Epoch 7/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 92ms/step - accuracy: 0.9936 - loss: 0.0297
Epoch 8/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 72ms/step - accuracy: 0.9526 - loss: 0.1110
Epoch 9/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━