<a href="https://colab.research.google.com/github/SpyingPear/CapStone/blob/main/neural_network_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Network Slogan Task

This notebook implements:

1. Data preprocessing  
2. LSTM slogan generator (conditioned on industry)  
3. Slogan classifier (industry from slogan)  
4. Using the generator + classifier together

In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Flatten, GlobalAveragePooling1D

## 1. Data preprocessing

In [6]:
# Adjust the path if needed (e.g. upload the CSV to Colab and use its path)
csv_path = "slogan-valid.csv"

df = pd.read_csv(csv_path)

# Keep only the columns we need
df = df[["output", "company", "industry"]].copy()
df.rename(columns={"output": "slogan", "company": "company_name"}, inplace=True)

# Drop rows with missing values in these columns
df.dropna(subset=["slogan", "company_name", "industry"], inplace=True)

df.head()

Unnamed: 0,slogan,company_name,industry
0,Taking Care of Small Business Technology,eftpos warehouse,computer hardware
1,Build World-Class Recreation Programs,welbi,"health, wellness and fitness"
2,Most Powerful Lead Generation Software for Mar...,optinmonster,internet
3,Hire quality freelancers for your job,twine.fm,internet
4,"Financial Advisers Norwich, Norfolk",mcb financial services ltd,financial services


In [7]:
# Encode industry labels as integers
label_encoder = LabelEncoder()
df["industry_id"] = label_encoder.fit_transform(df["industry"])
num_industries = len(label_encoder.classes_)

print("Number of samples:", len(df))
print("Number of industries:", num_industries)

Number of samples: 5346
Number of industries: 142


### Tokenisation and sequence encoding

In [8]:
# Slogans as raw text
slogans = df["slogan"].astype(str).str.lower().tolist()
company_names = df["company_name"].astype(str).str.lower().tolist()

# Tokeniser over slogans only (can be extended to include company names if required)
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(slogans)

# Integer-encoded slogan sequences (for classifier)
slogan_sequences = tokenizer.texts_to_sequences(slogans)

# Choose a max length for padded sequences (for classifier)
max_len_clf = max(len(seq) for seq in slogan_sequences)
X_slogans = pad_sequences(slogan_sequences, maxlen=max_len_clf, padding="post")

y_industry = df["industry_id"].values

vocab_size = min(max_words, len(tokenizer.word_index) + 1)

print("Vocab size:", vocab_size)
print("Max slogan length (classifier):", max_len_clf)

Vocab size: 6103
Max slogan length (classifier): 11


## 2. LSTM slogan generator

In [9]:
# Build training data for the generator:
# For each slogan we create (prefix -> next_word) pairs.

gen_input_sequences = []
gen_industries = []
gen_targets = []

# Limit sequence length for the generator to keep the model light
max_len_gen = 10

for seq, ind_id in zip(slogan_sequences, df["industry_id"].values):
    # Skip very short slogans
    if len(seq) < 2:
        continue
    for i in range(1, len(seq)):
        in_seq = seq[:i]
        target_word = seq[i]
        # Pad/truncate input sequence
        in_seq_padded = pad_sequences([in_seq], maxlen=max_len_gen, padding="pre", truncating="pre")[0]
        gen_input_sequences.append(in_seq_padded)
        gen_industries.append(ind_id)
        gen_targets.append(target_word)

gen_input_sequences = np.array(gen_input_sequences)
gen_industries = np.array(gen_industries)
gen_targets = np.array(gen_targets)

print("Generator samples:", gen_input_sequences.shape[0])
print("Generator input sequence length:", gen_input_sequences.shape[1])

Generator samples: 22919
Generator input sequence length: 10


In [10]:
embedding_dim = 64
lstm_units = 128
industry_embedding_dim = 16

# Text sequence input
text_input = Input(shape=(max_len_gen,))
x = Embedding(vocab_size, embedding_dim, input_length=max_len_gen)(text_input)
x = LSTM(lstm_units)(x)

# Industry input (integer id)
industry_input = Input(shape=(1,))
ind_emb = Embedding(num_industries, industry_embedding_dim, input_length=1)(industry_input)
ind_emb = Flatten()(ind_emb)

# Combine text and industry representations
x = Concatenate()([x, ind_emb])
x = Dense(128, activation="relu")(x)
output_word = Dense(vocab_size, activation="softmax")(x)

generator_model = Model([text_input, industry_input], output_word)
generator_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

generator_model.summary()



In [11]:
# Train the generator model
# Adjust epochs and batch_size depending on available compute
gen_epochs = 5
gen_batch_size = 128

history_gen = generator_model.fit(
    [gen_input_sequences, gen_industries],
    gen_targets,
    epochs=gen_epochs,
    batch_size=gen_batch_size,
    validation_split=0.1,
    verbose=1
)

Epoch 1/5
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 46ms/step - loss: 7.9805 - val_loss: 7.4544
Epoch 2/5
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 47ms/step - loss: 7.1363 - val_loss: 7.5380
Epoch 3/5
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 44ms/step - loss: 7.0252 - val_loss: 7.6175
Epoch 4/5
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 47ms/step - loss: 6.8921 - val_loss: 7.5609
Epoch 5/5
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 42ms/step - loss: 6.7281 - val_loss: 7.6155


In [12]:
# Helper to decode words from indices
index_to_word = {idx: word for word, idx in tokenizer.word_index.items() if idx < vocab_size}

def generate_slogan(industry_name, seed_text="", num_words=10):
    if industry_name not in label_encoder.classes_:
        raise ValueError("Unknown industry name.")
    ind_id = int(label_encoder.transform([industry_name])[0])

    text = seed_text.strip().lower()
    for _ in range(num_words):
        seq = tokenizer.texts_to_sequences([text])[0]
        seq = pad_sequences([seq], maxlen=max_len_gen, padding="pre", truncating="pre")
        pred_probs = generator_model.predict([seq, np.array([[ind_id]])], verbose=0)[0]
        next_id = int(np.argmax(pred_probs))
        if next_id == 0:
            break
        next_word = index_to_word.get(next_id)
        if not next_word:
            break
        text = (text + " " + next_word).strip()
    return text

In [13]:
# Example: generate slogans for a few industries
example_industries = list(label_encoder.classes_[:5])

for ind in example_industries:
    print("Industry:", ind)
    print("Generated:", generate_slogan(industry_name=ind, seed_text="", num_words=8))
    print()

Industry: accounting
Generated: in the the the the the the the

Industry: airlines/aviation
Generated: in the the the the the the the

Industry: alternative medicine
Generated: in the the the the the the the

Industry: animation
Generated: in the the the the the the the

Industry: apparel & fashion
Generated: in the the the the the the the



## 3. Slogan classifier

In [14]:
# Train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X_slogans, y_industry, test_size=0.2, random_state=42, stratify=y_industry
)

embedding_dim_clf = 64

clf_input = Input(shape=(max_len_clf,))
c = Embedding(vocab_size, embedding_dim_clf, input_length=max_len_clf)(clf_input)
c = GlobalAveragePooling1D()(c)
c = Dense(128, activation="relu")(c)
clf_output = Dense(num_industries, activation="softmax")(c)

classifier_model = Model(clf_input, clf_output)
classifier_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

classifier_model.summary()

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
clf_epochs = 5
clf_batch_size = 64

history_clf = classifier_model.fit(
    X_train, y_train,
    epochs=clf_epochs,
    batch_size=clf_batch_size,
    validation_split=0.1,
    verbose=1
)

In [None]:
loss, acc = classifier_model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {acc:.3f}")

## 4. Generator + classifier together

In [None]:
def classify_slogan(text):
    seq = tokenizer.texts_to_sequences([text.lower()])[0]
    seq_padded = pad_sequences([seq], maxlen=max_len_clf, padding="post", truncating="post")
    probs = classifier_model.predict(seq_padded, verbose=0)[0]
    pred_id = int(np.argmax(probs))
    return label_encoder.inverse_transform([pred_id])[0], probs[pred_id]

# Generate and classify slogans for a few industries
for ind in example_industries:
    gen = generate_slogan(industry_name=ind, seed_text="", num_words=8)
    pred_ind, confidence = classify_slogan(gen)
    print("Target industry:", ind)
    print("Generated slogan:", gen)
    print("Classifier prediction:", pred_ind, f"(confidence {confidence:.2f})")
    print()