In [None]:
import ast#abstract syntax tree
import html  # HTML escaping
import pickle  # object serialization
import subprocess #GPU info
import time #Time info
import warnings #handling
from collections import Counter#counts for the frequency of all the tags and thus gives us top 10 tags

import ipywidgets as widgets #for UI componentsa
import numpy as np  # numerical arrays
import pandas as pd #here just used to read csv file
import psutil #CPU info
import tensorflow as tf#just importing tensorflow
from IPython.display import display, HTML #for html and UI display
from sklearn.metrics import precision_score #For Precision
from sklearn.preprocessing import MultiLabelBinarizer#basically transforms lists to binary indicator matrices ,presecnce or absence marked by 1 or 0 ((Try print(mlb.classes)))
from tensorflow.keras.callbacks import EarlyStopping# callback that stops training when a monitored metric has stopped improving.
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout, BatchNormalization, Attention
from tensorflow.keras.models import Model #used to create a model in Keras. It is the central component for defining and training neural networks.
from tensorflow.keras.optimizers import RMSprop# uses an adaptive learning rate and root mean square propagation to improve training.
from tensorflow.keras.preprocessing.sequence import pad_sequences #pads sequences to same length
from tensorflow.keras.preprocessing.text import Tokenizer#for text processing


# Silence every warning category for the remainder of the session
warnings.simplefilter('ignore')

# Function to get GPU memory usage
def get_gpu_memory():
    """Return the used-memory figure that ``nvidia-smi`` reports for the first GPU.

    The value is the integer parsed from the first output line of the
    ``memory.used`` query. This is a best-effort probe: if the command
    cannot be run or its output cannot be parsed, 0 is returned instead of
    raising.
    """
    query = [
        'nvidia-smi',
        '--query-gpu=memory.used',
        '--format=csv,nounits,noheader',
    ]
    try:
        raw_output = subprocess.check_output(query, encoding='utf-8')
        # One line per GPU; only the first one is reported
        first_line = raw_output.strip().split('\n')[0]
        return int(first_line)
    except Exception:
        return 0

# Function to get CPU memory usage
def get_cpu_memory():
    """Return the ``used`` figure from ``psutil.virtual_memory()``, converted from bytes to GB."""
    bytes_per_gb = 1024 ** 3
    used_bytes = psutil.virtual_memory().used
    return used_bytes / bytes_per_gb

# Start session
def start_session(file_path='/kaggle/input/150k-rows/150k.csv'):
    """Load the question/tag CSV, train the tag classifier, and return inference handles.

    The run reads the dataset, keeps only rows carrying at least one of the
    10 most frequent tags, tokenizes and pads the questions, trains on the
    first 80% of the rows (validating on the remaining 20%), prints
    accuracy, precision, timing and memory figures, and writes the model,
    tokenizer and label binarizer to the working directory.

    Args:
        file_path: CSV file to read. It must provide ``Questions`` and
            ``Tags`` columns, where ``Tags`` holds list literals such as
            ``'["python", "docker"]'``. Defaults to the Kaggle dataset path.

    Returns:
        Tuple ``(predict_tags_fn, top_10_tags, tokenizer, model)``.

    Raises:
        ValueError: If no row carries any of the most frequent tags.
    """
    # Record the start time and initial memory usage
    start_time = time.time()
    initial_cpu_memory = get_cpu_memory()
    initial_gpu_memory = get_gpu_memory()

    # Load the dataset with a specified encoding
    data = pd.read_csv(file_path, encoding='latin1')

    # Display the first and last column of the first few rows
    display(data.iloc[:10, [0, -1]])

    # deep=True measures the string payloads; without it only the 8-byte
    # object pointers are counted and the size is badly under-reported.
    dataset_size = data.memory_usage(index=True, deep=True).sum() / (1024 ** 2)
    print("\nBefore filtering:")
    print(f"Number of rows: {len(data)}")
    print(f"Dataset size: {dataset_size:.2f} MB")

    def convert_tags(tag_string):
        """Parse one ``Tags`` cell into a list of string tags ([] when malformed)."""
        try:
            parsed = ast.literal_eval(tag_string)
        except (ValueError, SyntaxError, TypeError):
            return []
        # A bare scalar (e.g. a quoted string) would otherwise be iterated
        # character by character when the tags are counted.
        if not isinstance(parsed, (list, tuple)):
            return []
        return [tag for tag in parsed if isinstance(tag, str)]

    # Convert tag strings to lists, handling NaN values
    data['Tags'] = data['Tags'].fillna('[]').apply(convert_tags)

    # Extract the top 10 most frequent tags
    tag_counts = Counter(tag for tags in data['Tags'] for tag in tags)
    top_10_tags = [tag for tag, _ in tag_counts.most_common(10)]
    top_tag_lookup = set(top_10_tags)

    # Keep only questions that carry at least one of the top tags
    data['Top_Tags'] = data['Tags'].apply(
        lambda tags: [tag for tag in tags if tag in top_tag_lookup]
    )
    data = data[data['Top_Tags'].map(len) > 0]
    if data.empty:
        raise ValueError(
            "No rows contain any of the most frequent tags; check the 'Tags' column."
        )

    # Missing question text would reach the tokenizer as a float NaN and
    # crash it, so normalise every entry to a string first.
    questions = data['Questions'].fillna('').astype(str)

    # Tokenize the questions
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(questions)
    sequences = tokenizer.texts_to_sequences(questions)

    # Pad the sequences
    max_sequence_length = 150
    X = pad_sequences(sequences, maxlen=max_sequence_length)

    # Convert tags to a binary indicator matrix (one column per top tag)
    mlb = MultiLabelBinarizer(classes=top_10_tags)
    y = mlb.fit_transform(data['Top_Tags'])

    # Manually split the data into training (first 80%) and validation (last 20%) sets
    split_index = int(0.8 * len(X))
    X_train, X_val = X[:split_index], X[split_index:]
    y_train, y_val = y[:split_index], y[split_index:]

    model = _build_tag_model(
        vocab_size=len(tokenizer.word_index) + 1,
        max_sequence_length=max_sequence_length,
        num_classes=len(top_10_tags),
    )

    # Print the model summary
    print()
    model.summary()
    print()

    # Train the model with Early Stopping
    batch_size = 32
    epochs = 10
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # verbose=2 prints one line per epoch; the per-batch progress bar emits
    # enough messages to trip the notebook's IOPub rate limit.
    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=2,
    )

    # Evaluate the model
    loss, accuracy = model.evaluate(X_val, y_val)

    # Calculate micro-averaged precision at a 0.5 decision threshold
    y_pred = model.predict(X_val)
    y_pred_binary = (y_pred > 0.5).astype(int)
    precision = precision_score(y_val, y_pred_binary, average='micro')

    # Display accuracy and precision in percentage
    print(f"\nValidation Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision * 100:.2f}%")

    # Record the end time and final memory usage
    end_time = time.time()
    final_cpu_memory = get_cpu_memory()
    final_gpu_memory = get_gpu_memory()

    # Calculate the execution time and memory usage
    execution_time = end_time - start_time
    cpu_memory_used = final_cpu_memory - initial_cpu_memory
    gpu_memory_used = final_gpu_memory - initial_gpu_memory

    # Report the number of rows and dataset size after filtering
    dataset_size = data.memory_usage(index=True, deep=True).sum() / (1024 ** 2)
    print("\nAfter filtering:")
    print(f"Number of rows: {len(data)}")
    print(f"Dataset size: {dataset_size:.2f} MB")

    # Print the results
    print(f"\nExecution Time: {execution_time:.2f} seconds")
    print(f"CPU Memory Used: {cpu_memory_used:.2f} GB")
    print(f"GPU Memory Used: {gpu_memory_used} MB")

    # Display "TOP 10 PREDICTED TAGS" in bold
    print(" ")
    display(HTML("<b>TOP 10 PREDICTED TAGS</b>"))

    # Display top 10 tags in table format; tag text comes from the dataset,
    # so it is escaped before being embedded in markup.
    table_rows = "".join(
        f"<tr><td>{rank}</td><td>{html.escape(str(tag))}</td></tr>"
        for rank, tag in enumerate(top_10_tags, start=1)
    )
    display(HTML(f"<table><tr><th>Rank</th><th>Tag</th></tr>{table_rows}</table>"))

    # Save the model
    model.save("tag_predictor_model.h5")
    # Save tokenizer
    with open("tokenizer.pkl", "wb") as f:
        pickle.dump(tokenizer, f)
    # Save MultiLabelBinarizer
    with open("mlb.pkl", "wb") as f:
        pickle.dump(mlb, f)
    print("Model, tokenizer, and label binarizer saved successfully.")

    def predict_tags_fn(question, threshold=0.5):
        """Return the tags whose score exceeds the 70th percentile of this question's scores.

        Returns a tuple of tag names, or ``[]`` for an empty question.

        NOTE(review): ``threshold`` is accepted but has never been applied;
        the cutoff is always the per-question 70th percentile. Kept as-is
        to avoid changing results; confirm the intended behaviour.
        """
        if not question:
            return []
        sequence = tokenizer.texts_to_sequences([question])
        padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length)
        prediction = model.predict(padded_sequence)
        adjusted_threshold = np.percentile(prediction, 70)
        indicator = (prediction > adjusted_threshold).astype(int)
        return mlb.inverse_transform(indicator)[0]

    return predict_tags_fn, top_10_tags, tokenizer, model


def _build_tag_model(vocab_size, max_sequence_length, num_classes):
    """Build and compile the BiGRU+attention / CNN multi-label tag classifier."""
    # Model hyper-parameters
    embedding_dim = 128
    gru_units = 64
    num_filters = 256
    kernel_size = 5
    dense_units = 64
    dropout_rate = 0.5061

    input_layer = Input(shape=(max_sequence_length,))

    # The sequence length is already fixed by the Input layer, so the
    # deprecated `input_length` argument is not passed.
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)

    # Bidirectional GRU branch
    gru_layer = Bidirectional(GRU(units=gru_units, return_sequences=True))(embedding_layer)
    gru_layer = Dropout(dropout_rate)(gru_layer)
    gru_layer = BatchNormalization()(gru_layer)

    # Self-attention over the GRU outputs (query and value are the same tensor)
    attention_layer = Attention()([gru_layer, gru_layer])

    # CNN branch, fed directly from the embeddings
    cnn_layer = Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu')(embedding_layer)
    cnn_layer = GlobalMaxPooling1D()(cnn_layer)
    cnn_layer = Dropout(dropout_rate)(cnn_layer)
    cnn_layer = BatchNormalization()(cnn_layer)

    # Join the last attention timestep with the pooled CNN features
    concatenated_layer = Concatenate()([attention_layer[:, -1, :], cnn_layer])

    # Dense layer
    dense_layer = Dense(
        units=dense_units,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    )(concatenated_layer)
    dense_layer = Dropout(dropout_rate)(dense_layer)
    dense_layer = BatchNormalization()(dense_layer)

    # One independent sigmoid per tag (multi-label output)
    output_layer = Dense(units=num_classes, activation='sigmoid')(dense_layer)

    model = Model(inputs=input_layer, outputs=output_layer)

    # The bare 'accuracy' string is resolved from the output shape and can
    # become categorical (argmax) accuracy for a multi-unit output, which is
    # not meaningful for multi-label targets. Request per-label binary
    # accuracy explicitly; the name keeps the 'accuracy'/'val_accuracy' keys.
    model.compile(
        optimizer=RMSprop(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    )
    return model

# Run the whole pipeline once. The unpacked names become notebook globals;
# the interactive prediction cell reads `tokenizer`, `model` and `top_10_tags`.
predict_tags_fn, top_10_tags, tokenizer, model = start_session()


2025-07-18 07:55:15.764447: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-18 07:55:15.764613: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-18 07:55:15.909244: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Unnamed: 0,Questions,Tags
0,I am very new to C# and I have a question. I d...,"[""c#"",""asp.net-mvc"",""if-statement""]"
1,"The code runs fine on React 16.8, yet freezes ...","[""reactjs"",""react-hooks"",""use-effect"",""js-cook..."
2,I have a python script with:\n\n```\nos.enviro...,"[""python"",""docker"",""environment-variables""]"
3,I have been trying to fetch the data from my F...,"[""android"",""firebase"",""listview"",""firebase-rea..."
4,I have a migration like this: \n\n```\nSchem...,"[""mysql"",""laravel""]"
5,I need to SELECT the 5 most recent notificatio...,"[""abap"",""opensql""]"
6,"I have a vector\n\n```\nmyVec <- c('1.2','asd'...","[""r"",""regex"",""grepl""]"
7,I stumbled upon the Solver function in my sear...,"[""excel"",""vba"",""random"",""solver""]"
8,I am facing a weird bug.\n\nHere is the code f...,"[""python"",""matplotlib"",""jupyter-lab""]"
9,I have some on-premise based frontend java ser...,"[""database"",""oracle"",""amazon-web-services"",""cl..."



Before filtering:
Number of rows: 150013
Dataset size: 3.43 MB




Epoch 1/10
2032/2032 ━━━━━━━━━━━━━━━━━━━━ 79s 35ms/step - accuracy: 0.5160 - loss: 0.5363 - val_accuracy: 0.7912 - val_loss: 0.1434
Epoch 2/10
2032/2032 ━━━━━━━━━━━━━━━━━━━━ 71s 35ms/step - accuracy: 0.7702 - loss: 0.1658 - val_accuracy: 0.8002 - val_loss: 0.1354
Epoch 3/10
1905/2032 ━━━━━━━━━━━━━━━━━━━━ 4s 33ms/step - accuracy: 0.7890 - loss: 0.1549

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def predict_with_confidence(question, threshold=0.35, delta=0.08, max_sequence_length=150):
    """Score one question against every top tag and pick the relevant ones.

    Relies on the notebook globals ``tokenizer``, ``model`` and
    ``top_10_tags``.

    Args:
        question: Question text to classify.
        threshold: A tag is kept when its score is at least this value.
        delta: A tag is also kept when its score is within ``delta`` of the
            best score, even if it is below ``threshold``.
        max_sequence_length: Length the token sequence is padded/truncated
            to. Must match the length the model was trained with (150 here).

    Returns:
        Tuple ``(best_tag, best_score, predicted_tags, tag_scores)``.
        ``predicted_tags`` is in ``top_10_tags`` order and always contains
        ``best_tag`` (its gap to the best score is 0). ``tag_scores`` maps
        every tag to its score.
    """
    sequence = tokenizer.texts_to_sequences([question])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length)
    # verbose=0 keeps the per-call progress bar out of the interactive output;
    # [0] selects the scores of the single question in the batch.
    prediction = model.predict(padded_sequence, verbose=0)[0]

    tag_scores = dict(zip(top_10_tags, prediction))

    # max() returns the first highest-scoring tag, the same pick a stable
    # descending sort would put first.
    best_tag, best_score = max(tag_scores.items(), key=lambda item: item[1])

    predicted_tags = [
        tag
        for tag, score in tag_scores.items()
        if score >= threshold or (best_score - score <= delta)
    ]

    return best_tag, best_score, predicted_tags, tag_scores


# Prediction loop
while True:
    question = input("\nEnter your question (or type 'exit' to quit): ").strip()
    if question.lower() == 'exit':
        print("Exiting prediction loop.")
        break
    if not question:
        # An empty prompt would be scored as an all-padding sequence; ask again instead
        print("Please enter a question.")
        continue

    best_tag, best_score, predicted_tags, tag_scores = predict_with_confidence(question)

    # predicted_tags always contains the top tag itself (its gap to the best
    # score is 0), so remove it before listing the *other* relevant tags;
    # otherwise the "None above threshold" fallback can never be shown.
    other_tags = [tag for tag in predicted_tags if tag != best_tag]

    print(f"\nPredicted Top Tag: {best_tag} (Confidence: {best_score:.2f})")
    print(f"Other Relevant Tags: {', '.join(other_tags) if other_tags else 'None above threshold'}")
    print("All Tag Scores:")
    for tag, score in tag_scores.items():
        print(f"  {tag}: {score:.2f}")
