# Model Training

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import zipfile
zip_file_path = "/content/stacksample.zip"
extract_to = "/content/stacksample"
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

In [4]:
def clean_text(text):
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\W+', ' ', text)
    return text.lower()

In [5]:
tags = pd.read_csv('/content/stacksample/Tags.csv', encoding='ISO-8859-1')
questions = pd.read_csv(r'/content/stacksample/Questions.csv', encoding='ISO-8859-1')

In [6]:
tags = tags.dropna()

In [None]:
# Map Tags to Questions using 'Id'
questions['Tag'] = questions['Id'].map(tags.groupby('Id')['Tag'].apply(list))
questions['Tag'] = questions['Tag'].apply(lambda x: x if isinstance(x, list) else [])
questions = questions.dropna(subset=['Tag'])  # Drop rows with no tags

# top 20 common tags only
from collections import Counter
tag_counter = Counter([tag for tags in questions['Tag'] for tag in tags])
top_tags = [tag for tag, _ in tag_counter.most_common(20)]
questions['Tag'] = questions['Tag'].apply(lambda tags: [tag for tag in tags if tag in top_tags])
questions = questions[questions['Tag'].apply(len) > 0]

# Clean Questions
questions['Body'] = questions['Body'].apply(clean_text)

# Encode Tags
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(questions['Tag'])

# Tokenize and Pad Sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(questions['Body'])
X = tokenizer.texts_to_sequences(questions['Body'])
X = pad_sequences(X, maxlen=500)

# Split Data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create Model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=500),
    LSTM(64, return_sequences=False),
    Dense(128, activation='relu'),
    Dense(y.shape[1], activation='sigmoid')  # Multi-label output
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))
model.save('/content/multilabel_model.h5')

# Save Tokenizer and MultiLabelBinarizer
import pickle
with open('/content/tokenizer.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)
with open('/content/mlb.pkl', 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)



Epoch 1/5
[1m21275/21275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 24ms/step - accuracy: 0.5274 - loss: 0.1276 - val_accuracy: 0.7299 - val_loss: 0.0711
Epoch 2/5
[1m21275/21275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m562s[0m 24ms/step - accuracy: 0.7373 - loss: 0.0679 - val_accuracy: 0.7387 - val_loss: 0.0685
Epoch 3/5
[1m21275/21275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m543s[0m 26ms/step - accuracy: 0.7497 - loss: 0.0635 - val_accuracy: 0.7389 - val_loss: 0.0676
Epoch 4/5
[1m21275/21275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 24ms/step - accuracy: 0.7582 - loss: 0.0607 - val_accuracy: 0.7401 - val_loss: 0.0681
Epoch 5/5
[1m21275/21275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m569s[0m 25ms/step - accuracy: 0.7655 - loss: 0.0584 - val_accuracy: 0.7366 - val_loss: 0.0690




In [None]:
# !pip install --upgrade pandas
# !pip install numpy
# !pip install tensorflow scikit-learn

# Using the model for predicting tags

In [21]:
print(top_tags)
print(len(top_tags))

['javascript', 'java', 'c#', 'php', 'android', 'jquery', 'python', 'html', 'c++', 'ios', 'mysql', 'css', 'sql', 'asp.net', 'objective-c', 'ruby-on-rails', '.net', 'c', 'iphone', 'angularjs']
20


In [10]:
import re
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
with open('/content/tokenizer.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)
with open('/content/mlb.pkl', 'rb') as mlb_file:
    mlb = pickle.load(mlb_file)

In [12]:
model = load_model('multilabel_model.h5')



In [13]:
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\W+', ' ', text)
    text = text.lower()
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=500)
    return padded_sequence

In [14]:
def predict_tags(text, threshold=0.5):
    processed_text = preprocess_text(text)
    predictions = model.predict(processed_text)
    predicted_tags = (predictions > threshold).astype(int)
    return mlb.inverse_transform(predicted_tags)

### Predictions

In [31]:
new_question = "How would you implement a smart pointer to manage memory dynamically?"
tags = predict_tags(new_question)
print("Predicted Tags:", tags)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Predicted Tags: [('c++',)]


In [29]:
new_question = "How would you implement a decorator to measure the execution time of any function it wraps?"
tags = predict_tags(new_question)
print("Predicted Tags:", tags)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
Predicted Tags: [('python',)]


In [20]:
new_question = "What is PIP?"
tags = predict_tags(new_question)
print("Predicted Tags:", tags)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
Predicted Tags: [('python',)]


In [32]:
new_question = "How would you write a query to retrieve the top N records from a table based on a specific column value"
tags = predict_tags(new_question)
print("Predicted Tags:", tags)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Predicted Tags: [('sql',)]


In [33]:
new_question = "How would you secure form submissions and prevent SQL injection vulnerabilities?"
tags = predict_tags(new_question)
print("Predicted Tags:", tags)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
Predicted Tags: [('sql',)]


In [34]:
new_question = "How would you implement a RecyclerView to display a dynamic list of items with multiple view types?"
tags = predict_tags(new_question)
print("Predicted Tags:", tags)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Predicted Tags: [('android',)]


In [36]:
new_question = "How would you create a custom directive in a framework to bind dynamic data and manipulate the DOM while maintaining modularity and reusability?"
tags = predict_tags(new_question)
print("Predicted Tags:", tags)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
Predicted Tags: [('angularjs',)]
