In [1]:
from google.colab import drive
drive.mount('/content/drive')
file_path = "/content/drive/My Drive/Questions.csv"
import pandas as pd
questions = pd.read_csv(file_path, encoding='latin-1')
questions['Tags'] = questions['Tags'].str.split(' ')


Mounted at /content/drive


KeyError: 'Tags'

In [2]:
file_path = "/content/drive/My Drive/Questions.csv"
questions = pd.read_csv(file_path, encoding='latin-1')
print(questions.columns)

Index(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title',
       'Body'],
      dtype='object')


In [3]:
tags_df = pd.read_csv("/content/drive/My Drive/Tags.csv")
print(tags_df.head())
print(tags_df.info())


   Id             Tag
0  80            flex
1  80  actionscript-3
2  80             air
3  90             svn
4  90     tortoisesvn
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750994 entries, 0 to 3750993
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Tag     object
dtypes: int64(1), object(1)
memory usage: 57.2+ MB
None


In [4]:
questions_with_tags = pd.merge(questions, tags_df, on='Id', how='left')
questions_with_tags = questions_with_tags.groupby('Id').agg({
    'OwnerUserId': 'first',
    'CreationDate': 'first',
    'ClosedDate': 'first',
    'Score': 'first',
    'Title': 'first',
    'Body': 'first',
    'Tag': lambda x: list(x)
}).reset_index()
questions_with_tags.rename(columns={'Tag': 'Tags'}, inplace=True)
print(questions_with_tags.head())


    Id  OwnerUserId          CreationDate            ClosedDate  Score  \
0   80         26.0  2008-08-01T13:57:07Z                  None     26   
1   90         58.0  2008-08-01T14:41:24Z  2012-12-26T03:45:49Z    144   
2  120         83.0  2008-08-01T15:50:08Z                  None     21   
3  180    2089740.0  2008-08-01T18:42:19Z                  None     53   
4  260         91.0  2008-08-01T23:22:08Z                  None     49   

                                               Title  \
0  SQLStatement.execute() - multiple queries in o...   
1  Good branching and merging tutorials for Torto...   
2                                  ASP.NET Site Maps   
3                 Function for creating color wheels   
4  Adding scripting functionality to .NET applica...   

                                                Body  \
0  <p>I've written a database generation script i...   
1  <p>Are there any really good tutorials explain...   
2  <p>Has anyone got experience creating <strong>.

In [5]:
all_tags = [tag for tags in questions_with_tags['Tags'] for tag in tags]
top_10_tags = pd.Series(all_tags).value_counts().head(10).index.tolist()
questions_with_tags['Tags'] = questions_with_tags['Tags'].apply(lambda tags: [tag for tag in tags if tag in top_10_tags])



In [6]:
X = questions_with_tags['Title'] + " " + questions_with_tags['Body']
y = questions_with_tags['Tags']


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y)
print("Sample encoded labels:\n", y_encoded[:5])
print("Classes:\n", mlb.classes_)


Sample encoded labels:
 [[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]]
Classes:
 ['android' 'c#' 'c++' 'html' 'ios' 'java' 'javascript' 'jquery' 'php'
 'python']


In [8]:
from sklearn.model_selection import train_test_split

# (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")


Training samples: 1011372
Validation samples: 252844


In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_NUM_WORDS = 20000
MAX_SEQUENCE_LENGTH = 128
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQUENCE_LENGTH)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_SEQUENCE_LENGTH)

print("Sample tokenized and padded sequence:\n", X_train_seq[0])


Sample tokenized and padded sequence:
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0   35    7   42   15    7 2557 4742   10    9    3   91   68
  956 4001  210   16   30    3    9    1   31   13    3    5   42  126
  210   31   15 1506   25    5   75   15 2557  233    7  270  247  151
  103    1]


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, GlobalMaxPool1D

EMBEDDING_DIM = 100
model = Sequential([
    Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(mlb.classes_), activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()




In [11]:
model = Sequential([
    Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(mlb.classes_), activation='sigmoid')
])



In [12]:
model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))


In [13]:
model.summary()


In [14]:
print("Shape of training data:", X_train_seq.shape)
print("Shape of validation data:", X_val_seq.shape)


Shape of training data: (1011372, 128)
Shape of validation data: (252844, 128)


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, GlobalMaxPool1D
EMBEDDING_DIM = 100
model = Sequential([
    Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(mlb.classes_), activation='sigmoid')
])

model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [16]:
history = model.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=1,
    batch_size=64,
    verbose=1
)


[1m15803/15803[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6051s[0m 383ms/step - accuracy: 0.3939 - loss: 0.1296 - val_accuracy: 0.4630 - val_loss: 0.0887


In [18]:
y_pred = model.predict(X_val_seq)
y_pred_binary = (y_pred > 0.5).astype(int)



[1m7902/7902[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 47ms/step


In [19]:
from sklearn.metrics import classification_report, hamming_loss

# Hamming Loss
print(f"Hamming Loss: {hamming_loss(y_val, y_pred_binary)}")

# Classification Report
print("Classification Report:\n")
print(classification_report(y_val, y_pred_binary, target_names=mlb.classes_))


Hamming Loss: 0.0354475486861464
Classification Report:

              precision    recall  f1-score   support

     android       0.91      0.87      0.89     18120
          c#       0.73      0.60      0.65     20213
         c++       0.78      0.61      0.68      9680
        html       0.64      0.27      0.38     11938
         ios       0.68      0.71      0.69      9413
        java       0.79      0.61      0.69     22993
  javascript       0.73      0.52      0.61     25073
      jquery       0.80      0.66      0.72     15827
         php       0.81      0.73      0.77     19922
      python       0.85      0.76      0.80     12927

   micro avg       0.78      0.64      0.70    166106
   macro avg       0.77      0.63      0.69    166106
weighted avg       0.78      0.64      0.69    166106
 samples avg       0.39      0.37      0.38    166106



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
print(history.history)

{'accuracy': [0.43771034479141235], 'loss': [0.10585333406925201], 'val_accuracy': [0.46295344829559326], 'val_loss': [0.08866622298955917]}
