In [60]:
import pandas as pd
import re
from collections import Counter

In [61]:
# Read the per-framework post exports in a fixed order: Keras, TensorFlow, PyTorch.
post_frames = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../Dataset/Keras_Posts.csv',
        '../Dataset/Tensorflow_Posts.csv',
        '../Dataset/Pytorch_Posts.csv',
    )
]
keras, tensorflow, pytorch = post_frames

# Stack the three frames into one table with a fresh 0..n-1 row index.
df = pd.concat(post_frames, ignore_index=True)

In [62]:
# Pull every <tag> out of each post's Tags string (one list of tags per post).
tag_pattern = re.compile(r'<([^>]*)>')
tags = [tag_pattern.findall(tag_string) for tag_string in df['Tags']]

# Tally the tags across all posts, in first-seen order.
tag_counts = Counter()
for post_tags in tags:
    tag_counts.update(post_tags)

# Most frequent first; equal counts keep their first-seen order.
sorted_counts = tag_counts.most_common()

# Print one "tag: count" line per tag.
for tag, count in sorted_counts:
    print(f"{tag}: {count}")

python: 4641
pytorch: 3544
tensorflow: 2985
deep-learning: 1059
machine-learning: 835
keras: 737
neural-network: 556
conv-neural-network: 480
tensorflow2.0: 476
python-3.x: 407
tensor: 391
numpy: 366
lstm: 201
tensorflow-datasets: 191
huggingface-transformers: 183
tensorflow-lite: 179
nlp: 166
computer-vision: 161
gpu: 157
tensorflow.js: 152
torch: 146
google-colaboratory: 125
pytorch-lightning: 116
object-detection: 112
pandas: 108
pytorch-dataloader: 104
javascript: 98
loss-function: 97
bert-language-model: 97
recurrent-neural-network: 92
torchvision: 91
classification: 90
artificial-intelligence: 82
image-processing: 79
dataset: 77
scikit-learn: 75
autoencoder: 75
opencv: 68
jupyter-notebook: 68
autograd: 66
object-detection-api: 64
android: 64
c++: 64
tensorflow-federated: 61
reinforcement-learning: 60
arrays: 60
image: 58
mnist: 57
generative-adversarial-network: 56
anaconda: 54
onnx: 52
image-classification: 51
dataloader: 51
tf.keras: 49
google-cloud-platform: 49
training-data: 

In [63]:
# Save the sorted tag counts as a two-column CSV (Tag, Count).
# A dedicated name is used so the merged posts frame `df` is not replaced
# by this unrelated summary table.
tag_counts_df = pd.DataFrame(sorted_counts, columns=['Tag', 'Count'])
tag_counts_df.to_csv('../Dataset/Tag_Counts.csv', index=False)

## Taxonomy

* Model
    * Missing/Redundant/Wrong Layer
    * Layer Properties
    * Activation Function
* Tensor & Inputs
    * Wrong Tensor Shape
    * Wrong Input
* Training
    * Hyperparameters
    * Loss Function
    * Preprocessing of Training Data
    * Optimiser
    * Training Data Quality
    * Training Process
* GPU
* API

### Types of Networks covered

* Convolutional Neural Network (CNN): 480
* Long Short-Term Memory (LSTM): 201
* Recurrent Neural Network (RNN): 92
* Generative Adversarial Network (GAN): 56
* Autoencoder: 75
* Siamese Network: 9
* Graph Neural Network (GNN): 5
* Multilayer Perceptron (MLP): 11
* Self-Attention: 3

### Relevant Tags for Different Types of Bugs

* Tensor: 391
* NumPy: 366
* GPU: 157
* Loss Function: 97
* Dataset: 77
* Arrays: 60
* Training Data: 49
* NVIDIA: 44
* CUDA: 43
* Optimization: 39
* Cross Entropy: 36
* Backpropagation: 33
* Model: 30
* Gradient: 30
* Data Augmentation: 25
* Loss: 25
* Reshape: 23
* Activation Function: 19
* Performance: 18
* Dropout: 17
* Type Error: 14
* Runtime Error: 13
* Layer: 13
* Memory: 13
* Out of Memory: 12
* Normalization: 12
* Dimensions: 10
* Learning Rate: 9
* CPU: 7
* NaN: 7
* Memory Leaks: 6
* Hyperparameters: 5
* Segmentation Fault: 5
* Initialization: 5
* Imbalanced Data: 3

### Bugs and their Tags

Model: Gradient, Layer, Model, Activation Function

Tensor & Input: Tensor

Training: Loss Function, Training Data, Optimization, Cross Entropy, Backpropagation, Loss, Data Augmentation, Performance, Dropout, Learning Rate, Hyperparameters, Initialization, Imbalanced Data, Runtime Error, Segmentation Fault, NaN

GPU: GPU, NVIDIA, CUDA, Out of Memory, Memory Leaks

API: Type Error, Value Error, Attribute Error, Import Error, Compiler Error, Syntax Error, Module Not Found Error

In [64]:
# Reload the three post CSVs and rebuild the merged frame.
keras = pd.read_csv('../Dataset/Keras_Posts.csv')
tensorflow = pd.read_csv('../Dataset/Tensorflow_Posts.csv')
pytorch = pd.read_csv('../Dataset/Pytorch_Posts.csv')

# Merge these dataframes
df = pd.concat([keras, tensorflow, pytorch], ignore_index=True)

# Replace both tag delimiters ('<' and '>') with spaces in a single pass.
# Two separate assignments that each start from df['Tags'] would let the
# second overwrite the first, leaving one delimiter in place. `regex=True`
# is explicit because the default of `str.replace` differs across pandas versions.
df['Spaced_Tags'] = df['Tags'].str.replace(r'[<>]', ' ', regex=True)

# Drop the 'Unnamed: 3' column and rename 'Unnamed: 0' to 'Links'.
df = df.drop(columns=['Unnamed: 3']).rename(columns={'Unnamed: 0': 'Links'})

In [65]:
# Create a new column Id holding the last path segment of each link
# (the text after the final '/'), computed for the whole column at once
# instead of row by row. Values are kept as the strings produced by the split.
df['Id'] = df['Links'].str.rsplit('/', n=1).str[-1]

In [66]:
def get_posts(tags, df):
    r"""Return the rows of ``df`` whose ``Spaced_Tags`` text mentions any of ``tags``.

    Each tag is matched literally (regex metacharacters are escaped) between
    word boundaries (``\b``). Because ``-`` is not a word character, a tag such
    as ``model`` also matches inside hyphenated tags like ``pre-trained-model``.

    Args:
        tags: Iterable of tag strings to look for.
        df: DataFrame with a string column ``Spaced_Tags``.

    Returns:
        The matching rows of ``df`` with their original index. When ``tags`` is
        empty the result is an empty selection with the same columns. Rows whose
        ``Spaced_Tags`` value is missing never match.
    """
    escaped = [re.escape(tag) for tag in tags]
    if not escaped:
        # Nothing to search for: return no rows rather than indexing with `False`.
        return df.iloc[0:0]

    # One alternation is equivalent to OR-ing a separate search per tag.
    pattern = r'\b(?:{})\b'.format('|'.join(escaped))
    condition = df['Spaced_Tags'].str.contains(pattern, regex=True, na=False)
    return df[condition]

# Tag keywords that define each bug category.
# NOTE(review): the taxonomy notes in this notebook also list tags such as
# gradient, cross-entropy, dropout and out-of-memory that are not searched
# here — confirm the omission is intended.
model_tags = ['layer', 'model', 'activation-function']
tensor_tags = ['tensor']
training_tags = [
    'loss-function', 'training-data', 'optimization', 'loss',
    'data-augmentation', 'performance', 'learning-rate', 'hyperparameters',
    'initialization', 'imbalanced-data', 'nan',
]
gpu_tags = ['gpu', 'nvidia', 'cuda']
api_tags = [
    'typeerror', 'valueerror', 'attributeerror', 'importerror',
    'compilererrors', 'syntaxerror', 'modulenotfounderror',
]

# Select the posts matching each category's keywords.
model_bugs = get_posts(model_tags, df)
tensor_bugs = get_posts(tensor_tags, df)
training_bugs = get_posts(training_tags, df)
gpu_bugs = get_posts(gpu_tags, df)
api_bugs = get_posts(api_tags, df)

In [67]:
# Report how many posts were selected for each category.
print(
    f'Number of Model Bugs: {len(model_bugs)}',
    f'Number of Tensor & Input Bugs: {len(tensor_bugs)}',
    f'Number of Training Bugs: {len(training_bugs)}',
    f'Number of GPU Bugs: {len(gpu_bugs)}',
    f'Number of API Bugs: {len(api_bugs)}',
    sep='\n',
)

Number of Model Bugs: 280
Number of Tensor & Input Bugs: 391
Number of Training Bugs: 274
Number of GPU Bugs: 239
Number of API Bugs: 58


In [68]:
# Key columns used to decide that two rows are the same post.
post_keys = ['Id', 'Links', 'Title', 'Tags', 'Spaced_Tags']


def _common_posts(left, right):
    """Return the rows present in both ``left`` and ``right`` (inner merge on the post keys)."""
    return pd.merge(left, right, how='inner', on=post_keys)


# Overlap of the model category with every other category.
model_tensor = _common_posts(model_bugs, tensor_bugs)
model_training = _common_posts(model_bugs, training_bugs)
model_gpu = _common_posts(model_bugs, gpu_bugs)
model_api = _common_posts(model_bugs, api_bugs)

# Overlap of the tensor category with the remaining categories.
tensor_training = _common_posts(tensor_bugs, training_bugs)
tensor_gpu = _common_posts(tensor_bugs, gpu_bugs)
tensor_api = _common_posts(tensor_bugs, api_bugs)

# Overlap of the training category with the remaining categories.
training_gpu = _common_posts(training_bugs, gpu_bugs)
training_api = _common_posts(training_bugs, api_bugs)

# Overlap between the GPU and API categories.
gpu_api = _common_posts(gpu_bugs, api_bugs)

# Report the size of every pairwise overlap.
print(
    f'Number of Common Posts between Model & Tensor: {len(model_tensor)}',
    f'Number of Common Posts between Model & Training: {len(model_training)}',
    f'Number of Common Posts between Model & GPU: {len(model_gpu)}',
    f'Number of Common Posts between Model & API: {len(model_api)}',
    f'Number of Common Posts between Tensor & Training: {len(tensor_training)}',
    f'Number of Common Posts between Tensor & GPU: {len(tensor_gpu)}',
    f'Number of Common Posts between Tensor & API: {len(tensor_api)}',
    f'Number of Common Posts between Training & GPU: {len(training_gpu)}',
    f'Number of Common Posts between Training & API: {len(training_api)}',
    f'Number of Common Posts between GPU & API: {len(gpu_api)}',
    sep='\n',
)

Number of Common Posts between Model & Tensor: 7
Number of Common Posts between Model & Training: 14
Number of Common Posts between Model & GPU: 2
Number of Common Posts between Model & API: 1
Number of Common Posts between Tensor & Training: 8
Number of Common Posts between Tensor & GPU: 9
Number of Common Posts between Tensor & API: 0
Number of Common Posts between Training & GPU: 3
Number of Common Posts between Training & API: 3
Number of Common Posts between GPU & API: 0


In [69]:
# Stack the five category frames, then discard every post whose Id occurs
# more than once. keep=False removes all copies, so a post that appears in
# more than one category frame is dropped entirely rather than kept once.
df = pd.concat(
    [model_bugs, tensor_bugs, training_bugs, gpu_bugs, api_bugs],
    ignore_index=True,
).drop_duplicates(subset=['Id'], keep=False)

In [70]:
# Re-run the category selection on the de-duplicated frame, using the same
# tag keywords per category as the first selection.
model_tags = ['layer', 'model', 'activation-function']
tensor_tags = ['tensor']
training_tags = [
    'loss-function', 'training-data', 'optimization', 'loss',
    'data-augmentation', 'performance', 'learning-rate', 'hyperparameters',
    'initialization', 'imbalanced-data', 'nan',
]
gpu_tags = ['gpu', 'nvidia', 'cuda']
api_tags = [
    'typeerror', 'valueerror', 'attributeerror', 'importerror',
    'compilererrors', 'syntaxerror', 'modulenotfounderror',
]

model_bugs = get_posts(model_tags, df)
tensor_bugs = get_posts(tensor_tags, df)
training_bugs = get_posts(training_tags, df)
gpu_bugs = get_posts(gpu_tags, df)
api_bugs = get_posts(api_tags, df)

In [71]:
# Report the per-category counts after multi-category posts were removed.
print(
    f'Number of Unique Model Bugs: {len(model_bugs)}',
    f'Number of Unique Tensor & Input Bugs: {len(tensor_bugs)}',
    f'Number of Unique Training Bugs: {len(training_bugs)}',
    f'Number of Unique GPU Bugs: {len(gpu_bugs)}',
    f'Number of Unique API Bugs: {len(api_bugs)}',
    sep='\n',
)

Number of Unique Model Bugs: 258
Number of Unique Tensor & Input Bugs: 369
Number of Unique Training Bugs: 248
Number of Unique GPU Bugs: 225
Number of Unique API Bugs: 54


In [72]:
# Total number of posts across the five category frames.
category_frames = (model_bugs, tensor_bugs, training_bugs, gpu_bugs, api_bugs)
total_len = sum(len(frame) for frame in category_frames)

# Report the total and each category's share of it, rounded to two decimals.
print(
    f'Total Number of Unique Bugs: {total_len}',
    f'Percentage of Model Bugs: {round((len(model_bugs)/total_len)*100, 2)}%',
    f'Percentage of Tensor & Input Bugs: {round((len(tensor_bugs)/total_len)*100, 2)}%',
    f'Percentage of Training Bugs: {round((len(training_bugs)/total_len)*100, 2)}%',
    f'Percentage of GPU Bugs: {round((len(gpu_bugs)/total_len)*100, 2)}%',
    f'Percentage of API Bugs: {round((len(api_bugs)/total_len)*100, 2)}%',
    sep='\n',
)

Total Number of Unique Bugs: 1154
Percentage of Model Bugs: 22.36%
Percentage of Tensor & Input Bugs: 31.98%
Percentage of Training Bugs: 21.49%
Percentage of GPU Bugs: 19.5%
Percentage of API Bugs: 4.68%


In [77]:
# Draw a fixed-size, seeded random sample from each category and label every
# sampled row with its category in a new 'Type' column.
# The per-category sizes below add up to 251 posts (56 + 80 + 54 + 49 + 12).
# NOTE(review): the original comment on this cell targeted 250 — confirm which total is intended.
model_bugs = model_bugs.sample(n=56, random_state=42).assign(Type='model')
tensor_bugs = tensor_bugs.sample(n=80, random_state=42).assign(Type='tensor')
training_bugs = training_bugs.sample(n=54, random_state=42).assign(Type='training')
gpu_bugs = gpu_bugs.sample(n=49, random_state=42).assign(Type='gpu')
api_bugs = api_bugs.sample(n=12, random_state=42).assign(Type='api')

# Stack the labelled samples and drop the helper column used only for tag matching.
df = pd.concat(
    [model_bugs, tensor_bugs, training_bugs, gpu_bugs, api_bugs],
    ignore_index=True,
).drop(columns=['Spaced_Tags'])

In [78]:
# Split each sampled post's Tags string into its individual <tag> names.
tag_pattern = re.compile(r'<([^>]*)>')
tags = [tag_pattern.findall(tag_string) for tag_string in df['Tags']]

# Count how often each tag occurs across the sampled posts.
tag_counts = Counter(name for post_tags in tags for name in post_tags)

# Highest count first; equal counts keep their first-seen order.
sorted_counts = tag_counts.most_common()

# Print one "tag: count" line per tag.
for tag, count in sorted_counts:
    print(f"{tag}: {count}")

python: 146
pytorch: 144
tensor: 80
tensorflow: 68
gpu: 32
keras: 29
deep-learning: 27
machine-learning: 24
loss-function: 20
neural-network: 17
bert-language-model: 17
conv-neural-network: 13
huggingface-transformers: 11
nvidia: 11
nlp: 10
loss: 10
training-data: 10
python-3.x: 9
numpy: 9
cuda: 9
torch: 8
transformer-model: 7
tensorflow2.0: 7
attention-model: 6
keras-layer: 6
lstm: 6
pre-trained-model: 6
tensorflow-model-garden: 5
google-colaboratory: 5
optimization: 5
activation-function: 4
autoencoder: 4
c++: 4
pytorch-lightning: 4
mse: 4
attributeerror: 4
typeerror: 4
relu: 3
matrix: 3
language-model: 3
word-embedding: 3
tf.keras: 3
tensorflow-datasets: 3
tensorflow.js: 3
memory: 3
pytorch-dataloader: 3
computer-vision: 3
data-augmentation: 3
performance: 3
ubuntu: 3
valueerror: 3
multiclass-classification: 2
normalization: 2
model: 2
text-classification: 2
classification: 2
embedding: 2
layer: 2
torchscript: 2
gpt-2: 2
scikit-learn: 2
javascript: 2
generative-adversarial-network: 

In [79]:
# Preview the first rows of the combined, labelled sample.
df.head()

Unnamed: 0,Links,Title,Tags,Id,Type
0,https://stackoverflow.com/questions/65918888,Mixture parameters from a TensorFlow Probabili...,<python><tensorflow><neural-network><mixture-m...,65918888,model
1,https://stackoverflow.com/questions/63478947,Correct Way to Fine-Tune/Train HuggingFace's M...,<python><pytorch><bert-language-model><hugging...,63478947,model
2,https://stackoverflow.com/questions/67356013,Multiple Activation Functions for multiple Lay...,<tensorflow><neural-network><multiclass-classi...,67356013,model
3,https://stackoverflow.com/questions/65228352,Matrix inverse approximation with keras dense ...,<python><matrix><keras><neural-network><multi-...,65228352,model
4,https://stackoverflow.com/questions/72489570,Getting random output every time on running Ne...,<nlp><pytorch><huggingface-transformers><bert-...,72489570,model


In [80]:
# Write the sampled, labelled posts to CSV without the row index.
df.to_csv('../Dataset/DL_Bugs.csv', index=False)