In [59]:
import tensorflow as tf

# Cap how much memory TensorFlow may allocate on the first physical GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # One virtual device with memory_limit=4096 is carved out of gpus[0].
        first_gpu_layout = [
            tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)
        ]
        tf.config.experimental.set_virtual_device_configuration(gpus[0], first_gpu_layout)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Virtual devices must be set before GPUs have been initialized
        print(err)


1 Physical GPUs, 1 Logical GPUs


In [60]:
import pandas as pd

In [61]:
def check_NA(train_DF, test_DF):
    """Print the per-column count of missing values for both frames.

    Args:
        train_DF: Frame whose null counts are printed first.
        test_DF: Frame whose null counts are printed second.
    """
    for frame in (train_DF, test_DF):
        missing_per_column = frame.isnull().sum()
        print(missing_per_column)

def class_test_distribution(train_DF):
    """Print how many rows carry each distinct value of the 'severity' column.

    Args:
        train_DF: Frame that must contain a 'severity' column.
    """
    severity_counts = train_DF['severity'].value_counts()
    print(severity_counts)

def main():
    """Load the train and test CSVs, then print null counts and the severity distribution."""
    # Files are read in the same order as before: train first, then test.
    train_DF, test_DF = (pd.read_csv(path) for path in ('bugs-train.csv', 'bugs-test.csv'))
    check_NA(train_DF, test_DF)
    class_test_distribution(train_DF)

# Run the exploratory checks when this code executes as the top-level module.
# The captured output below this cell shows that the guard passes in the notebook kernel.
if __name__ == '__main__':
    main()

bug_id      0
summary     0
severity    0
dtype: int64
bug_id     0
summary    0
dtype: int64
severity
normal         125854
critical        18658
major            6053
enhancement      4426
minor            3102
trivial          1204
blocker           701
Name: count, dtype: int64


In [4]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
# Fetch the NLTK resources that clean_summary relies on.
nltk.download('punkt')      # tokenizer data for nltk.word_tokenize
nltk.download('stopwords')  # word lists read by stopwords.words('english')
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kagan_ntaijui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kagan_ntaijui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kagan_ntaijui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [76]:
# Lazily-built resources shared by every clean_summary call.
_CLEAN_SUMMARY_STOPWORDS = None
_CLEAN_SUMMARY_LEMMATIZER = None


def _clean_summary_resources():
    """Return the (stopword set, lemmatizer) pair, creating it on first use."""
    global _CLEAN_SUMMARY_STOPWORDS, _CLEAN_SUMMARY_LEMMATIZER
    if _CLEAN_SUMMARY_STOPWORDS is None:
        # A frozenset gives constant-time membership tests; the original
        # re-evaluated stopwords.words('english') for every single token and
        # searched the resulting list linearly.
        _CLEAN_SUMMARY_STOPWORDS = frozenset(stopwords.words('english'))
        _CLEAN_SUMMARY_LEMMATIZER = WordNetLemmatizer()
    return _CLEAN_SUMMARY_STOPWORDS, _CLEAN_SUMMARY_LEMMATIZER


def clean_summary(summary):
    """Normalise one bug summary into a space-separated string of lemmas.

    Steps, in order: lower-case the text, delete every ``[@ ... ]`` span,
    delete every character that is neither a word character nor whitespace,
    tokenize with ``nltk.word_tokenize``, drop English stopwords, lemmatize
    each remaining token with ``WordNetLemmatizer`` and join with spaces.

    Args:
        summary: The raw summary text (a ``str``).

    Returns:
        The cleaned summary as a single string; empty if no token survives.
    """
    stop_words, lemmatizer = _clean_summary_resources()
    summary = summary.lower()
    summary = re.sub(r'\[@.*?\]', '', summary)
    summary = re.sub(r'[^\w\s]', '', summary)
    tokens = nltk.word_tokenize(summary)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Load the training data and normalise every summary with clean_summary.
train_df = pd.read_csv('bugs-train.csv')
train_df['summary'] = train_df['summary'].apply(clean_summary)

# Integer id assigned to each severity label (position in the tuple is the id).
label_mapping = {
    label: class_id
    for class_id, label in enumerate(
        ('normal', 'critical', 'major', 'enhancement', 'minor', 'trivial', 'blocker')
    )
}

# Replace the severity strings with their integer ids.
train_df['severity'] = train_df['severity'].map(label_mapping)

train_df.head()

Unnamed: 0,bug_id,summary,severity
0,365569,remove workaround bug 297227,0
1,365578,print preview crash url gtk2 build,1
2,365582,line showing table,2
3,365584,firefox render ûïsimplified arabicû font face ...,0
4,365597,crash,1


In [71]:
def clean_summary_more(summary):
    """Strip digits from a summary and drop the tokens that end up too short.

    Every run of digits is removed first; the text is then split on
    whitespace and only tokens longer than two characters are kept.

    Args:
        summary: Summary text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    without_digits = re.sub(r'\d+', '', summary)
    kept_tokens = []
    for token in without_digits.split():
        if len(token) > 2:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run the second cleaning pass over the 'summary' column, then preview the frame
# (head(-5) shows every row except the last five).
summaries_pass_two = train_df['summary'].apply(clean_summary_more)
train_df['summary'] = summaries_pass_two
train_df.head(-5)

Unnamed: 0,bug_id,summary,severity
0,365569,remove workaround bug,0
1,365578,print preview crash url build,1
2,365582,line showing table,2
3,365584,firefox render ûïsimplified arabicû font face ...,0
4,365597,crash,1
...,...,...,...
159988,1143339,mac crash second closing youtube tab,1
159989,1143343,audio play using createmediaelementsource cors,0
159990,1143349,crash nsinodegetparent,1
159991,1143352,ajax xmlhttprequest post max,0


In [77]:
# Weight each severity class by its inverse relative frequency, then rescale
# the weights so that they sum to 1.
class_frequencies = train_df['severity'].value_counts(normalize=True)
inverse_frequencies = 1 / class_frequencies
class_weights = (inverse_frequencies / inverse_frequencies.sum()).to_dict()
print(class_weights)

{0: 0.002620481829584843, 1: 0.017675963135414884, 2: 0.05448506859087574, 3: 0.07451380934942857, 4: 0.10631789818844967, 5: 0.2739187044689127, 6: 0.4704680744373336}


In [78]:
import keras

In [79]:
# Preview the first rows of train_df.
train_df.head()

Unnamed: 0,bug_id,summary,severity
0,365569,remove workaround bug 297227,0
1,365578,print preview crash url gtk2 build,1
2,365582,line showing table,2
3,365584,firefox render ûïsimplified arabicû font face ...,0
4,365597,crash,1


In [89]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split

# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(texts, max_length=64):
    """Tokenize a list of texts into fixed-length NumPy encodings.

    Args:
        texts: List of strings to encode.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 64, the value this notebook previously hard-coded.

    Returns:
        The tokenizer output; callers read its 'input_ids' and
        'attention_mask' entries.
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="np",
    )

# Encode every summary in a single tokenizer call.
encodings = tokenize_function(train_df['summary'].tolist())

# Pull out the arrays the model consumes, plus the integer severity labels.
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
labels = train_df['severity'].values

# Stratified 80/20 split; all three arrays are split with the same indices.
(
    input_ids_train, input_ids_val,
    attention_mask_train, attention_mask_val,
    y_train, y_val,
) = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Labels become tensors before they are sliced into datasets.
y_train = tf.convert_to_tensor(y_train)
y_val = tf.convert_to_tensor(y_val)


def _to_dataset(ids, mask, targets):
    """Pair the model inputs (as a dict) with their targets in a tf.data.Dataset."""
    features = {'input_ids': ids, 'attention_mask': mask}
    return tf.data.Dataset.from_tensor_slices((features, targets))


# Only the training split is shuffled (buffer spans the whole split); both are batched.
batch_size = 16  # Reduced batch size
train_dataset = (
    _to_dataset(input_ids_train, attention_mask_train, y_train)
    .shuffle(len(input_ids_train))
    .batch(batch_size)
)
val_dataset = _to_dataset(input_ids_val, attention_mask_val, y_val).batch(batch_size)

In [None]:
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertConfig
from tensorflow.keras import mixed_precision
from tensorflow.keras.callbacks import EarlyStopping

# Custom train/test steps that accept both (x, y) and (x, y, sample_weight) batches
class CustomTFDistilBertForSequenceClassification(TFDistilBertForSequenceClassification):
    """DistilBERT sequence classifier with hand-written Keras train/test steps.

    The steps feed the model's ``logits`` to the compiled loss and metrics and
    expect ``self.optimizer`` to expose ``get_scaled_loss`` and
    ``get_unscaled_gradients`` (i.e. to be a ``LossScaleOptimizer``).
    """

    def train_step(self, data):
        """Run one optimisation step on a batch and return the metric results.

        Args:
            data: A batch shaped ``(x, y)`` or ``(x, y, sample_weight)``.

        Returns:
            Dict mapping each metric name to its current result.
        """
        # fit(class_weight=...) turns every batch into (x, y, sample_weight);
        # a plain `x, y = data` cannot unpack that, and ignoring the third
        # element would silently drop the class weighting.
        x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
        with tf.GradientTape() as tape:
            # The loss is built with from_logits=True, so hand it the logits
            # tensor rather than the whole model output object.
            logits = self(x, training=True).logits
            loss = self.compiled_loss(
                y, logits, sample_weight, regularization_losses=self.losses
            )
            scaled_loss = self.optimizer.get_scaled_loss(loss)

        scaled_gradients = tape.gradient(scaled_loss, self.trainable_variables)
        gradients = self.optimizer.get_unscaled_gradients(scaled_gradients)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        self.compiled_metrics.update_state(y, logits, sample_weight)
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        """Evaluate one batch and return the metric results.

        Args:
            data: A batch shaped ``(x, y)`` or ``(x, y, sample_weight)``.

        Returns:
            Dict mapping each metric name to its current result.
        """
        x, y, sample_weight = tf.keras.utils.unpack_x_y_sample_weight(data)
        logits = self(x, training=False).logits
        # Called for its side effect: it updates the tracked loss metric.
        self.compiled_loss(y, logits, sample_weight, regularization_losses=self.losses)

        self.compiled_metrics.update_state(y, logits, sample_weight)
        return {m.name: m.result() for m in self.metrics}

# DistilBERT configuration: one output label per distinct severity value, dropout 0.5.
num_severity_classes = len(train_df['severity'].unique())
config = DistilBertConfig.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_severity_classes,
    dropout=0.5,
)

# Start from the pretrained checkpoint, using the custom train/test steps.
model = CustomTFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', config=config
)

# Adam is wrapped in a LossScaleOptimizer because the custom train_step calls
# get_scaled_loss / get_unscaled_gradients on the optimizer. Note that this
# cell does not set a global mixed-precision policy.
base_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
optimizer = mixed_precision.LossScaleOptimizer(base_optimizer)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Stop once val_loss has failed to improve for two consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# Fine-tune for at most four epochs, weighting samples by class_weights.
model.fit(
    train_dataset,
    epochs=4,
    validation_data=val_dataset,
    callbacks=[early_stopping],
    class_weight=class_weights,
)

# Export the fine-tuned model with save_pretrained.
model.save_pretrained('last_chance_distilbert_model')


In [53]:
# Save the trained model
# Persists `model` through Keras' model.save under 'my_distilbert_model_final_v2'.
# NOTE(review): the captured log below mentions 'my_distilbert_model_final', so
# that output appears to come from an earlier run of this cell — confirm.
model.save('my_distilbert_model_final_v2')

























INFO:tensorflow:Assets written to: my_distilbert_model_final\assets


INFO:tensorflow:Assets written to: my_distilbert_model_final\assets


In [56]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pandas as pd

# Read the test set and run both cleaning passes over its summaries,
# clean_summary first and clean_summary_more second.
test_df = pd.read_csv('bugs-test.csv')
test_df['summary'] = (
    test_df['summary']
    .apply(clean_summary)
    .apply(clean_summary_more)
)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def preprocess_inference_data(texts, max_length=64):
    """Tokenize inference texts into fixed-length TensorFlow tensors.

    Args:
        texts: List of strings to encode.
        max_length: Length every sequence is padded or truncated to. The
            default of 64 matches the length used when tokenizing the training
            data in this notebook (this cell previously padded to 128). Pass
            ``max_length=128`` to get the old shape back.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    return encodings['input_ids'], encodings['attention_mask']

# Preprocess the test data
test_input_ids, test_attention_mask = preprocess_inference_data(test_df['summary'].tolist())

# Make predictions: the class with the highest logit wins for each row.
predictions = model.predict({'input_ids': test_input_ids, 'attention_mask': test_attention_mask})
predicted_classes = tf.argmax(predictions.logits, axis=1).numpy()

# Inverse of the training-time encoding: integer class id -> severity name.
# Must stay in sync with the label -> id mapping used to encode train_df.
label_mapping = {
    0: 'normal',
    1: 'critical',
    2: 'major',
    3: 'enhancement',
    4: 'minor',
    5: 'trivial',
    6: 'blocker'
}

predicted_labels = [label_mapping[pred] for pred in predicted_classes]

# Create a DataFrame with the predicted severity classes, and the bug id
output_df = pd.DataFrame({'bug_id': test_df['bug_id'], 'severity': predicted_labels})

# Save the predictions to a CSV file. The path is held in one variable so the
# confirmation message always names the file that was actually written (the
# message used to say 'predicted_severity.csv').
output_path = 'predicted_severity_final.csv'
output_df.to_csv(output_path, index=False)
print(f"Predictions saved to '{output_path}'.")


Predictions saved to 'predicted_severity.csv'.
