In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import gensim.downloader as api


In [3]:
# Load the severity training and testing datasets
train_df = pd.read_csv('/kaggle/input/bugzilla-bug-reports/sev_train.csv')
test_df = pd.read_csv('/kaggle/input/bugzilla-bug-reports/sev_test.csv')

# Preview the data
print("Training Data:")
print(train_df.head())

print("Testing Data:")
print(test_df.head())


Training Data:
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                         Description Severity  Label  
0   Created by Darrell Kindred  dkindred mozilla ...    minor      0  
1   Created by Darrell Kindred  dkindred mozilla ...   normal      0  
2   Created by    msuencks marcant de  on Tuesday...   normal      0  
3   Created by Darrell Kindred  dkindred mozilla ...    major      1  
4   Created by Darrell Kindred  dkindred mozilla ...  trivial      0  
Testing Data:
   Unnamed: 0.1  Unnamed: 0  \
0         28001       28507   
1         28002       28508   
2         28003       28509   
3         28004       28510   
4         28005       28511   

                                         Description Severity  Label  
0   When prompted by authentication dialog box fo...    major      1  
1   Hyatt and I talked about 

In [4]:
# Select relevant columns for training and testing
train_df = train_df[['Description', 'Severity', 'Label']]
test_df = test_df[['Description', 'Severity', 'Label']]

# Preprocess the text data
def preprocess_text(text):
    return ''.join(char.lower() for char in text if char.isalnum() or char.isspace())

train_df['Description'] = train_df['Description'].apply(preprocess_text)
test_df['Description'] = test_df['Description'].apply(preprocess_text)

# Features (X) and target (y)
X_train = train_df['Description']
y_train = train_df['Label']

X_test = test_df['Description']
y_test = test_df['Label']


In [5]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

print("TF-IDF Train Shape:", X_train_tfidf.shape)
print("TF-IDF Test Shape:", X_test_tfidf.shape)


TF-IDF Train Shape: (28000, 5000)
TF-IDF Test Shape: (4427, 5000)


In [7]:
# Load pretrained embeddings
embedding_path = '/kaggle/input/bugzilla-bug-reports/embedding.npy'
vocab_path = '/kaggle/input/bugzilla-bug-reports/vocab.lst'

embeddings = np.load(embedding_path)
with open(vocab_path, 'r') as f:
    vocab = f.read().splitlines()

word_to_index = {word: idx for idx, word in enumerate(vocab)}

def tfidf_weighted_word2vec(tfidf_matrix, feature_names, texts):
    feature_index = {word: i for i, word in enumerate(feature_names)}  # Create a lookup dictionary
    vectorized_texts = []
    
    for idx, text in enumerate(texts):
        words = text.split()
        text_vec = np.zeros(embeddings.shape[1])  # Embedding dimension
        weight_sum = 0
        
        for word in words:
            if word in word_to_index:
                word_idx = word_to_index[word]
                tfidf_weight = tfidf_matrix[idx, feature_index.get(word, 0)]
                text_vec += tfidf_weight * embeddings[word_idx]
                weight_sum += tfidf_weight
        
        if weight_sum != 0:
            text_vec /= weight_sum
        vectorized_texts.append(text_vec)
    
    return np.array(vectorized_texts)


feature_names = tfidf_vectorizer.get_feature_names_out()
X_train_w2v = tfidf_weighted_word2vec(X_train_tfidf, feature_names, X_train)
X_test_w2v = tfidf_weighted_word2vec(X_test_tfidf, feature_names, X_test)

print("TF-IDF Weighted Word2Vec Train Shape:", X_train_w2v.shape)
print("TF-IDF Weighted Word2Vec Test Shape:", X_test_w2v.shape)


TF-IDF Weighted Word2Vec Train Shape: (28000, 100)
TF-IDF Weighted Word2Vec Test Shape: (4427, 100)


In [9]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Combine TF-IDF and Word2Vec features
X_train_combined = np.hstack([X_train_tfidf, X_train_w2v])
X_test_combined = np.hstack([X_test_tfidf, X_test_w2v])

# Initialize XGBoost Classifier
xgb_model = XGBClassifier(
    objective='multi:softmax',  # For multi-class classification
    num_class=len(np.unique(y_train)),  # Number of classes
    max_depth=6,  # Depth of the trees
    learning_rate=0.1,  # Learning rate
    n_estimators=100,  # Number of boosting rounds (trees)
    random_state=42
)

# Train the model
xgb_model.fit(X_train_combined, y_train)

# Predict on the test data
y_pred = xgb_model.predict(X_test_combined)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8210978088999322

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.89      3419
           1       0.75      0.32      0.45      1008

    accuracy                           0.82      4427
   macro avg       0.79      0.64      0.67      4427
weighted avg       0.81      0.82      0.79      4427


Confusion Matrix:
 [[3315  104]
 [ 688  320]]
