# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

# Read the training set and the test set from the working directory
train_data = pd.read_csv('bugs-train.csv')
test_data = pd.read_csv('bugs-test.csv')

# The 'summary' column is the model input; 'severity' is the target
X_train, y_train = train_data['summary'], train_data['severity']
X_test = test_data['summary']

# Map the severity labels to integer codes
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
(
    X_train_split,
    X_val_split,
    y_train_split,
    y_val_split,
) = train_test_split(
    X_train,
    y_train_encoded,
    test_size=0.2,
    random_state=42,
)

# TF-IDF with default settings. The vocabulary is learned from the training
# split only and then reused to vectorize the validation and test texts.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train_split)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (X_val_split, X_test)
)

# Base learners for the stack; seeds are fixed so runs are repeatable
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# (name, estimator) pairs in the form StackingClassifier expects
estimators = [
    ('log_reg', log_reg),
    ('rf', rf),
    ('gb', gb),
]

# Combine the base learners with a logistic-regression final estimator
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
)

# Fit on the training split only, so the validation split stays unseen
stacking_clf.fit(X_train_tfidf, y_train_split)

# Score the held-out validation split
y_val_pred = stacking_clf.predict(X_val_tfidf)

# Pass the complete set of encoded labels explicitly. Without `labels`,
# classification_report infers them from y_true/y_pred and raises ValueError
# when that count differs from len(target_names), which happens if a
# class is absent from the (unstratified) validation split.
all_labels = label_encoder.transform(label_encoder.classes_)
print(
    classification_report(
        y_val_split,
        y_val_pred,
        labels=all_labels,
        target_names=label_encoder.classes_,
    )
)

# Refit the vectorizer and the stacking classifier on the full training data.
# Refitting the vectorizer replaces its vocabulary, so any matrix produced
# by an earlier `tfidf.transform(...)` call no longer matches the model.
X_train_full_tfidf = tfidf.fit_transform(X_train)
stacking_clf.fit(X_train_full_tfidf, y_train_encoded)

# The classifier was fitted on TF-IDF features, not raw strings, so the test
# summaries must be vectorized with the refit vectorizer before predicting.
X_test_tfidf = tfidf.transform(X_test)
n_test_rows = X_test_tfidf.shape[0]

# Predict in row batches: far fewer predict() calls than one per sample,
# while still giving the progress bar something to report.
predict_batch_size = 1000
y_test_pred_encoded = []

with tqdm(total=n_test_rows, desc="Processing", mininterval=0.1) as progress_bar:
    for start in range(0, n_test_rows, predict_batch_size):
        batch = X_test_tfidf[start:start + predict_batch_size]
        y_test_pred_encoded.extend(stacking_clf.predict(batch))
        progress_bar.update(batch.shape[0])

# Convert the integer codes back to the original severity labels
y_test_pred = label_encoder.inverse_transform(y_test_pred_encoded)

# Attach the predicted severity to each test row
test_data['severity'] = y_test_pred

# Remove the "summary" column in place so it is not written to the output
del test_data['summary']

# Write the remaining columns to CSV without the DataFrame index
test_data.to_csv('bugs-test-predictions_mlp.csv', index=False)

# Show the final table
print(test_data)