# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Text preprocessing applied to every bug summary before vectorization
def preprocess_text(text):
    """Return a normalized (lower-cased) version of *text*.

    This is a placeholder for the real preprocessing logic; it currently
    only lower-cases its input.

    Args:
        text: The raw text. Normally a ``str``; ``None`` and float NaN
            (how pandas represents a missing CSV cell) are accepted and
            treated as an empty document. Any other value is converted
            with ``str()`` before being lower-cased.

    Returns:
        The lower-cased string, or ``""`` for a missing value.
    """
    if isinstance(text, str):
        return text.lower()
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower()

# Load the labelled training data and the unlabelled test data
train_data = pd.read_csv('bugs-train.csv')
test1_data = pd.read_csv('bugs-test.csv')

# Apply preprocessing. A missing summary is read by pandas as NaN (a float),
# which has no .lower() and is rejected by TfidfVectorizer, so treat missing
# summaries as empty documents.
train_data['summary_clean'] = train_data['summary'].fillna('').apply(preprocess_text)
test1_data['summary_clean'] = test1_data['summary'].fillna('').apply(preprocess_text)

# Vectorize the text data; the vocabulary is learned from the training
# summaries only and then reused for the test summaries.
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(train_data['summary_clean'])
X_test1_tfidf = tfidf.transform(test1_data['summary_clean'])

# Hold out a validation set BEFORE any resampling. Resampling first would put
# under-sampled / synthetic rows into the validation set, so the reported
# metrics would not reflect the real class distribution.
X_train_split, X_validation, y_train_split, y_validation = train_test_split(
    X_train_tfidf, train_data['severity'], test_size=0.25, random_state=42
)

# Apply NearMiss undersampling to the training partition only. With its
# default sampling strategy NearMiss under-samples every class except the
# minority one.
nm1 = NearMiss(version=1)
X_undersampled, y_undersampled = nm1.fit_resample(X_train_split, y_train_split)

# Apply SMOTE oversampling to the undersampled training data.
# NOTE(review): after NearMiss with default settings the classes should
# already be balanced, so SMOTE probably adds no samples here - confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_undersampled, y_undersampled)

# The resampled training partition is what the model is fitted on.
X_train_resampled, y_train_resampled = X_resampled, y_resampled

# Train SVM model with the best parameters.
# NOTE(review): gamma is only used by the rbf/poly/sigmoid kernels, so it has
# no effect with kernel='linear'; probability=True is only needed if
# predict_proba is called somewhere else and makes fitting slower.
best_svm = SVC(C=1, gamma=0.01, kernel='linear', probability=True)
best_svm.fit(X_train_resampled, y_train_resampled)

# Predict severity labels for the untouched validation data
y_pred_validation = best_svm.predict(X_validation)

# Evaluate the model on validation data (class-frequency-weighted averages)
precision = precision_score(y_validation, y_pred_validation, average='weighted')
recall = recall_score(y_validation, y_pred_validation, average='weighted')
f1 = f1_score(y_validation, y_pred_validation, average='weighted')
accuracy = accuracy_score(y_validation, y_pred_validation)

# Print evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Accuracy:", accuracy)

# Predict severity labels for test data
test_predictions = best_svm.predict(X_test1_tfidf)

# Create a DataFrame with bug_id and predicted severity
final_predictions = pd.DataFrame({
    'bug_id': test1_data['bug_id'],
    'severity': test_predictions
})

# Save the final predictions to a CSV file (no index column)
final_predictions.to_csv('final_predictions.csv', index=False)