### 3rd assignment-ST2

This notebook trains two Random Forest classifiers on TF-IDF features extracted from incident titles: one predicts the hazard and the other predicts the product.

In [None]:
import torch
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# --- Data -------------------------------------------------------------------
# Read the train/validation splits and discard any row that lacks a title or
# either of the two labels.
df_train = (
    pd.read_csv("Data/incidents_train.csv")
    .dropna(subset=["title", "product", "hazard"])
)
df_valid = (
    pd.read_csv("Data/incidents_valid.csv")
    .dropna(subset=["title", "product", "hazard"])
)

# --- Features ---------------------------------------------------------------
# TF-IDF over the titles, with the vocabulary capped at 5000 terms. The
# vocabulary is learned from the training titles only; the validation titles
# are projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(df_train["title"])
X_valid = vectorizer.transform(df_valid["title"])

# --- Targets ----------------------------------------------------------------
y_train_product, y_train_hazard = df_train["product"], df_train["hazard"]
y_valid_product, y_valid_hazard = df_valid["product"], df_valid["hazard"]

# --- Models -----------------------------------------------------------------
# One independent forest per target; both share the same feature matrix and
# the same fixed seed. `fit` returns the estimator itself, so construction and
# training are chained.
product_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train_product
)
hazard_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train_hazard
)

# --- Validation predictions -------------------------------------------------
y_pred_product = product_rf.predict(X_valid)
y_pred_hazard = hazard_rf.predict(X_valid)

# --- Validation metrics -----------------------------------------------------
# Plain accuracy plus macro-averaged F1 for each target.
product_accuracy, hazard_accuracy = (
    accuracy_score(y_valid_product, y_pred_product),
    accuracy_score(y_valid_hazard, y_pred_hazard),
)
f1_product, f1_hazard = (
    f1_score(y_valid_product, y_pred_product, average="macro"),
    f1_score(y_valid_hazard, y_pred_hazard, average="macro"),
)

# Report every metric on its own line.
print(
    f"Product Accuracy: {product_accuracy:.4f}",
    f"Hazard Accuracy: {hazard_accuracy:.4f}",
    f"Product F1 Score: {f1_product:.4f}",
    f"Hazard F1 Score: {f1_hazard:.4f}",
    sep="\n",
)

# Single summary number: the unweighted mean of the two macro-F1 scores.
combined_f1 = (f1_product + f1_hazard) / 2
print(f"Combined F1 Score: {combined_f1:.4f}")

In [None]:
# Read the test split, keeping only rows that have a title and both labels.
df_test = (
    pd.read_csv("Data/incidents_test.csv")
    .dropna(subset=["title", "product", "hazard"])
)

# Encode the test titles with the already-fitted vectorizer (no re-fitting).
X_test = vectorizer.transform(df_test["title"])

# Ground-truth labels for the test rows.
y_test_product, y_test_hazard = df_test["product"], df_test["hazard"]

# Predictions from the two previously trained forests.
y_pred_product_test = product_rf.predict(X_test)
y_pred_hazard_test = hazard_rf.predict(X_test)

# Accuracy and macro-averaged F1 per target. These names were also used for
# the validation metrics, so running this cell overwrites those values.
product_accuracy, hazard_accuracy = (
    accuracy_score(y_test_product, y_pred_product_test),
    accuracy_score(y_test_hazard, y_pred_hazard_test),
)
f1_product, f1_hazard = (
    f1_score(y_test_product, y_pred_product_test, average="macro"),
    f1_score(y_test_hazard, y_pred_hazard_test, average="macro"),
)

# Final report: a header followed by one metric per line.
print(
    "\n--- Final Test Set Evaluation ---",
    f"Product Accuracy: {product_accuracy:.4f}",
    f"Hazard Accuracy: {hazard_accuracy:.4f}",
    f"Product F1 Score: {f1_product:.4f}",
    f"Hazard F1 Score: {f1_hazard:.4f}",
    sep="\n",
)

# Overall score: the unweighted mean of the two macro-F1 values.
combined_f1 = (f1_product + f1_hazard) / 2
print(f"Combined F1 Score: {combined_f1:.4f}")

In [None]:
# Assemble the submission table: each test title next to the product and
# hazard predicted for it. No evaluation metrics are stored here.
prediction_columns = {
    "Title": df_test["title"],
    "Predicted_Product": y_pred_product_test,
    "Predicted_Hazard": y_pred_hazard_test,
}
df_predictions = pd.DataFrame(prediction_columns)

# Write the table to disk without the index column, then confirm.
df_predictions.to_csv("submission_st2.csv", index=False)
print("Test predictions saved to 'submission_st2.csv'.")
