# Modules and Dataset

In [1]:
import pandas as pd
from scipy.stats import randint
from scipy.stats import uniform
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Read both CSV files, using the first column of each file as the index.
# 'incidents_labelled.csv' becomes `trainset`, 'incidents_val.csv' becomes `testset`.
trainset, testset = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('incidents_labelled.csv', 'incidents_val.csv')
)

# RandomizedSearchCV for the MLP

In [24]:
# Two-step text pipeline: character n-gram TF-IDF features ('vect') feeding a
# multi-layer perceptron ('clf'). Only the seed and iteration cap are fixed
# here; every other MLP hyperparameter stays at its default.
text_clf_mlp = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            min_df=5,
            max_df=0.5,
            strip_accents='unicode',
        ),
    ),
    (
        'clf',
        MLPClassifier(max_iter=500, random_state=42),
    ),
])

In [None]:
# Parameter distribution for RandomizedSearchCV over the MLP pipeline.
# Keys use the '<step>__<param>' form, so 'clf__...' targets the 'clf' step.
# NOTE: scipy.stats.uniform takes (loc, scale), not (low, high): it samples
# from [loc, loc + scale]. `uniform` must be imported from scipy.stats.
param_dist_mlp = {
    'clf__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  # One or two hidden layers of 50/100 units
    'clf__activation': ['tanh', 'relu'],  # Activation functions to explore
    'clf__solver': ['adam', 'sgd'],  # Solvers to explore
    'clf__alpha': uniform(0.0001, 0.01),  # Regularization parameter, sampled from [0.0001, 0.0101]
    'clf__learning_rate': ['constant', 'adaptive'],  # Learning rate schedule (per sklearn docs, only used by the 'sgd' solver)
    'clf__learning_rate_init': uniform(0.0001, 0.1)  # Initial learning rate, sampled from [0.0001, 0.1001]
}

In [None]:
def train_and_evaluate_mlp(label):
    """Tune the MLP text pipeline for one target column and report hold-out metrics.

    Holds out 20% of ``trainset`` (fixed seed), runs a 50-candidate randomized
    search with 5-fold cross-validation scored by macro F1 on the remaining
    80%, then prints the best parameters and a classification report for the
    held-out rows.

    Parameters
    ----------
    label : str
        Name of the ``trainset`` column to predict from ``trainset['title']``.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object, so callers can reuse ``best_estimator_`` and
        ``best_params_`` instead of losing the tuned model. Callers that
        ignore the return value behave exactly as before.
    """
    # Split the trainset into training and hold-out sets
    X_train, X_test, y_train, y_test = train_test_split(
        trainset['title'], trainset[label], test_size=0.2, random_state=42
    )

    # RandomizedSearchCV instance for hyperparameter tuning
    random_search_mlp = RandomizedSearchCV(
        text_clf_mlp,
        param_distributions=param_dist_mlp,
        n_iter=50,
        cv=5,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='f1_macro',
    )

    # Fit the model on the training data with hyperparameter tuning
    random_search_mlp.fit(X_train, y_train)

    # Predict on the hold-out set; with the default refit=True this uses the
    # best candidate refitted on all of X_train
    y_pred = random_search_mlp.predict(X_test)

    # Print best parameters and classification report
    print(f"Best parameters for {label}: {random_search_mlp.best_params_}\n")
    print(f"Classification report for {label}:\n")
    print(classification_report(y_test, y_pred))

    return random_search_mlp


In [None]:
# Run the MLP hyperparameter search once for each of the two '-category' columns
mlp_targets = ('hazard-category', 'product-category')
for label in mlp_targets:
    train_and_evaluate_mlp(label)

# RandomizedSearchCV for the Random Forest

In [22]:
# Two-step text pipeline: character n-gram TF-IDF features ('vect') feeding a
# random forest ('clf') that starts out with 100 trees and a fixed seed.
text_clf_rf = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            min_df=5,
            max_df=0.5,
            strip_accents='unicode',
        ),
    ),
    (
        'clf',
        RandomForestClassifier(random_state=42, n_estimators=100),
    ),
])

In [None]:
# Search space for the random-forest pipeline; each keyword is '<step>__<param>',
# so every entry targets the 'clf' step. scipy's randint(low, high) draws
# integers from low up to, but not including, high.
param_dist = dict(
    clf__n_estimators=randint(100, 500),     # number of trees: 100..499
    clf__max_depth=[None, 10, 20, 30, 40],   # None means unlimited depth
    clf__min_samples_split=randint(2, 10),   # samples needed to split a node: 2..9
    clf__min_samples_leaf=randint(1, 10),    # samples needed at a leaf: 1..9
    clf__bootstrap=[True, False],            # with or without bootstrap sampling
)

In [21]:
def train_and_evaluate(label):
    """Tune the random-forest text pipeline for one target column and report hold-out metrics.

    Holds out 20% of ``trainset`` (fixed seed), runs a 50-candidate randomized
    search with 5-fold cross-validation scored by macro F1 on the remaining
    80%, then prints the best parameters and a classification report for the
    held-out rows.

    Parameters
    ----------
    label : str
        Name of the ``trainset`` column to predict from ``trainset['title']``.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object, so callers can reuse ``best_estimator_`` and
        ``best_params_`` instead of losing the tuned model. Callers that
        ignore the return value behave exactly as before.
    """
    # Split the trainset into training and hold-out sets
    X_train, X_test, y_train, y_test = train_test_split(
        trainset['title'], trainset[label], test_size=0.2, random_state=42
    )

    # RandomizedSearchCV instance for hyperparameter tuning
    random_search = RandomizedSearchCV(
        text_clf_rf,
        param_distributions=param_dist,
        n_iter=50,
        cv=5,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='f1_macro',
    )

    # Fit the model on the training data with hyperparameter tuning
    random_search.fit(X_train, y_train)

    # Predict on the hold-out set; with the default refit=True this uses the
    # best candidate refitted on all of X_train
    y_pred = random_search.predict(X_test)

    # Print best parameters and classification report
    print(f"Best parameters for {label}: {random_search.best_params_}\n")
    print(f"Classification report for {label}:\n")
    print(classification_report(y_test, y_pred))

    return random_search

In [None]:
# Run the random-forest hyperparameter search once for each of these two columns
rf_targets = ('hazard', 'product')
for label in rf_targets:
    train_and_evaluate(label)

# Generating predictions and saving them

In [None]:
# Build a single predictions frame holding all four target columns.
# The frame is created once, before any column is written: previously the
# first loop wrote into `predictions` before it existed, and the second cell
# re-created it, which would have discarded the '-category' columns.
predictions = pd.DataFrame()

# NOTE(review): the original referenced `text_clf_nn`, which is not defined
# anywhere in this notebook; `text_clf_mlp` is the only neural-network
# pipeline here, so it is assumed to be the intended model - confirm.
for label in ('hazard-category', 'product-category'):
    text_clf_mlp.fit(trainset['title'], trainset[label])
    predictions[label] = text_clf_mlp.predict(testset['title'])

for label in ('hazard', 'product'):
    text_clf_rf.fit(trainset['title'], trainset[label])
    predictions[label] = text_clf_rf.predict(testset['title'])

# NOTE(review): both pipelines are fitted here with the hyperparameters given
# at construction, not the best ones found by the randomized searches above.

In [None]:
import os
import zipfile
from shutil import make_archive  # no longer used in this cell; kept in case other cells rely on it

# Destination folder. Defaults to the original hard-coded location; set the
# SUBMISSION_DIR environment variable to write somewhere else (e.g. on a
# machine where this Windows path does not exist).
output_folder = os.environ.get('SUBMISSION_DIR', r'C:\Users\Sushovit\Desktop\RF')
os.makedirs(output_folder, exist_ok=True)

# Save the predictions to a CSV file inside the output folder
csv_path = os.path.join(output_folder, 'submission.csv')
predictions.to_csv(csv_path)

# Zip only the CSV. Archiving the whole folder into a zip that lives inside
# that same folder would also pick up a previous submission.zip and any
# unrelated files stored there.
zip_path = os.path.join(output_folder, 'submission')
zip_file = zip_path + '.zip'
with zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(csv_path, arcname='submission.csv')

# Report the real locations rather than a fixed folder name
print(f"Predictions saved to {csv_path} and zipped to {zip_file}.")

Predictions saved and zipped successfully in SemEval-Hazard folder.
