# Modules and Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from scipy.stats import randint

In [2]:
# Read both CSV files into DataFrames; the first column of each file
# becomes the row index (index_col=0).
trainset = pd.read_csv(filepath_or_buffer='incidents_labelled.csv', index_col=0)
testset = pd.read_csv(filepath_or_buffer='incidents_val.csv', index_col=0)

# Cross-validation of the Random Forest and Neural Network models

In [22]:
def _char_tfidf():
    """Build the character n-gram TF-IDF vectorizer shared by both pipelines.

    Each call returns a fresh, unfitted vectorizer so the two pipelines do
    not share state.
    """
    return TfidfVectorizer(
        strip_accents='unicode',
        analyzer='char',
        ngram_range=(2, 5),
        max_df=0.5,
        min_df=5,
    )


# TF-IDF features -> random forest with 100 trees (seeded).
text_clf_rf = Pipeline(steps=[
    ('vect', _char_tfidf()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# TF-IDF features -> MLP with a single hidden layer of 100 units (seeded).
text_clf_nn = Pipeline(steps=[
    ('vect', _char_tfidf()),
    ('clf', MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
])

In [21]:
def train_and_evaluate_with_cv(model, label, X, y, cv=5):
    """Cross-validate ``model`` and print a hold-out classification report.

    Two independent evaluations are printed:

    1. The mean and standard deviation of the macro-F1 scores returned by
       ``cross_val_score`` over ``cv`` folds of ``(X, y)``.
    2. A ``classification_report`` for a single 80/20 train/test split of
       ``(X, y)`` (``random_state=42``), after fitting ``model`` on the 80%.

    Parameters
    ----------
    model : estimator
        Object accepted by ``cross_val_score`` and exposing ``fit`` /
        ``predict``. It is fitted on the 80% training portion as a side
        effect of this call.
    label : str
        Name used only in the printed headings.
    X, y : array-like
        Inputs and targets; passed unchanged to ``cross_val_score`` and
        ``train_test_split``.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``cross_val_score``. The fold count used to be a
        hard-coded 500, which refits the model 500 times on almost the
        same data, leaves only a few samples in each validation fold (so
        per-fold macro-F1 is very noisy), and makes scikit-learn's
        stratified splitter warn or fail when classes have fewer members
        than folds.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='f1_macro')

    print(f"\nCross-validation results for {label}:")
    print(f"Mean : {cv_scores.mean():.4f}")
    print(f"Standard deviation: {cv_scores.std():.4f}")

    # Hold out 20% of the data and fit on the remaining 80% (NOT on the
    # entire dataset) so the report below is computed on unseen rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)

    # Predict on the held-out 20%
    y_pred = model.predict(X_test)

    # Print classification report
    print(f"\nClassification report for {label}:\n")
    print(classification_report(y_test, y_pred))

In [None]:
# Run the cross-validation + hold-out report helper with the MLP pipeline,
# once for each of the two '*-category' target columns.
print("Evaluating Neural Network model for each label")
for label in ('hazard-category', 'product-category'):
    train_and_evaluate_with_cv(
        model=text_clf_nn,
        label=label,
        X=trainset['title'],
        y=trainset[label],
    )

In [None]:
# Run the cross-validation + hold-out report helper with the random-forest
# pipeline, once for the 'hazard' column and once for the 'product' column.
print("Evaluating RandomForest model for each label")
for label in ('hazard', 'product'):
    train_and_evaluate_with_cv(
        model=text_clf_rf,
        label=label,
        X=trainset['title'],
        y=trainset[label],
    )


# Generating predictions and saving them

In [None]:
# Build ONE predictions frame holding all four target columns.
#
# The frame must exist before the first column is assigned and must not be
# re-created between the neural-network and random-forest passes; otherwise
# a fresh kernel raises NameError and the '*-category' columns are lost.
predictions = pd.DataFrame()

# (target column, pipeline used for it) in the order the models are fitted.
label_models = (
    ('hazard-category', text_clf_nn),
    ('product-category', text_clf_nn),
    ('hazard', text_clf_rf),
    ('product', text_clf_rf),
)

for label, model in label_models:
    # Refit on all labelled titles for this target, then predict the test titles.
    model.fit(trainset['title'], trainset[label])
    predictions[label] = model.predict(testset['title'])

In [None]:
import os
import zipfile

# Output folder: relative to the working directory by default so the notebook
# runs on any machine; set the SUBMISSION_DIR environment variable to write
# somewhere else (e.g. a Desktop folder).
output_folder = os.environ.get('SUBMISSION_DIR', 'RF')
os.makedirs(output_folder, exist_ok=True)

# Save the predictions to a CSV file inside the output folder
csv_path = os.path.join(output_folder, 'submission.csv')
predictions.to_csv(csv_path)

# Zip ONLY the CSV. Archiving the whole folder while the archive itself is
# being written into that same folder can pull the zip (or a stale one from a
# previous run) into the submission.
zip_path = os.path.join(output_folder, 'submission.zip')
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(csv_path, arcname='submission.csv')

print(f"Predictions saved to {csv_path} and zipped to {zip_path}.")

Predictions saved and zipped successfully in SemEval-Hazard folder.
