In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Loading and Pre-processing the Data

In [5]:
# Loading dataset
df = pd.read_csv('/content/TRAIN DATASET (1).csv')


# 1. Data Preprocessing
def clean_text(text):
    if isinstance(text, str):
        # Removing BOM if present
        text = text.lstrip('\ufeff')

        # Fixing encoding issues by ignoring problematic characters
        text = text.encode('latin1', errors='ignore').decode('utf-8', errors='ignore')

        # Removing special characters (non-alphanumeric characters) except for Spanish letters
        text = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑ0-9\s]', '', text)

        # Converting to lowercase and strip whitespace
        text = text.lower().strip()
    return text

# Applying the cleaning function to the DataFrame
df['text'] = df['text'].apply(clean_text)

df.head()

Unnamed: 0,id,text,labels
0,train_0,presentamos el caso de un paciente de 44 aos c...,[0 0 0 0]
1,train_1,se describe el caso clnico de un escolar del s...,[0 0 1 0]
2,train_2,un hombre de 36 aos lleg al servicio de urgenc...,[0 0 0 0]
3,train_3,mujer de 21 aos natural de la india residente ...,[0 0 0 0]
4,train_4,presentamos el caso de una paciente mujer de 1...,[0 0 0 0]


## Vectorizing the *Data*

In [3]:
# Vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)  # Using the top 5000 features
X = vectorizer.fit_transform(df['text'])

## Fixing Label's format

In [4]:
def fix_label_format(label):
    # Ensuring the label is treated as a string and replace spaces with commas
    label_str = str(label)
    label_str = label_str.replace(' ', ',').replace('[,', '[').replace(',]', ']')
    return label_str

# Applying the function to the labels column
df['labels'] = df['labels'].apply(fix_label_format)

## Applying Label Encoding on Data

In [6]:
# Initializing the label encoder
encoder = LabelEncoder()

# Fitting and transform the label encoder on the labels_str column
df['labels'] = encoder.fit_transform(df['labels'])

In [7]:
df.head()

Unnamed: 0,id,text,labels
0,train_0,presentamos el caso de un paciente de 44 aos c...,0
1,train_1,se describe el caso clnico de un escolar del s...,2
2,train_2,un hombre de 36 aos lleg al servicio de urgenc...,0
3,train_3,mujer de 21 aos natural de la india residente ...,0
4,train_4,presentamos el caso de una paciente mujer de 1...,0


## Splitting the Data

In [18]:
# Initializing StratifiedKFold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterating over the splits
for train_index, test_index in kf.split(X, y):
    # Using the indices directly for slicing sparse matrices
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]



## Applying Model and Making Predictions

In [31]:
rf_model = RandomForestClassifier()

# Training the model
rf_model.fit(X_train, y_train)

# Making predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluating the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'MultinomialNB Accuracy: {accuracy_rf}')

MultinomialNB Accuracy: 0.77


## Evaluation Metrics

In [32]:
# 7. Evaluation Metrics
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Hamming Loss:", hamming_loss(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.77      1.00      0.87        75
           1       0.00      0.00      0.00         5
           2       1.00      0.20      0.33        10
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.77       100
   macro avg       0.22      0.15      0.15       100
weighted avg       0.67      0.77      0.68       100

Hamming Loss: 0.25


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Making Predictions on Test data

In [34]:
# Loading the test dataset
test_df = pd.read_csv('/content/TEST DATASET (1).csv')

# Applying the text cleaning function to the test dataset
test_df['text'] = test_df['text'].apply(clean_text)

# Vectorizing the test data using the same TF-IDF vectorizer
X_test_data = vectorizer.transform(test_df['text'])

# Making predictions on the test data
y_pred_test = rf_model.predict(X_test_data)

# If the model outputs labels encoded as integers, convert them back to the original labels
y_pred_test_labels = encoder.inverse_transform(y_pred_test)

# Optionally, if you want to see the predicted labels for each instance
test_df['predicted_labels'] = y_pred_test_labels

# Displaying the predictions
print(test_df[['text', 'predicted_labels']].head())

                                                text predicted_labels
0  mujer de trece aos que presenta un cuadro de f...        [0 0 0 0]
1  motivo de consulta\r\ngonalgia bilateral y rig...        [0 0 0 0]
2  anamnesis\r\nvarn de 54 aos exfumador hiperten...        [0 0 0 0]
3  varn de 70 aos acude con fiebre de origen desc...        [0 0 0 0]
4  varn de 86 aos que acude a urgencias por deter...        [0 0 0 0]


In [36]:
# Adding the predicted labels to the test dataframe in a new column 'pred'
test_df['pred'] = y_pred_test_labels

# Saving the updated dataframe to a new CSV file
test_df.to_csv('TEST_DATASET_with_predictions.csv', index=False)

print("Predictions saved successfully to TEST_DATASET_with_predictions.csv")

Predictions saved successfully to TEST_DATASET_with_predictions.csv
