# Arabic Multilabel Text Classification Notebook

# In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Read the dataset
file_path = 'path_to_your_file/data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Typical preprocessing steps for a text dataset are listed below. No separate
# preprocessing step is performed in this script: the raw 'Articles' column is
# handed directly to the TF-IDF vectorizer.
# - Cleaning: Removing any unnecessary characters, punctuation, or special symbols.
# - Tokenization: Breaking down the text into individual words or tokens.
# - Normalization: Converting text to lowercase to ensure consistency in the text data.
# - Removing stop words: Eliminating common words that do not carry much information.
# - Stemming or Lemmatization: Reducing words to their base or root form to reduce complexity.

# Multi-label binarization
# Convert each sample's collection of labels into one binary indicator row.
# NOTE(review): MultiLabelBinarizer expects every entry of df['Labels'] to be
# an iterable of labels. If the CSV stores the labels of a row as one string,
# each character would be treated as a separate label -- confirm the column
# format and parse it first if needed.
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['Labels'])

# Splitting the dataset
# Split the raw texts (not the TF-IDF matrix) so that the vectorizer below can
# be fitted on the training portion only. random_state=0 keeps the split
# reproducible.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['Articles'], y, test_size=0.2, random_state=0
)

# Vectorization and feature extraction
# TF-IDF (Term Frequency-Inverse Document Frequency) converts text into
# numerical feature vectors. The vocabulary and IDF weights are learned from
# the training texts only; fitting on the full corpus would leak information
# about the test documents into the features and inflate the scores.
tfidf = TfidfVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 3))
X_train = tfidf.fit_transform(texts_train)
X_test = tfidf.transform(texts_test)

# Feature matrix for the whole dataset, kept under its original name for any
# code that still refers to it. It is not used for training or evaluation.
X = tfidf.transform(df['Articles'])

# Machine learning models
# Base estimators; each one is wrapped in a one-vs-rest scheme further down.
sgd = SGDClassifier()
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC()

# Evaluation metrics
# Define evaluation metrics to assess the performance of the classification models.
def j_score(y_true, y_pred, zero_division=0.0):
    """Return the mean per-sample Jaccard score as a percentage.

    For every row, the score is the size of the intersection of the true and
    predicted label sets divided by the size of their union.

    Args:
        y_true: 2-D binary indicator array of ground-truth labels.
        y_pred: 2-D binary indicator array of predicted labels, same shape
            as ``y_true``.
        zero_division: Score assigned to a row whose union is empty (no true
            and no predicted labels), where the ratio is undefined. Use 1.0
            to count such rows as perfect matches.

    Returns:
        The average of the per-row scores multiplied by 100.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)

    # Pre-fill with the fallback value and divide only where the union is
    # non-empty; a plain division would yield NaN for 0/0 rows and turn the
    # whole mean into NaN.
    scores = np.full(union.shape, float(zero_division))
    np.divide(intersection, union, out=scores, where=union != 0)
    return scores.mean() * 100

def print_score(y_pred, clf):
    """Print an estimator's class name and its Jaccard score.

    Args:
        y_pred: Predicted binary indicator matrix; it is scored against the
            module-level ``y_test``.
        clf: Estimator whose class name is printed as the report header.
    """
    estimator_name = clf.__class__.__name__
    print("Clf: ", estimator_name)

    # j_score already returns a percentage value.
    jaccard_percent = j_score(y_test, y_pred)
    print("Jaccard score: {}".format(jaccard_percent))

    print('----')


# Training and evaluation
# Wrap each base estimator in a one-vs-rest scheme (one binary classifier per
# label column), fit it on the training split and report its Jaccard score on
# the test split.
for classifier in (sgd, lr, svc):
    # fit() returns the fitted estimator, so construction and training chain.
    clf = OneVsRestClassifier(classifier).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # The base estimator is passed so the report shows its class name rather
    # than "OneVsRestClassifier".
    print_score(y_pred, classifier)

# Output recorded from a previous run (scores are percentages, rounded here to two decimals):
# Clf:  SGDClassifier
# Jaccard score: 74.26%
# ----
# Clf:  LogisticRegression
# Jaccard score: 56.44%
# ----
# Clf:  LinearSVC
# Jaccard score: 75.74%
# ----

# Additional evaluation and reporting
# Refit the SGD-based one-vs-rest model and print its decoded predictions
# next to the decoded ground-truth labels for a slice of the test set.
classifier = sgd
clf = OneVsRestClassifier(classifier)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# inverse_transform maps indicator rows back to label tuples.
# NOTE(review): the [1:20] slice shows samples 1..19 and skips sample 0 --
# confirm that this is intended.
predicted_labels = multilabel.inverse_transform(y_pred)[1:20]
print(f"pred: {predicted_labels}\n")
actual_labels = multilabel.inverse_transform(y_test)[1:20]
print(f"real: {actual_labels}\n")

print_score(y_pred, classifier)


# Classification report
# Per-label precision/recall/F1 for the most recent predictions in y_pred.
# The display names are taken from the fitted binarizer so that they line up
# one-to-one with the columns of y_test / y_pred; hard-coded placeholder names
# fail whenever their count differs from the number of label columns.
labels = [str(label) for label in multilabel.classes_]
print(classification_report(y_test, y_pred, target_names=labels))
