<a href="https://colab.research.google.com/github/The237/papers/blob/master/papers_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objet

Ce notebook contient l'ensemble du code source d'expérimentation des implémentations du projet PAPERS, un projet d'automatisation de la revue systématique.

## Class definitions

In [16]:
class Article:
    """
    Plain container describing a single article.

    Attributes:
        title (str): The title of the article.
        abstract (str): The abstract of the article.
        keywords (list): Keywords associated with the article. An empty list
            when none (or a falsy value) was supplied.
        content (str): The body of the article. An empty string when none
            (or a falsy value) was supplied.
        metadata (object): Additional metadata associated with the article,
            stored exactly as supplied. ``None`` when not supplied.

    Methods:
        get_title():
            Returns the title of the article.
        get_abstract():
            Returns the abstract of the article.
        get_keywords():
            Returns the keywords associated with the article.
        get_content():
            Returns the content of the article.

    """

    def __init__(self, title, abstract, keywords=None, content=None, metadata=None):
        """
        Initializes an instance of the Article class.

        Args:
            title (str): The title of the article.
            abstract (str): The abstract of the article.
            keywords (list, optional): A list of keywords associated with the article. Defaults to an empty list.
            content (str, optional): The content of the article. Defaults to an empty string.
            metadata (object, optional): Additional metadata associated with the article. Defaults to None.

        """
        self.title = title
        self.abstract = abstract
        # Falsy values (None, empty containers) are normalised to the
        # documented empty defaults.
        self.keywords = keywords if keywords else []
        self.content = content if content else ''
        # Keep metadata untouched: the documented default is None, and a
        # falsy-but-meaningful value (e.g. an empty dict) must not be
        # silently replaced by an empty string.
        self.metadata = metadata

    def get_title(self):
        """
        Returns the title of the article.

        Returns:
            title (str): The title of the article.

        """
        return self.title

    def get_abstract(self):
        """
        Returns the abstract of the article.

        Returns:
            abstract (str): The abstract of the article.

        """
        return self.abstract

    def get_keywords(self):
        """
        Returns the keywords associated with the article.

        Returns:
            keywords (list): A list of keywords associated with the article.

        """
        return self.keywords

    def get_content(self):
        """
        Returns the content of the article.

        Returns:
            content (str): The content of the article.

        """
        return self.content

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

class ArticleCleaner:
    """
    The ArticleCleaner class removes stopwords and punctuation from article text.

    Attributes:
        custom_stopwords (list): Extra stopwords removed in addition to the
            NLTK English stopword list. Empty list when none were supplied.

    Methods:
        clean_article(article):
            Cleans the given article by removing stopwords and punctuation.

    """

    def __init__(self, custom_stopwords=None):
        """
        Initializes an instance of the ArticleCleaner class.

        Args:
            custom_stopwords (list, optional): A list of custom stopwords to be added. Defaults to None.

        """
        self.custom_stopwords = custom_stopwords if custom_stopwords else []

    def clean_article(self, article):
        """
        Cleans the given article by removing stopwords and punctuation.

        Stopword matching is case-insensitive, for both the NLTK English list
        and the custom stopwords. Tokens that consist of a single punctuation
        character (as listed in ``string.punctuation``) are dropped. The
        remaining tokens keep their original casing and are joined with
        single spaces.

        Args:
            article (str): The text of the article to be cleaned.

        Returns:
            cleaned_article (str): The cleaned version of the article.

        """
        # Split the article text into individual word tokens. The tokenizer
        # data is only downloaded when NLTK reports it as missing.
        try:
            tokens = word_tokenize(article)
        except LookupError:
            # NOTE(review): the tokenizer resource is named 'punkt' or
            # 'punkt_tab' depending on the installed NLTK version, so both
            # are requested - confirm against the pinned NLTK release.
            nltk.download('punkt')
            nltk.download('punkt_tab')
            tokens = word_tokenize(article)

        # Load the English stopwords; download the corpus only if it is not
        # available locally instead of contacting the server on every call.
        try:
            english_stopwords = set(stopwords.words('english'))
        except LookupError:
            nltk.download('stopwords')
            english_stopwords = set(stopwords.words('english'))

        # Tokens are compared in lower case below, so the custom stopwords
        # must be lower-cased too or mixed-case entries would never match.
        stopwords_set = english_stopwords.union(
            word.lower() for word in self.custom_stopwords
        )

        # Punctuation characters to discard
        punctuation = set(string.punctuation)

        # Drop stopwords and punctuation tokens
        filtered_tokens = [
            word for word in tokens
            if word.lower() not in stopwords_set and word not in punctuation
        ]

        # Rebuild the cleaned text
        return ' '.join(filtered_tokens)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Doc2Vec
from transformers import BertTokenizer, BertModel
import torch

class ArticleVectorizer:
    """
    The ArticleVectorizer class represents a vectorizer for converting articles into feature vectors.

    Attributes:
        method (str): The vectorization method to use. Valid options are 'tfidf', 'bert', or 'doc2vec'.
        tokenizer: BERT tokenizer (only set when method is 'bert').
        model: BERT model or Doc2Vec model (only set when method is 'bert' or 'doc2vec').
        device: torch device the BERT model runs on (only set when method is 'bert').

    Methods:
        vectorize_article(article):
            Vectorizes the given article based on the specified method.

    """

    def __init__(self, method='tfidf'):
        """
        Initializes an instance of the ArticleVectorizer class.

        For 'bert', the pretrained 'bert-base-uncased' tokenizer and model are
        loaded, moved to CUDA when available (CPU otherwise) and switched to
        evaluation mode. For 'doc2vec', a model is loaded from the path
        'doc2vec_model'.

        Args:
            method (str, optional): The vectorization method to use. Defaults to 'tfidf'.

        """
        self.method = method
        if self.method == 'bert':
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            self.model = BertModel.from_pretrained('bert-base-uncased')
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.model.to(self.device)
            self.model.eval()
        elif self.method == 'doc2vec':
            # Load the previously trained Doc2Vec model
            self.model = Doc2Vec.load('doc2vec_model')

    def vectorize_article(self, article):
        """
        Vectorizes the given article based on the specified method.

        Note that with 'tfidf' a new TfidfVectorizer is fitted on this single
        article at every call, so the vocabulary (and therefore the columns of
        the result) is specific to that article; vectors obtained from
        separate calls are not directly comparable.

        Args:
            article (str): The article to be vectorized.

        Returns:
            vectorized_article (object): The vectorized representation of the article:
            the TF-IDF matrix returned by ``fit_transform`` for 'tfidf', the mean of the
            last hidden state over the token dimension (a torch tensor) for 'bert', or
            the vector returned by ``infer_vector`` for 'doc2vec'.

        Raises:
            ValueError: If ``self.method`` is not one of 'tfidf', 'bert' or 'doc2vec'.

        """
        if self.method == 'tfidf':
            # TF-IDF fitted on this single article
            vectorizer = TfidfVectorizer()
            vectorized_article = vectorizer.fit_transform([article])
        elif self.method == 'bert':
            # BERT: encode (truncated to 512 tokens) and mean-pool the last hidden state
            encoding = self.tokenizer.encode_plus(
                article,
                add_special_tokens=True,
                max_length=512,
                truncation=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)
            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)
                vectorized_article = torch.mean(outputs.last_hidden_state, dim=1)
        elif self.method == 'doc2vec':
            # Doc2Vec: infer a vector from the whitespace-split tokens
            vectorized_article = self.model.infer_vector(article.split())
        else:
            # Without this branch an unsupported method would surface as an
            # UnboundLocalError on the return statement below.
            raise ValueError(
                f"Invalid method {self.method!r}. Please specify 'tfidf', 'bert', or 'doc2vec'."
            )

        return vectorized_article

In [13]:
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances
from sklearn.metrics.pairwise import cosine_similarity

class ArticleDistanceCalculator:
    """
    The ArticleDistanceCalculator class represents a calculator for calculating the distance between two articles.

    Attributes:
        None

    Methods:
        calculate_distance(article1, article2, metric='euclidean'):
            Calculates the distance between two articles based on the specified metric.

    """

    # Metric names accepted by calculate_distance.
    _SUPPORTED_METRICS = ('euclidean', 'manhattan', 'cosine', 'cosine_similarity')

    def calculate_distance(self, article1, article2, metric='euclidean'):
        """
        Calculates the distance between two articles based on the specified metric.

        Both articles are embedded with a single TF-IDF model fitted on the
        pair, so the two vectors share one vocabulary and have the same
        number of columns.

        Args:
            article1 (str): The text of the first article.
            article2 (str): The text of the second article.
            metric (str, optional): The distance metric to use. Valid options are 'euclidean', 'manhattan',
                'cosine', or 'cosine_similarity'. Defaults to 'euclidean'. With 'cosine_similarity' the
                returned value is ``1 - similarity``.

        Returns:
            float: The distance between the two articles.

        Raises:
            ValueError: If an invalid metric is specified.

        """
        # Reject unknown metrics before doing any vectorization work.
        if metric not in self._SUPPORTED_METRICS:
            raise ValueError("Invalid metric. Please specify 'euclidean', 'manhattan', 'cosine', or 'cosine_similarity'.")

        # Local import so this block does not depend on another notebook cell
        # having been executed first.
        from sklearn.feature_extraction.text import TfidfVectorizer

        # Fit ONE vectorizer on both texts. Fitting a separate vectorizer per
        # article yields vectors over different vocabularies, which cannot be
        # compared (and generally do not even have the same width).
        tfidf_matrix = TfidfVectorizer().fit_transform([article1, article2])
        # Slices (not integer indexing) keep each row two-dimensional, as the
        # pairwise functions expect.
        vectorized_article1 = tfidf_matrix[0:1]
        vectorized_article2 = tfidf_matrix[1:2]

        # Compute the distance between the two article vectors
        if metric == 'euclidean':
            distance = euclidean_distances(vectorized_article1, vectorized_article2)[0][0]
        elif metric == 'manhattan':
            distance = manhattan_distances(vectorized_article1, vectorized_article2)[0][0]
        elif metric == 'cosine':
            distance = cosine_distances(vectorized_article1, vectorized_article2)[0][0]
        else:  # 'cosine_similarity'
            similarity = cosine_similarity(vectorized_article1, vectorized_article2)[0][0]
            distance = 1 - similarity

        return float(distance)

In [12]:
class ArticleClassifier:
    """
    Labels an article by comparing a distance value against a threshold.

    Attributes:
        None

    Methods:
        classify_article(distance, threshold):
            Returns the class label for the given distance and threshold.

    """

    def classify_article(self, distance, threshold):
        """
        Returns the class label for the given distance and threshold.

        Args:
            distance (float): The distance between the article and a reference.
            threshold (float): The cut-off value used for the comparison.

        Returns:
            str: "Classe A" when the distance is less than or equal to the
            threshold, "Classe B" otherwise.

        """
        # Guard clause: at or below the threshold means class A.
        if distance <= threshold:
            return "Classe A"
        return "Classe B"

## Class tests