**COURSE: PRDL/MLLB**

**PROJECT: Deep Learning**

**TEACHER: Luis Hernández Gómez**

**AUTHORS: MARONE Mamadou / RACHIDI Inass**

**NOTEBOOK: CUSTOM MODEL**

# SETUP

## INSTALLING MODULES

In [1]:
%%capture
!pip install tensorflow
!pip install tqdm

## IMPORTING LIBRARIES

In [2]:
import numpy as np
import pandas as pd
import os
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, BertTokenizer, BertModel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import pipeline
from transformers import T5Tokenizer, TFAutoModelForSeq2SeqLM
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns

#  Load and Prepare Data

In [None]:
# Project root that contains the DATA/ folder. The default is the original
# author's local path; set the PRDL_PROJECT_DIR environment variable to run
# the notebook on another machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "PRDL_PROJECT_DIR",
    r"C:\Users\maron\OneDrive\02-Documents\00.ETUDES\00.ECOLE_D_INGE\00.CYCLE_ING_FORMATION_INIT\00.3EME_ANNEE_INIT\00.A_COURS\00.PRDL\06.PROJECTS",
)
os.chdir(PROJECT_DIR)

In [6]:
# Build the path with os.path.join so it resolves on any OS (the hard-coded
# backslash separators only work on Windows). The path is relative to the
# current working directory.
df_cleaned = pd.read_csv(os.path.join("DATA", "archive", "news-article-categories.csv"))
df_cleaned.head(3)

Unnamed: 0,category,title,body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...


## Select a small sample of the data to get an idea of the model's behaviour during training

In [7]:
# Distinct values of the 'category' column.
unique_categories = pd.unique(df_cleaned['category'])

# One category name per line, under a header line.
print("All Unique Categories:")
for category in iter(unique_categories):
    print(category)

All Unique Categories:
ARTS & CULTURE
BUSINESS
COMEDY
CRIME
EDUCATION
ENTERTAINMENT
ENVIRONMENT
MEDIA
POLITICS
RELIGION
SCIENCE
SPORTS
TECH
WOMEN


In [8]:
selected_categories = ['ENTERTAINMENT', 'ENVIRONMENT', 'ARTS & CULTURE']

# Number of rows kept per category in the sample; adjust as needed.
rows_per_category = 20

# Keep only the selected categories. The explicit .copy() makes df_cleaned an
# independent frame, so columns can be added to it afterwards without
# triggering pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned[df_cleaned['category'].isin(selected_categories)].copy()

# Take the first `rows_per_category` rows of each category (in the order the
# categories are listed above) and concatenate them in a single call instead
# of growing a DataFrame inside the loop.
category_samples = [
    df_cleaned[df_cleaned['category'] == category].head(rows_per_category)
    for category in selected_categories
]
result_df = (
    pd.concat(category_samples, ignore_index=True)
    if category_samples
    else pd.DataFrame()
)

# NOTE(review): `result_df` holds the per-category sample, but the cells that
# follow keep working on `df_cleaned` (every row of the selected categories).
# If the intent is to train on the small sample only, use `result_df` there.

## Preparing data

In [9]:
# Encode the labels
# Encode the category names as integer class labels.
label_encoder = LabelEncoder()

# .assign returns a new DataFrame, so the new column is never written into a
# filtered slice of the original frame (which raises SettingWithCopyWarning).
df_cleaned = df_cleaned.assign(
    encoded_labels=label_encoder.fit_transform(df_cleaned['category'])
)

In [None]:
# Split the data into features and labels
# Features are the article bodies; targets are the integer-encoded categories.
X, y = df_cleaned['body'], df_cleaned['encoded_labels']

In [None]:
# Split the data into training and testing sets
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Number of distinct classes seen by the label encoder; used as the size of
# the model's output layer.
num_classes = label_encoder.classes_.shape[0]

# MODEL

In [19]:
# Default size, in characters, of the text chunks sent to the summarizer.
max_chunk_length = 512


class CustomClassifier(BaseEstimator, TransformerMixin):
    """Text classifier that summarizes, cleans, tokenizes and then classifies.

    Each input text goes through the same steps in `fit` and `transform`:

    1. `_summarize`  - the text is cut into fixed-size character chunks and each
       chunk is summarized with a Hugging Face summarization pipeline.
    2. `_clean_text` - non-letters are removed, the text is lower-cased,
       stop words are dropped and the remaining words are stemmed.
    3. tokenization  - a Keras `Tokenizer` maps words to integer ids and the
       sequences are padded/truncated to `max_length`.

    The classifier itself is a small Keras network
    (Embedding -> Flatten -> Dense(relu) -> Dense(softmax)).

    Parameters
    ----------
    max_words : int
        Vocabulary size for the tokenizer and the embedding layer.
    max_length : int
        Length every token sequence is padded or truncated to.
    embedding_dim : int
        Output dimension of the embedding layer.
    hidden_layer_size : int
        Number of units in the hidden dense layer.

    Notes
    -----
    The output layer size is read from the module-level `num_classes`
    variable, which must be defined before the class is instantiated.
    """

    def __init__(self, max_words, max_length,  embedding_dim, hidden_layer_size):
        self.max_words = max_words
        self.max_length = max_length
        self.embedding_dim = embedding_dim
        self.hidden_layer_size = hidden_layer_size
        self.tokenizer = Tokenizer(num_words = max_words, oov_token = '<OOV>')
        self.model = self._build_model()
        self.summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", revision="a4f8f3e")
        # Chunk size actually used by _summarize; initialised from the module default.
        self.max_chunk_length = max_chunk_length
        # Built once here instead of once per word inside _clean_text.
        self.stemmer = PorterStemmer()
        self.stop_words = self._load_stop_words()

    @staticmethod
    def _load_stop_words():
        """Return the English stop words as a set, downloading the corpus if it is missing."""
        try:
            return set(stopwords.words('english'))
        except LookupError:
            # NLTK raises LookupError when the 'stopwords' corpus is not installed locally.
            nltk.download('stopwords', quiet=True)
            return set(stopwords.words('english'))

    def _build_model(self):
        """Build and compile the Keras network (relies on the global `num_classes`)."""
        model = Sequential()
        model.add(Embedding(input_dim=self.max_words, output_dim=self.embedding_dim, input_length=self.max_length))
        model.add(Flatten())
        model.add(Dense(self.hidden_layer_size, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))
        # Sparse loss: the targets are integer class ids, not one-hot vectors.
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    def _count_words(self, text):
        """Return the number of whitespace-separated words in `text`."""
        return len(text.split())

    def _summarize(self, text):
        """Summarize `text` chunk by chunk and join the chunk summaries with spaces.

        Returns an empty string when `text` is empty or not a string. A chunk
        whose summarization fails contributes an empty string and an error
        message is printed; the remaining chunks are still processed.
        """
        if not text or not isinstance(text, str):
            return ""

        # Split on a fixed number of characters so that no single call to the
        # summarizer receives an overly long input.
        chunk_size = self.max_chunk_length
        text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        summaries = []
        for chunk in text_chunks:
            # Upper length limit: 70% of the chunk's word count, capped at 100;
            # lower limit: half of the upper limit, capped at 50.
            max_length = min(int(self._count_words(chunk) * 0.7), 100)
            min_length = min(int(max_length * 0.5), 50)

            try:
                summary = self.summarizer(chunk, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
                summaries.append(summary[0]['summary_text'])
            except Exception as e:
                # Best effort: report the failure and keep going with the other chunks.
                print(f"Error summarizing chunk: {e}")
                summaries.append("")

        return ' '.join(summaries)

    def _clean_text(self, text):
        """Keep letters only, lower-case, remove stop words and stem the remaining words."""
        text = re.sub('[^a-zA-Z]', ' ', text)
        words = text.lower().split()
        words = [self.stemmer.stem(word) for word in words if word not in self.stop_words]
        return ' '.join(words)

    def _tokenize_text(self, text):
        """Return the padded id sequence of a single text, with shape (1, max_length)."""
        sequences = self.tokenizer.texts_to_sequences([text])
        padded_sequences = pad_sequences(sequences, maxlen=self.max_length, padding='post', truncating='post')
        return padded_sequences

    def _prepare_texts(self, X):
        """Summarize then clean every text in `X`; returns (summaries, cleaned texts)."""
        X_summarized = [self._summarize(article) for article in X]
        X_cleaned = [self._clean_text(article) for article in X_summarized]
        return X_summarized, X_cleaned

    def _texts_to_matrix(self, cleaned_texts):
        """Convert cleaned texts to one padded id array of shape (n_texts, max_length)."""
        sequences = self.tokenizer.texts_to_sequences(cleaned_texts)
        return pad_sequences(sequences, maxlen=self.max_length, padding='post', truncating='post')

    def fit(self, X, y):
        """Fit the tokenizer vocabulary and train the network on `X` / `y`.

        Parameters
        ----------
        X : iterable of str
            Raw texts. Non-string entries are treated as empty texts.
        y : array-like of int
            Integer class id for each text.

        Returns
        -------
        self
        """
        X_summarized, X_cleaned = self._prepare_texts(X)

        # The tokenizer must learn its vocabulary before texts_to_sequences can
        # produce any ids; without this step every sequence would be empty.
        self.tokenizer.fit_on_texts(X_cleaned)
        X_tokenized = self._texts_to_matrix(X_cleaned)

        # Per-article length report (characters, and non-padding tokens for the last value).
        for article, summarized, cleaned, tokenized in zip(X, X_summarized, X_cleaned, X_tokenized):
            original_length = len(article) if isinstance(article, str) else 0
            print(f"Original: {original_length}, Summarized: {len(summarized)}, Cleaned: {len(cleaned)}, Tokenized: {int(np.count_nonzero(tokenized))}")

        # Keras needs a single 2-D array here; a Python list of arrays would be
        # interpreted as several separate model inputs.
        self.model.fit(X_tokenized, np.asarray(y), epochs=10, batch_size=32)
        return self

    def transform(self, X):
        """Return the padded token-id array of shape (n_texts, max_length) for the texts in `X`."""
        _, X_cleaned = self._prepare_texts(X)
        return self._texts_to_matrix(X_cleaned)

    def predict(self, X):
        """Return the network's output (softmax layer) for every text in `X`."""
        X_tokenized = self.transform(X)
        return self.model.predict(X_tokenized)

In [20]:
# Example usage:
# Hyper-parameters: vocabulary size, padded sequence length, embedding width
# and hidden dense layer width.
max_words, max_length, embedding_dim, hidden_layer_size = 1000, 200, 50, 128

# Building the classifier also builds the Keras model and loads the summarizer.
simple_classifier = CustomClassifier(
    max_words=max_words,
    max_length=max_length,
    embedding_dim=embedding_dim,
    hidden_layer_size=hidden_layer_size,
)

# TRAINING

In [None]:
# Assuming num_classes is defined and X_train, y_train are available
# Train on the training split created earlier in the notebook.
simple_classifier.fit(X=X_train, y=y_train)

# EVALUATION