In [10]:
import os
import re
import joblib
import ast
import spacy
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM, Flatten, concatenate
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
#################################### Utility Functions ####################################
def secure_valid_modes(mode):
    """Validate that ``mode`` is one of the supported modes.

    The accepted modes are ``dev``, ``stg`` and ``prd``; this tries to mimic
    the format of a working code base.

    Args:
        mode: The mode name to validate.

    Raises:
        ValueError: If ``mode`` is not one of ``dev``, ``stg`` or ``prd``.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` are unaffected.
    """
    if mode not in ("dev", "stg", "prd"):
        raise ValueError(
        """The provided mode is not in [dev, stg, prd]
           Please Provide one of the above modes
        """
        )
        
def make_mode_file(mode):
    """ 
    Makes sure a directory named after the given mode exists.

    The mode is first validated with ``secure_valid_modes``. A message is
    printed telling whether the directory was created or was already present.
    """
    secure_valid_modes(mode)
    target_dir = mode

    # Guard clause: nothing to create when the directory is already there.
    if os.path.exists(target_dir):
        print(f"Directory for mode: {mode} already exists")
        return

    print(f"Making directory for mode: {mode}")
    os.mkdir(target_dir)

def stopword_and_punct_removal(text, nlp):
    """ 
    Strips stopwords and punctuation out of a given text.

    ``nlp`` is called on ``text``; every resulting token flagged by
    ``is_stop`` or ``is_punct`` is dropped and the ``text`` of the remaining
    tokens is joined with single spaces.
    """
    kept_words = []
    for token in nlp(text):
        if token.is_stop | token.is_punct:
            continue
        kept_words.append(token.text)
    return " ".join(kept_words)

def lemmatization(text, nlp):
    """ 
    Lemmatizes a text: runs ``nlp`` on it and joins the ``lemma_`` of every
    resulting token with single spaces.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return ' '.join(lemmas)

def get_vector_embeddings(text, nlp):
    """ 
    Returns the embedding of a text: the ``vector`` attribute of ``nlp(text)``
    converted to a plain Python list.
    """
    processed = nlp(text)
    embedding = processed.vector
    return embedding.tolist()

def string_list_to_array(list_obj):
    "Parses the string form of a list literal and returns it as a numpy array"
    parsed_literal = ast.literal_eval(list_obj)
    return np.array(parsed_literal)

def transform_dataframe_list_columns_to_arrays(dataframe,
                                               list_of_string_cols
                                              ):
    """
    # TODO: Remove this functionality and go directly to the function below.
    Converts each of the given dataframe columns from string-encoded lists
    to arrays, one cell at a time, via ``string_list_to_array``.
    Note that afterwards each column is still one-dimensional: every row
    holds a single array object.
    The dataframe is updated in place and returned.
    """
    for col_name in list_of_string_cols:
        string_series = dataframe[col_name]
        dataframe[col_name] = string_series.apply(string_list_to_array)

    return dataframe

def get_array_from_pandas_col_of_arrays(dataframe,
                                        column_name
                                        ):
    """
    Stacks the per-row arrays stored in one dataframe column into a single
    numpy array, i.e. of dimensionality (len(dataframe), len(array)).
    """
    per_row_values = dataframe[column_name].to_list()
    return np.array(per_row_values)
#################################### Main Objects ############################################

class Settings:
    """
    Project-wide settings: the run mode plus the locations of the
    sources/sinks directory and of the train and test CSV files inside it.
    """

    def __init__(self, mode):
        sinks_dir = './sources_sinks'

        self.mode = mode
        self.sources_sinks_path = sinks_dir
        # Both data files live directly under the sources/sinks directory.
        self.train_data_path = sinks_dir + '/train_data.csv'
        self.test_data_path = sinks_dir + '/unseen_test_data.csv'

        
class DataPreprocessing:
    """
    Defines the main functionality for preprocessing.

    Holds one spaCy pipeline for the Greek columns and one for the English
    columns, and offers dataframe-level steps: deriving ``site_title`` and
    ``theme`` from ``url``, stopword/punctuation removal, lemmatization,
    vector embedding and label encoding. Every step updates the dataframe it
    receives in place and returns it.
    """

    # Raw string: ``\.`` has to reach the regex engine as an escaped dot
    # instead of being read as an (invalid) Python string escape.
    _URL_SUFFIX_PATTERN = re.compile(r'\.gr|\.com|\.net')

    def __init__(self, mode, settings, training_mode = True):
        """
        Args:
            mode: Run mode, stored on the instance.
            settings: Object exposing ``sources_sinks_path``. When ``None``,
                a fresh ``Settings(mode=mode)`` is created instead.
            training_mode: When true, ``encode_target_labels`` fits and saves
                a new label encoder; otherwise it loads the saved one.
        """
        self.mode = mode
        # Honour the settings object handed in by the caller; fall back to a
        # default one so that passing ``None`` keeps working.
        self.settings = settings if settings is not None else Settings(mode=mode)
        self.greek_nlp = spacy.load("el_core_news_lg")
        self.eng_nlp = spacy.load("en_core_web_md")
        self.greek_columns_list = ['title' , 'content']
        self.english_columns_list = ['theme' , 'site_title']
        self.label_column = 'label'
        self.columns_to_apply_stopword_removal = ['title' , 'content']
        self.columns_to_apply_lemmatization = ['title' , 'content', 'theme' , 'site_title']
        self.columns_to_vectorize = ['title' , 'content', 'theme' , 'site_title']
        self.training_mode = training_mode
        self.label_mapping_dataframe = None

    def _get_appropriate_nlp(self, column):
        """Return the Greek pipeline for Greek columns, the English one otherwise."""
        if column in self.greek_columns_list:
            return self.greek_nlp
        return self.eng_nlp

    def create_site_titles_and_themes(self, dataset):
        """
        Replace the ``url`` column with ``site_title`` and ``theme`` columns.

        ``site_title`` is the part of the URL before the first
        ``.gr``/``.com``/``.net`` match, with ``https://``, ``http://`` and
        ``www.`` removed. ``theme`` is the text between the first and second
        ``/`` following that suffix, or an empty string when the URL has no
        path after the suffix.

        Raises:
            ValueError: If a URL contains none of the recognised suffixes.
        """
        list_of_site_titles = []
        list_of_themes = []

        # Only the url values are needed, so iterate the column directly.
        for url_str in dataset['url']:
            suffix_match = self._URL_SUFFIX_PATTERN.search(url_str)
            if suffix_match is None:
                raise ValueError(
                    f"Cannot derive site title and theme from url {url_str!r}: "
                    "expected one of the suffixes .gr, .com, .net"
                )
            url_parts = url_str.split(suffix_match.group(0))
            site_title = (url_parts[0]
                          .replace('https://', "")
                          .replace('http://', "")
                          .replace('www.', ""))
            # A URL that stops right at the suffix has no path segment.
            path_segments = url_parts[1].split('/')
            theme = path_segments[1] if len(path_segments) > 1 else ""
            list_of_site_titles.append(site_title)
            list_of_themes.append(theme)

        dataset['site_title'] = list_of_site_titles
        dataset['theme'] = list_of_themes
        dataset.drop('url', axis = 1, inplace = True)

        return dataset

    def remove_stopwords_and_punctuation(self, dataset):
        """Remove stopwords and punctuation from the configured text columns."""
        for column in self.columns_to_apply_stopword_removal:
            nlp = self._get_appropriate_nlp(column)
            dataset[column] = dataset[column].apply(lambda text:
                                                    stopword_and_punct_removal(text, nlp)
                                                    )
        return dataset

    def lemmatization(self, dataset):
        """Lemmatize the configured text columns."""
        for column in self.columns_to_apply_lemmatization:
            nlp = self._get_appropriate_nlp(column)
            # Inside a method, ``lemmatization`` resolves to the module-level
            # function, not to this method.
            dataset[column] = dataset[column].apply(lambda text:
                                                    lemmatization(text, nlp)
                                                    )
        return dataset

    def construct_embeddings(self, dataset):
        """Replace the configured text columns by their vector embeddings."""
        for column in self.columns_to_vectorize:
            nlp = self._get_appropriate_nlp(column)
            dataset[column] = dataset[column].apply(lambda text:
                                                    get_vector_embeddings(text, nlp)
                                                    )
        return dataset

    def encode_target_labels(self, dataset):
        """
        Encode the label column as integers with a ``LabelEncoder``.

        The original labels are kept in ``original_label_col``. In training
        mode a new encoder is fitted and saved under the sources/sinks
        directory; otherwise the saved encoder is loaded and applied. The
        original-to-encoded mapping is stored in ``label_mapping_dataframe``.
        """
        dataset['original_label_col'] = dataset[self.label_column]
        encoder_path = self.settings.sources_sinks_path + '/label_encoder'
        if self.training_mode:
            le = LabelEncoder()
            dataset[self.label_column] = le.fit_transform(dataset[self.label_column])
            joblib.dump(le, encoder_path, compress=9)
        else:
            le = joblib.load(encoder_path)
            # Transform the column's values, not the column's name.
            dataset[self.label_column] = le.transform(dataset[self.label_column])

        # One row per encoded label, ordered by the encoded value.
        self.label_mapping_dataframe = (dataset
                                        .sort_values(by = [self.label_column])
                                        .drop_duplicates([self.label_column])
                                        .loc[:, ["original_label_col" , self.label_column]]
                                        )

        return dataset

    def write_mapping_dataframe_to_sinks(self):
        """Write the label mapping dataframe as CSV under the sources/sinks directory."""
        # Use this instance's settings rather than a module-level ``settings``.
        self.label_mapping_dataframe.to_csv(self.settings.sources_sinks_path +
                                            '/label_mapping_dataframe.csv'
                                           )

    def write_to_sinks(self, dataset, write_path):
        """Write ``dataset`` as CSV to ``write_path``."""
        dataset.to_csv(write_path)
        
class DummyBaseline:
    """
    Baseline built on a ``DummyClassifier`` using the ``uniform`` strategy.
    It is fitted on the training split, predicts the validation split and
    writes those predictions to a CSV under the sources/sinks directory.
    """
    def __init__(self, 
                 mode, 
                 X_train,
                 X_val,
                 y_train,
                 y_val,
                 settings):

        # Run configuration.
        self.mode = mode
        self.settings = settings
        # Data splits.
        self.X_train, self.X_val = X_train, X_val
        self.y_train, self.y_val = y_train, y_val
        # Model and its (not yet computed) output.
        self.classifier = DummyClassifier(strategy="uniform")
        self.predictions = None

    def model_fit(self):
        """Fit the classifier on the training split."""
        features, targets = self.X_train, self.y_train
        self.classifier.fit(features, targets)

    def model_predict(self):
        """Predict the validation split and keep the result in ``predictions``."""
        validation_features = self.X_val
        self.predictions = self.classifier.predict(validation_features)

    def write_predictions(self):
        """Write the stored predictions as a single-column CSV."""
        predictions_frame = pd.DataFrame(self.predictions, columns = ['predictions'])
        output_path = (self.settings.sources_sinks_path +
                       '/dummy_classifier_val_predictions.csv')
        predictions_frame.to_csv(output_path)

    def execute_pipeline(self):
        """Run fit, predict and write, in that order."""
        for step in (self.model_fit, self.model_predict, self.write_predictions):
            step()
    
class SingleFeatureDecisionTreeBaseline:
    """ 
    Baseline built on a ``DecisionTreeClassifier`` with default parameters.
    It is fitted on the training split, predicts the validation split and
    writes those predictions to a CSV under the sources/sinks directory.
    """

    def __init__(self, 
                 mode, 
                 X_train,
                 X_val,
                 y_train,
                 y_val,
                 settings
                ):
        # Run configuration.
        self.mode = mode
        self.settings = settings
        # Data splits.
        self.X_train, self.y_train = X_train, y_train
        self.X_val, self.y_val = X_val, y_val
        # Model and its (not yet computed) output.
        self.classifier = DecisionTreeClassifier()
        self.predictions = None

    def model_fit(self):
        """Fit the decision tree on the training split."""
        tree = self.classifier
        tree.fit(self.X_train, self.y_train)

    def model_predict(self):
        """Predict the validation split and keep the result in ``predictions``."""
        tree = self.classifier
        self.predictions = tree.predict(self.X_val)

    def write_predictions(self):
        """Write the stored predictions as a single-column CSV."""
        predictions_frame = pd.DataFrame(self.predictions, columns = ['predictions'])
        output_path = (self.settings.sources_sinks_path +
                       '/decision_tree_val_predictions.csv')
        predictions_frame.to_csv(output_path)

    def execute_pipeline(self):
        """Run fit, predict and write, in that order."""
        pipeline_steps = (self.model_fit, self.model_predict, self.write_predictions)
        for run_step in pipeline_steps:
            run_step()
        

class MultiInputDenseDL:
    """ 
    Defines the main functionality of multi input Dense Deep Learning Classifier.
    Each of the 4 features serves as a single input, and all of them are concatenated to a
    final layer.

    All data used for fitting and predicting is the data handed to the
    constructor; no module-level variables are read.
    """

    # Feature columns, in the order in which the model inputs are declared
    # and in which the arrays are fed to ``fit`` / ``predict``.
    _FEATURE_COLUMNS = ("title", "content", "site_title", "theme")

    def __init__(self, 
                 mode, 
                 X_train,
                 X_val,
                 y_train,
                 y_val,
                 settings
                ):
        """
        Args:
            mode: Run mode, stored on the instance.
            X_train: Dataframe with one column of arrays per feature, used for fitting.
            X_val: Dataframe with the same columns, used for predicting.
            y_train: Integer class labels of the training rows.
            y_val: Integer class labels of the validation rows.
            settings: Object exposing ``sources_sinks_path``.
        """
        self.predictions = None
        self.mode = mode
        self.settings = settings
        self.X_train = X_train
        self.X_val = X_val
        # Keep the raw labels so ``preprocess_labels`` can always encode from
        # them, even when it is called more than once.
        self._raw_y_train = y_train
        self._raw_y_val = y_val
        self.y_train = y_train
        self.y_val = y_val
        self.num_labels = len(np.unique(y_train))
        # Length of each feature vector; presumably the spaCy vector size —
        # TODO confirm against the embeddings actually produced.
        self.embedding_dim = 300
        self.dense_units = 32
        self.rate = 0.5
        self.model = None
        self.epochs = 100
        self.batch_size = 128
        # ``val_accuracy`` is available because ``model_fit`` trains with a
        # validation split.
        self.callback = EarlyStopping(monitor='val_accuracy',
                                      patience=20)

    def preprocess_labels(self):
        """One-hot encode the labels given to the constructor."""
        self.y_train = to_categorical(self._raw_y_train)
        self.y_val = to_categorical(self._raw_y_val)

    def _build_feature_branch(self):
        """Create one Input -> Dense -> Dropout branch; return (input, output)."""
        branch_input = Input(shape = (self.embedding_dim, 1))
        branch = Dense(units=self.dense_units, activation="relu")(branch_input)
        branch = Dropout(rate = self.rate)(branch)
        return branch_input, branch

    def _feature_arrays(self, dataframe):
        """Return one stacked array per feature column, in model input order."""
        return [get_array_from_pandas_col_of_arrays(dataframe, column)
                for column in self._FEATURE_COLUMNS]

    def model_construct(self):
        """Build the four-branch model and store it in ``model``."""
        # One identical branch per feature column.
        branches = [self._build_feature_branch() for _ in self._FEATURE_COLUMNS]
        branch_inputs = [branch_input for branch_input, _ in branches]
        branch_outputs = [branch_output for _, branch_output in branches]
        # Concatenating layers
        merge = concatenate(branch_outputs)
        # feature maps to vector before connecting to Dense
        flatten = Flatten()(merge)
        outputs = Dense(self.num_labels, activation='softmax')(flatten)
        self.model = Model(branch_inputs, outputs)

    def model_compile(self):
        """Compile the model with categorical cross-entropy, Adam and accuracy."""
        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'],
                          )

    def model_fit(self):
        """Fit the model on this instance's training data."""
        self.model.fit(self._feature_arrays(self.X_train),
                       self.y_train,
                       validation_split=0.2,
                       epochs=self.epochs,
                       batch_size = self.batch_size,
                       callbacks = [self.callback],
                       verbose=1
                      )

    def model_predict(self):
        """Predict this instance's validation data; store the class indices."""
        probabilities = self.model.predict(self._feature_arrays(self.X_val))
        # Index of the highest-scoring class for every row.
        self.predictions = [np.argmax(row) for row in probabilities]

    def write_predictions(self):
        """Write the stored predictions as a single-column CSV."""
        pd.DataFrame(self.predictions, 
                     columns = ['predictions']).to_csv(self.settings.sources_sinks_path +
                                '/multi_dense_val_predictions.csv'
                                )

    def save_model(self):
        """Save the model under the sources/sinks directory."""
        self.model.save(self.settings.sources_sinks_path + '/dl_model.keras')

    def execute_pipeline(self):
        """Run label encoding, model build, compile, fit, predict, write and save."""
        self.preprocess_labels()
        self.model_construct()
        self.model_compile()
        self.model_fit()
        self.model_predict()
        self.write_predictions()
        self.save_model()