In [37]:
# Imports
import pickle
import os

from re import subn
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from analyzer import data_cleaner
from analyzer.data_transformation import TfidfDataTransformer, BagOfWordsTransformer, DataTransformer

from imblearn.under_sampling import TomekLinks,RandomUnderSampler, CondensedNearestNeighbour,EditedNearestNeighbours

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB



In [38]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings('ignore')
# Additionally ignore scikit-learn ConvergenceWarning explicitly.
# NOTE(review): the recorded output still shows solver convergence messages;
# presumably they come from worker processes (n_jobs=-1) that do not inherit
# these filters — confirm.
warnings.simplefilter('ignore', category=ConvergenceWarning)


In [39]:
# Helper functions: load/clean data, balance classes, fit models, predict, compute metrics

def load_and_clean_data(filepath, rename_dict=None, vectorizer_output='models/vectorizer.sav', **kwargs) -> (pd.DataFrame, object, DataTransformer):
    """Load a CSV of texts, clean and stem them, and fit a TF-IDF vectorizer.

    Args:
        filepath: Path of the CSV file, handed to ``pd.read_csv``.
        rename_dict: Optional ``{old_name: new_name}`` column mapping applied
            right after loading. After renaming, the frame must contain a
            ``text`` column.
        vectorizer_output: Path where the fitted vectorizer is pickled.
            Pass ``None`` to skip saving.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        A 3-tuple ``(df, df_tfidf, tr)``: the loaded frame with an added
        ``clean_text`` column, the output of ``tr.transform`` on that column,
        and the fitted ``TfidfDataTransformer``.
    """
    # Load data
    df = pd.read_csv(filepath, **kwargs)

    if rename_dict is not None:
        df = df.rename(columns=rename_dict)

    # Clean data
    # Remove @ mentions
    df['clean_text'] = np.vectorize(data_cleaner.clean_mentions)(df['text'])
    # Replace every character that is not a letter or '#' with a space.
    # regex=True is required: since pandas 2.0 the default is a literal
    # match, which would leave the text untouched.
    df['clean_text'] = df['clean_text'].str.replace("[^a-zA-Z#]", " ", regex=True)
    # Remove short words (two characters or fewer)
    df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

    # Transform
    tr = TfidfDataTransformer()
    # Stemming
    df['clean_text'] = tr.stemming(df['clean_text'])

    tr.vectorizer_fit(df['clean_text'])
    df_tfidf = tr.transform(df['clean_text'])

    # Persist the fitted vectorizer; the context manager closes the file
    # even if pickling fails.
    if vectorizer_output is not None:
        with open(vectorizer_output, 'wb') as fh:
            pickle.dump(tr.vectorizer, fh)

    return df, df_tfidf, tr

def balance_data(X, y, balancer=None) -> tuple:
    """Resample ``X``/``y`` with ``balancer`` and return the balanced pair.

    Args:
        X: Feature matrix passed to ``balancer.fit_resample``.
        y: Target labels passed to ``balancer.fit_resample``.
        balancer: Object exposing ``fit_resample(X, y)``. When ``None`` a fresh
            ``RandomUnderSampler(sampling_strategy='not minority',
            random_state=1337)`` is created for this call, so no sampler
            instance is shared between calls.

    Returns:
        ``(X_balanced, y_balanced)`` as returned by ``fit_resample``.
    """
    if balancer is None:
        # Built per call instead of once at definition time (mutable default).
        balancer = RandomUnderSampler(sampling_strategy='not minority', random_state=1337)
    X_balanced, y_balanced = balancer.fit_resample(X, y)
    return X_balanced, y_balanced



def fit_model(X, y, model_type, params, model_output='../models/model.sav'):
    """Instantiate ``model_type(**params)``, fit it on ``X``/``y`` and optionally pickle it.

    Args:
        X: Training features passed to ``model.fit``.
        y: Training labels passed to ``model.fit``.
        model_type: Callable (typically an estimator class) that accepts the
            entries of ``params`` as keyword arguments.
        params: Dict of keyword arguments for ``model_type``.
        model_output: Path the fitted model is pickled to; ``None`` skips
            saving.
            NOTE(review): the default points at ``../models`` while other
            code in this notebook writes to ``models/`` — confirm.

    Returns:
        The fitted model instance.
    """
    model = model_type(**params)
    model.fit(X, y)
    if model_output is not None:
        # Context manager guarantees the file handle is closed.
        with open(model_output, 'wb') as fh:
            pickle.dump(model, fh)
    return model

# Class names of models for which predict() returns class probabilities.
proba_models = {'LogisticRegression', 'BernoulliNB'}
def predict(model, data):
    """Return predictions of ``model`` for ``data``.

    Models whose class name is listed in ``proba_models`` are queried with
    ``predict_proba``; every other model is queried with ``predict``.
    """
    if type(model).__name__ in proba_models:
        return model.predict_proba(data)
    return model.predict(data)

def metrics(predictions, true_values, name_prefix="", plot=True, threshold=0.3):
    """Score predictions, optionally plot the confusion matrix, and save the scores.

    Args:
        predictions: Either a 2-D array of class probabilities (column 1 is
            compared against ``threshold``) or a 1-D array of hard labels,
            which is what ``predict`` returns for models without
            probabilities.
        true_values: Ground-truth labels.
        name_prefix: Prefix of the result file
            ``out/results/{name_prefix}_out.out``.
        plot: When true, display the confusion matrix.
        threshold: Probability cut-off for the positive class. Defaults to
            0.3, the value that was previously hard-coded.

    Returns:
        Dict with the keys ``'f1_score'`` and ``'confusion_matrix'``.
    """
    scores = np.asarray(predictions)
    if scores.ndim == 2:
        predictions_int = scores[:, 1] >= threshold
    else:
        # Already hard labels; indexing column 1 would raise IndexError.
        predictions_int = scores

    # Calculate metrics (named `results` so the function name is not shadowed)
    results = {
        'f1_score': f1_score(true_values, predictions_int),
        'confusion_matrix': confusion_matrix(true_values, predictions_int)
    }

    if plot:
        ConfusionMatrixDisplay.from_predictions(true_values, predictions_int)
        plt.show()

    # Write to output file; create the directory on a fresh checkout.
    os.makedirs('out/results', exist_ok=True)
    with open(f'out/results/{name_prefix}_out.out', 'w') as f:
        f.write(str(results))
    return results

In [46]:
def clean_folders():
    """Delete saved artifacts from earlier runs.

    Removes every ``*.sav`` file in ``models`` and every ``*.out`` file in
    ``out/results``. Other files are left alone. A folder that does not exist
    is skipped, since there is nothing to clean in it.
    """
    for folder, end in [('models', '.sav'), ('out/results', '.out')]:
        if not os.path.isdir(folder):
            continue
        for name in os.listdir(folder):
            if name.endswith(end):
                os.remove(os.path.join(folder, name))

In [41]:
def __model_tuning__(X_train, y_train, X_valid, y_valid, model_type,params, output_file, data_label="", verbose=False, plot=False):
    """Fit and score ``model_type`` for every combination in a parameter grid.

    For each combination the model is fitted on the training data, pickled to
    ``models/<run name>.sav``, used to predict on the validation data, and
    scored with ``metrics`` (which also writes a result file).

    Args:
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.
        model_type: Estimator class; its ``__name__`` is used in file names.
        params: Dict mapping parameter names to lists of values, expanded
            with ``ParameterGrid``.
        output_file: Currently unused; kept so existing calls keep working.
        data_label: Free-text tag for the data variant, used in file names
            and log lines.
        verbose: Print progress and per-combination results.
        plot: Forwarded to ``metrics``.

    Returns:
        List with one ``metrics`` result per parameter combination, in grid
        order.
    """
    # '$' is a placeholder that is filled with the parameter string per run.
    output_name = f'{model_type.__name__}-$-{data_label}-{datetime.now().timestamp()}'
    outputs = []
    if verbose:
        print(f'Starting tuning of {model_type.__name__} with data labeled: {data_label}')
        print('===============')
    parameter_grid = ParameterGrid(params)
    for i, p in enumerate(parameter_grid):
        # Compact, file-name friendly rendering of the parameter dict.
        param_string = subn("[{}',:]","","".join(str(p).split()))[0]
        if verbose:
            print(f'Params {i+1}/{len(parameter_grid)}: {p}')
        # Build the names from the template on every iteration. Previously the
        # template itself was overwritten, so after the first combination the
        # placeholder was gone and every later model overwrote the first file.
        run_name = output_name.replace('$', param_string)
        model_output = f'models/{run_name}.sav'
        model = fit_model(X_train, y_train, model_type, params=p, model_output=model_output)
        predictions = predict(model, X_valid)
        output = metrics(predictions, y_valid, run_name, plot=plot)
        outputs.append(output)
        if verbose:
            print(f'Output: {output}')
    if verbose:
        print('Tuning ended')
        print('===============')
    return outputs


In [47]:
# Remove model pickles (models/*.sav) and result files (out/results/*.out)
# left behind by earlier runs.
# NOTE(review): this cell's recorded execution count (47) is higher than the
# tuning cell's (43), so as saved it appears to have run after tuning —
# confirm the intended order.
clean_folders()

In [43]:
# Estimator classes paired with the hyper-parameter grid explored for each.
models = [
    (LogisticRegression, {
        'penalty': ['none', 'l2'],
        'class_weight': [None,'balanced'],
        'n_jobs': [-1],
        'max_iter': [400]
    }),
    (BernoulliNB, {
        'alpha': [0, 0.5, 1],
        'fit_prior': [True, False],
        'binarize': [None]

    })

]

verbose = True
random_state = 1337

# Loading, cleaning, vectorizing, splitting and balancing do not depend on the
# model being tuned, so they are done once instead of once per model.
df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)

# Results per model name, so earlier models are not lost when the loop moves on.
tuning_results = {}
for model, param_grid in models:
    if verbose:
        print('=====================')
        print(f'=={model.__name__}==')
        print('=====================')

    # Tune once on the under-sampled training set and once on the original one;
    # both are validated on the same untouched validation split.
    output = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model, param_grid, 'lin_reg_1' ,'bal', verbose=verbose)
    output2 = __model_tuning__(X_train, y_train, X_valid, y_valid, model, param_grid, 'lin_reg_1' ,'unba', verbose=verbose)
    tuning_results[model.__name__] = {'bal': output, 'unba': output2}


==LogisticRegression==
Starting tuning of LogisticRegression with data labeled: bal
Params 1/4: {'class_weight': None, 'max_iter': 400, 'n_jobs': -1, 'penalty': 'none'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Output: {'f1_score': 0.31820895522388065, 'confusion_matrix': array([[6772, 2137],
       [ 147,  533]])}
Params 2/4: {'class_weight': None, 'max_iter': 400, 'n_jobs': -1, 'penalty': 'l2'}
Output: {'f1_score': 0.25910612325260873, 'confusion_matrix': array([[5168, 3741],
       [  22,  658]])}
Params 3/4: {'class_weight': 'balanced', 'max_iter': 400, 'n_jobs': -1, 'penalty': 'none'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Output: {'f1_score': 0.31820895522388065, 'confusion_matrix': array([[6772, 2137],
       [ 147,  533]])}
Params 4/4: {'class_weight': 'balanced', 'max_iter': 400, 'n_jobs': -1, 'penalty': 'l2'}
Output: {'f1_score': 0.25910612325260873, 'confusion_matrix': array([[5168, 3741],
       [  22,  658]])}
Tuning ended
Starting tuning of LogisticRegression with data labeled: unba
Params 1/4: {'class_weight': None, 'max_iter': 400, 'n_jobs': -1, 'penalty': 'none'}
Output: {'f1_score': 0.552821997105644, 'confusion_matrix': array([[8589,  320],
       [ 298,  382]])}
Params 2/4: {'class_weight': None, 'max_iter': 400, 'n_jobs': -1, 'penalty': 'l2'}
Output: {'f1_score': 0.5508771929824561, 'confusion_matrix': array([[8763,  146],
       [ 366,  314]])}
Params 3/4: {'class_weight': 'balanced', 'max_iter': 400, 'n_jobs': -1, 'penalty': 'none'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Output: {'f1_score': 0.4015918958031837, 'confusion_matrix': array([[7380, 1529],
       [ 125,  555]])}
Params 4/4: {'class_weight': 'balanced', 'max_iter': 400, 'n_jobs': -1, 'penalty': 'l2'}
Output: {'f1_score': 0.35243798118049613, 'confusion_matrix': array([[6700, 2209],
       [  62,  618]])}
Tuning ended
==BernoulliNB==
Starting tuning of BernoulliNB with data labeled: bal
Params 1/6: {'alpha': 0, 'binarize': None, 'fit_prior': True}
Output: {'f1_score': 0.3077694235588973, 'confusion_matrix': array([[6213, 2696],
       [  66,  614]])}
Params 2/6: {'alpha': 0, 'binarize': None, 'fit_prior': False}
Output: {'f1_score': 0.3077694235588973, 'confusion_matrix': array([[6213, 2696],
       [  66,  614]])}
Params 3/6: {'alpha': 0.5, 'binarize': None, 'fit_prior': True}
Output: {'f1_score': 0.2743787847149718, 'confusion_matrix': array([[5457, 3452],
       [  23,  657]])}
Params 4/6: {'alpha': 0.5, 'binarize': None, 'fit_prior': False}
Output: {'f1_score': 0.2743787847149718, 'confus