In [None]:
# Imports
import pickle


import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import f1_score

from analyzer import data_cleaner
from analyzer.data_transformation import TfidfDataTransformer, BagOfWordsTransformer, DataTransformer

from imblearn.under_sampling import TomekLinks,RandomUnderSampler, CondensedNearestNeighbour,EditedNearestNeighbours


In [None]:
# Pipeline helpers: load/clean data, balance classes, fit models, predict and score

def load_and_clean_data(filepath, vectorizer_output='models/vectorizer.sav', **kwargs) -> "tuple[pd.DataFrame, object, DataTransformer]":
    """Load a CSV of raw texts, clean them and vectorise them.

    The cleaned text is written to a new ``clean_text`` column of the loaded
    frame; the original ``text`` column is left untouched.

    Parameters
    ----------
    filepath
        Passed straight to ``pd.read_csv``. The file must provide a ``text`` column.
    vectorizer_output
        Path the transformer's ``vectorizer`` attribute is pickled to after the
        transformation. Pass ``None`` to skip saving.
    **kwargs
        Forwarded to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df, df_tfidf, tr)``: the frame with the added ``clean_text`` column,
        the output of ``TfidfDataTransformer.transform`` on that column, and
        the transformer instance itself.
    """
    # Load data
    df = pd.read_csv(filepath, **kwargs)

    # Clean data
    # Remove @ mentions
    df['clean_text'] = np.vectorize(data_cleaner.clean_mentions)(df['text'])
    # Replace non-alphabetic characters (except '#') with spaces.
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would silently remove nothing.
    df['clean_text'] = df['clean_text'].str.replace("[^a-zA-Z#]", " ", regex=True)
    # Remove short words (two characters or fewer)
    df['clean_text'] = df['clean_text'].apply(
        lambda text: ' '.join(word for word in text.split() if len(word) > 2)
    )

    # Transform
    tr = TfidfDataTransformer()
    # Stemming
    df['clean_text'] = tr.stemming(df['clean_text'])
    df_tfidf = tr.transform(df['clean_text'])

    # Persist the vectorizer; the context manager guarantees the handle is
    # flushed and closed even if pickling fails.
    if vectorizer_output is not None:
        with open(vectorizer_output, 'wb') as fh:
            pickle.dump(tr.vectorizer, fh)

    return df, df_tfidf, tr

def balance_data(X, y, balancer=None) -> tuple:
    """Resample ``X`` and ``y`` with ``balancer`` and return the resampled pair.

    Parameters
    ----------
    X, y
        Features and labels, passed unchanged to ``balancer.fit_resample``.
    balancer
        Any object exposing ``fit_resample(X, y)``. When omitted, a
        ``RandomUnderSampler(sampling_strategy='not minority', random_state=1337)``
        is used.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)`` as returned by ``balancer.fit_resample``.
    """
    if balancer is None:
        # Build the default sampler per call: a default-argument instance is
        # created once at definition time and would be shared (and refit)
        # across every call.
        balancer = RandomUnderSampler(sampling_strategy='not minority', random_state=1337)
    X_balanced, y_balanced = balancer.fit_resample(X, y)
    return X_balanced, y_balanced



def fit_model(X, y, model_type, model_output='../models/model.sav', **kwargs):
    """Instantiate ``model_type``, fit it on ``(X, y)`` and optionally pickle it.

    Parameters
    ----------
    X, y
        Training features and labels, passed to ``model.fit``.
    model_type
        Callable (typically an estimator class) invoked as ``model_type(**kwargs)``.
    model_output
        Path the fitted model is pickled to. Pass ``None`` to skip saving.
        NOTE(review): this default points at ``../models`` while
        ``load_and_clean_data`` defaults to ``models/`` - confirm which
        directory is intended.
    **kwargs
        Constructor arguments for ``model_type``.

    Returns
    -------
    The fitted model instance.
    """
    model = model_type(**kwargs)
    model.fit(X, y)
    if model_output is not None:
        # Context manager so the handle is flushed and closed deterministically
        # instead of being left to the garbage collector.
        with open(model_output, 'wb') as fh:
            pickle.dump(model, fh)
    return model

# Class names of models whose class probabilities (rather than hard labels)
# should be returned by predict().
proba_models = {'LogisticRegression'}


def predict(model, data):
    """Return predictions of ``model`` for ``data``.

    Models whose class name is listed in ``proba_models`` are queried with
    ``predict_proba``; every other model is queried with ``predict``.
    """
    if type(model).__name__ in proba_models:
        return model.predict_proba(data)
    return model.predict(data)

def metrics(predictions, true_values, output_file, name_prefix="", threshold=0.3):
    """Score predictions against the true labels and log the result.

    Parameters
    ----------
    predictions
        Either a 2-D array of class probabilities (as returned by
        ``predict_proba``) or a 1-D array of hard labels (as returned by
        ``predict``). For 2-D input, column 1 is compared against
        ``threshold`` - this assumes a binary problem whose positive class
        is in column 1; TODO confirm for other label sets.
    true_values
        Ground-truth labels aligned with ``predictions``.
    output_file
        Path of a text file the scores are appended to, one line per call.
        Pass ``None`` to skip writing.
    name_prefix
        Optional label written in front of the scores on the output line.
    threshold
        Probability cut-off for the positive class (only used for 2-D input).

    Returns
    -------
    dict
        ``{'f1_score': <float>}``.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        # Probability matrix: threshold column 1 into 0/1 labels.
        predicted_labels = (predictions[:, 1] >= threshold).astype(int)
    else:
        # Already hard labels (models outside proba_models use .predict()).
        predicted_labels = predictions

    # Calculate metrics
    scores = {
        'f1_score': f1_score(true_values, predicted_labels)
    }
    # TODO: optionally plot the results.

    # Write to output file. Append mode, so repeated calls (e.g. one per
    # hyper-parameter combination) accumulate instead of overwriting.
    if output_file is not None:
        line = f'{name_prefix}: {scores}' if name_prefix else str(scores)
        with open(output_file, 'a') as f:
            f.write(line + '\n')
    return scores

In [None]:
def __model_tuning__(X_train, y_train, X_valid, y_valid, model_type,params, output_file, data_label=""):
    """Fit and score ``model_type`` for every combination in ``params``.

    Each combination produced by ``ParameterGrid(params)`` is fitted on the
    training split, pickled to its own ``.sav`` file in the current working
    directory, evaluated on the validation split with ``metrics`` and logged
    to ``output_file``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_valid, y_valid
        Validation features and labels used for scoring.
    model_type
        Estimator class; instantiated with each parameter combination.
    params
        Parameter grid specification accepted by ``ParameterGrid``.
    output_file
        Passed through to ``metrics``.
    data_label
        Free-form tag embedded in the saved model file names.

    Returns
    -------
    list
        One ``(parameters, metrics_dict)`` tuple per grid point, in grid order.
    """
    # Use the bare class name: formatting the class itself yields
    # "<class '...'>", which is not a usable file name.
    model_name = getattr(model_type, '__name__', str(model_type))
    # One timestamp per tuning run, without characters (':' / ' ') that are
    # problematic in file names.
    run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')

    results = []
    for p in ParameterGrid(params):
        # Build the file name per iteration so every grid point gets its own file.
        param_tag = '_'.join(f'{key}={value}' for key, value in sorted(p.items()))
        model_output = f'{model_name}-{param_tag}-{data_label}-{run_stamp}.sav'
        # Parameters must be unpacked as keyword arguments for the constructor.
        model = fit_model(X_train, y_train, model_type, model_output, **p)
        predictions = predict(model, X_valid)
        output = metrics(predictions, y_valid, output_file)
        results.append((p, output))
    # Return only after the whole grid has been evaluated.
    return results


In [None]:
# Load the training CSV, clean the text and vectorise it.
df, df_tfidf, tr = load_and_clean_data('data/train.csv')

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_tfidf,
    df['output'],
    test_size=0.3,
    random_state=42,
)

# Resample only the training split with the default balancer; the validation
# split is left as-is.
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


