In [1]:
import pickle
import os

from re import subn
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, average_precision_score, accuracy_score
import matplotlib.pyplot as plt

from analyzer import data_cleaner
from analyzer.data_transformation import TfidfDataTransformer, BagOfWordsTransformer, DataTransformer

from imblearn.under_sampling import TomekLinks, RandomUnderSampler, CondensedNearestNeighbour,EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor



In [2]:
# Run this cell if you want to ignore warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.linalg import LinAlgWarning
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=LinAlgWarning, module='sklearn')
warnings.simplefilter('ignore', category=ConvergenceWarning)


In [3]:
def load_and_clean_data(filepath, rename_dict=None, vectorizer_output='models/vectorizer.sav', tr=None, **kwargs) -> (pd.DataFrame, DataTransformer):
    # Load data
    df = pd.read_csv(filepath, **kwargs)
    # Take only first n rows from df

    df = df.head(30000)


    if rename_dict is not None:
        df.rename(columns=rename_dict, inplace=True)
    #display(df.head(5))
    # Clean data
        # Remove @ mentions
    df['clean_text'] = np.vectorize(data_cleaner.clean_mentions)(df['text'])
        # Remove non alfabet chars
    df['clean_text'] = df['clean_text'].str.replace("[^a-zA-Z#]", " ")
        # Remove short words
    df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    # Transform
    #tr = TfidfDataTransformer()
    if tr is None:
        tr = BagOfWordsTransformer()
    # Stemming
    df['clean_text'] = tr.stemming(df['clean_text'])

    tr.vectorizer_fit(df['clean_text'])
    df_tfidf = tr.transform(df['clean_text'])
    pickle.dump(tr.vectorizer, open(vectorizer_output, 'wb'))


    return df, df_tfidf, tr

def balance_data(X, y, balancer = RandomUnderSampler(sampling_strategy='not minority',random_state=1337)) -> pd.DataFrame:
    X_balanced, y_balanced = balancer.fit_resample(X, y)
    return X_balanced, y_balanced



def fit_model(X, y, model_type, params, model_output='../models/model.sav', ):
    model = model_type(**params)
    model.fit(X,y)
    if model_output is not None:
        pickle.dump(model, open(model_output, 'wb'))
    return model

proba_models = set(['LogisticRegression', 'BernoulliNB', 'MLPClassifier', 'ComplementNB'])
def predict(model, data):
    model_name = type(model).__name__
    model_type = 'probabilistic' if model_name in proba_models else 'other'
    if model_name in proba_models:
        predictions = model.predict_proba(data)
    else:
        predictions = model.predict(data)
    return predictions, model_type

def metrics(predictions, true_values, name_prefix="", plot=True, model_class='probabilistic', output_df=None, params_full=''):

    if model_class == 'probabilistic':
        predictions_int =  predictions[:,1]>=0.5
        predictions = predictions[:,1]
    else:
        predictions_int = predictions >= 0.5
    # Calculate metrics
    tn, fp, fn, tp = confusion_matrix(true_values, predictions_int).ravel()
    metrics = {
        'f1_score': f1_score(true_values, predictions_int),
        #'confusion_matrix': confusion_matrix(true_values, predictions_int),
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'roc_auc_score': roc_auc_score(true_values, predictions),
        'average_precision_score': average_precision_score(true_values, predictions),
        'accuracy_score': accuracy_score(true_values, predictions_int),
        'name': name_prefix,
        'params_full': params_full

    }
    # Plot?? mby
    if plot:
        ConfusionMatrixDisplay.from_predictions(true_values, predictions_int)
        plt.show()
    # Write to outputfile.
    if output_df is None:
        output_df = pd.DataFrame.from_records(metrics, index=[0])
    else:
        output_df = output_df.append(pd.DataFrame.from_dict(metrics, orient='index').T)
    with open(f'out/results/{name_prefix}_out.out', 'w') as f:
        f.write(str(metrics))
    return metrics, output_df

In [4]:
def clean_folders():
    for folder, end in [('models', '.sav'), ('out/results', '.out')]:
        filelist = [ f for f in os.listdir(folder) if f.endswith(end) ]
        for f in filelist:
            os.remove(os.path.join(folder, f))

In [5]:
def analyze_sentence(text, model_file, vectorizer_file):
    text = pd.Series([text])
    #text = np.vectorize(data_cleaner.clean_mentions)(text)
    # Remove non alfabet chars
    text = text.str.replace("[^a-zA-Z#]", " ")
    # Remove short words
    text = text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    # Transform
    tr = TfidfDataTransformer(vectorizer_file)
    # Stemming
    text = tr.stemming(text)
    df_tfidf = tr.transform(text)
    model = pickle.load(open(f'{model_file}', 'rb'))
    return model.predict_proba(df_tfidf)


In [6]:
def __model_tuning__(X_train, y_train, X_valid, y_valid, model_type,params, output_df, data_label="", verbose=False, plot=False):
    output_name = f'{model_type.__name__}-$-{data_label}'
    _model_output = f'models/{output_name}.sav'
    outputs = []
    if verbose:
        print(f'Starting tuning of {model_type.__name__} with data labeled: {data_label}')
        print('===============')
    parameter_grid = ParameterGrid(params)
    for i, p in enumerate(parameter_grid):
        param_string = subn("[{}',:]","","".join(str(p).split()))[0]
        if verbose:
            print(f'Params {i+1}/{len(parameter_grid)}: {p}')
        model_output = _model_output.replace('$',param_string)
        model = fit_model(X_train, y_train, model_type, params=p, model_output=model_output)
        predictions, model_class = predict(model, X_valid)
        print(predictions)
        output, output_df = metrics(predictions, y_valid, output_name.replace('$',param_string),plot=plot, model_class=model_class, output_df=output_df, params_full=str(p))
        outputs.append(output)
        print(f'Outputdf: {output_df.shape}')
        if verbose:
            print(f'Output: {output}')
    if verbose:
        print(f'Tuning ended')
        print('===============')
    return outputs, output_df


In [7]:
clean_folders()

In [8]:
verbose = True
random_state = 1337

In [9]:
models = [
    ( KNeighborsRegressor, {
            'n_neighbors' : [3, 5, 10, 15, 20, 25, 30],
            'weights' : ['uniform', 'distance'],
            'leaf_size' : [ 40, 50, 60],
            'n_jobs' : [-1]
        }),
    ( LinearSVR, {
        'loss' : ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'C' : [1, 10, 0.5],
        'epsilon' : [0.1, 0.01],
    }),
    (DecisionTreeRegressor,{
        'criterion' : ['squared_error', 'poisson'],
        'splitter' : ['best'],
        'max_depth' : [None, 10, 20, 30, 40, 50],
        'min_samples_split' : [2, 5],
    }),
    (LogisticRegression, {
        'penalty': ['none', 'l2', 'l1'],
        'C': [1, 0.5, 5],
        'class_weight': [None,'balanced'],
        'n_jobs': [-1],
        'solver': ['saga']
    }),
    (BernoulliNB, {
        'alpha' : np.arange(0.0, 1.0, 0.2),
        'fit_prior': [True, False],
        'binarize': [None]

    }),
    (MLPClassifier, {
        'hidden_layer_sizes': [(10,), (10, 10), (10, 10, 10), (50,), (50, 20), (100,), (100,50)],
        'activation': [ 'logistic'],
        'alpha': [ 0.001]
    }),
    (ComplementNB,{
        'alpha' : np.arange(0.0, 1.0, 0.2),
        'fit_prior': [True, False],
        'norm': [True, False],
    }),
    (GaussianNB,
     {
         'var_smoothing': [1,0.5,0.01, 2, 5, 10, 20, 50, 100]
     })



]

vectorizers = [
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=2000, stop_words='english')), '-tfidf'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1,2))), '-tfidf-ngram'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1,3))), '-tfidf-ngram3'),
    (BagOfWordsTransformer(vectorizer=CountVectorizer(max_features=2000, stop_words='english')), 'bow'),
    (BagOfWordsTransformer(vectorizer=CountVectorizer(max_features=2000, stop_words='english', ngram_range=(1,2))), '-bow-ngram'),
    (BagOfWordsTransformer(vectorizer=CountVectorizer(max_features=2000, stop_words='english', ngram_range=(1,3))), '-bow-ngram3'),
]
clean_folders()
output_df = None


for vectorizer, vectorizer_label in vectorizers[1:2]:
    if verbose:
        print('=====================')
        print(f'=={vectorizer.__class__.__name__}==')
        print('=====================')
   #df, df_tfidf, tr_old = load_and_clean_data('data/train.csv',vectorizer_output=f'models/vecs/{vectorizer_label}.sav', tr=vectorizer, rename_dict={'tweet':'text', 'label':'output'})
    df, df_tfidf, tr_old = load_and_clean_data('data/train2nonneg.csv',vectorizer_output=f'models/vecs/{vectorizer_label}.sav', tr=vectorizer, rename_dict={'clean_comment':'text', 'category':'output'})
    X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
    X_train_balanced, y_train_balanced = balance_data(X_train, y_train)
    for model, param_grid in models[1:2]:
        if verbose:
            print('=====================')
            print(f'=={model.__name__}==')
            print('=====================')

       # df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
        #X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
       # X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


        output, output_df = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model, param_grid, output_df ,data_label=vectorizer_label, verbose=verbose)
output_df.to_csv(f'out/results/Results.csv')
    #output2 = __model_tuning__(X_train, y_train, X_valid, y_valid, model, param_grid, 'lin_reg_1' ,'unba', verbose=verbose)


==TfidfDataTransformer==
==LinearSVR==
Starting tuning of LinearSVR with data labeled: -tfidf-ngram
Params 1/12: {'C': 1, 'epsilon': 0.1, 'loss': 'epsilon_insensitive'}
[1.1664251  0.98774041 0.57985456 ... 0.08642415 0.49651568 0.04322998]
Outputdf: (1, 9)
Output: {'f1_score': 0.6912727272727273, 'tn': 3634, 'fp': 1149, 'fn': 549, 'tp': 1901, 'roc_auc_score': 0.846456455046999, 'average_precision_score': 0.7493377376135256, 'accuracy_score': 0.7652426379095811, 'name': 'LinearSVR-C1epsilon0.1lossepsilon_insensitive--tfidf-ngram'}
Params 2/12: {'C': 1, 'epsilon': 0.1, 'loss': 'squared_epsilon_insensitive'}
[1.13264958 0.76382272 0.52010434 ... 0.17684802 0.5292982  0.09590325]
Outputdf: (2, 9)
Output: {'f1_score': 0.7176104086494411, 'tn': 3734, 'fp': 1049, 'fn': 492, 'tp': 1958, 'roc_auc_score': 0.873907674715297, 'average_precision_score': 0.7902620215207197, 'accuracy_score': 0.7869487073137011, 'name': 'LinearSVR-C1epsilon0.1losssquared_epsilon_insensitive--tfidf-ngram'}
Params 3/1

In [10]:
output_df.to_csv(f'out/results/Results.csv')

In [16]:
model1_grid= {
    'penalty': ['l2'],
    'class_weight': [None],
    'n_jobs': [-1],
    'solver': ['saga'],
}
model1 = LogisticRegression
df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
#df, df_tfidf, tr_old = load_and_clean_data('data/train2nonneg.csv',vectorizer_output=f'models/vecs/{vectorizer_label}.sav', tr=TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1,2))), rename_dict={'clean_comment':'text', 'category':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)

balancers = [TomekLinks(sampling_strategy='not minority'),  RandomUnderSampler(sampling_strategy='not minority'),CondensedNearestNeighbour(sampling_strategy='not minority')]
output_df = None
for i, b in enumerate(balancers):
    print(f'processing{i}/3')
    X_train_balanced, y_train_balanced = balance_data(X_train, y_train, b)
    output, output_df = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model1, model1_grid, 'lin_reg_1' ,'bal', verbose=False)

output_df.to_csv(f'out/results/ResultsBalancers.csv')


processing0/3
[[0.02114711 0.97885289]
 [0.63102568 0.36897432]
 [0.68653856 0.31346144]
 ...
 [0.86946618 0.13053382]
 [0.42822183 0.57177817]
 [0.89918324 0.10081676]]


AttributeError: 'str' object has no attribute 'append'

In [21]:
for r in results:
    print(r)

Balancer TomekLinks(sampling_strategy='majority') : {'f1_score': 0.389544150462225, 'confusion_matrix': array([[7063, 1846],
       [  69,  611]], dtype=int64), 'roc_auc_score': 0.9374362013297856, 'average_precision_score': 0.6739797660178359}
Balancer TomekLinks(sampling_strategy='not minority') : {'f1_score': 0.389544150462225, 'confusion_matrix': array([[7063, 1846],
       [  69,  611]], dtype=int64), 'roc_auc_score': 0.9374362013297856, 'average_precision_score': 0.6739797660178359}
Balancer RandomUnderSampler() : {'f1_score': 0.24647364513734224, 'confusion_matrix': array([[4865, 4044],
       [  16,  664]], dtype=int64), 'roc_auc_score': 0.9288538358434629, 'average_precision_score': 0.6292684522647592}


In [14]:

test = 'bad bad i hate this ugly thingy libtard'
test2 = 'i like this its beautiful, nice job'
m = 'LogisticRegression-class_weightNonen_jobs-1penaltynonesolvernewton-cholesky-bal-1680105414.123568.sav'
analyze_sentence(test2, model_file=f'models/{m}', vectorizer_file='models/vectorizer.sav')


array([[0.73975098, 0.26024902]])

In [20]:
import nltk

df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


classifier = nltk.classify.SklearnClassifier(LogisticRegression())
# Transform two arrays (X_train_balanced, y_train_balanced) into a list of tuples
data = list(zip(X_train_balanced, y_train_balanced))
valid_data = list(zip(X_valid, y_valid))
classifier.train(data)
accuracy = nltk.classify.accuracy(classifier, valid_data)
print(f"{accuracy:.2%}")


AttributeError: 'int' object has no attribute 'items'

In [15]:
# Load Results.csv as dataframe
df = pd.read_csv('out/results/Resultsnewdata.csv')
# Sort by roc_auc_score
df = df.sort_values(by=['roc_auc_score'], ascending=False)
# Print top 10
print(f'ROC AUC Score')
display(df[['roc_auc_score', 'name']].head(10))
# Sort by f1_score
df = df.sort_values(by=['f1_score'], ascending=False)
# Print top 10
print(f'F1 Score')
print(df[['f1_score', 'name']].head(10))

ROC AUC Score


Unnamed: 0,roc_auc_score,name
25,0.902774,LogisticRegression-class_weightNonen_jobs-1pen...
27,0.902774,LogisticRegression-class_weightbalancedn_jobs-...
24,0.876077,LogisticRegression-class_weightNonen_jobs-1pen...
26,0.876077,LogisticRegression-class_weightbalancedn_jobs-...
40,0.870099,MLPClassifier-activationlogisticalpha0.001hidd...
38,0.86919,MLPClassifier-activationlogisticalpha0.001hidd...
39,0.861656,MLPClassifier-activationlogisticalpha0.001hidd...
41,0.860947,MLPClassifier-activationlogisticalpha0.001hidd...
43,0.857408,MLPClassifier-activationlogisticalpha0.001hidd...
37,0.856201,BernoulliNB-alpha0.8binarizeNonefit_priorFalse...


F1 Score
    f1_score                                               name
25  0.753450  LogisticRegression-class_weightNonen_jobs-1pen...
27  0.753450  LogisticRegression-class_weightbalancedn_jobs-...
24  0.713455  LogisticRegression-class_weightNonen_jobs-1pen...
26  0.713455  LogisticRegression-class_weightbalancedn_jobs-...
40  0.707308  MLPClassifier-activationlogisticalpha0.001hidd...
38  0.706912  MLPClassifier-activationlogisticalpha0.001hidd...
39  0.697590  MLPClassifier-activationlogisticalpha0.001hidd...
41  0.695243  MLPClassifier-activationlogisticalpha0.001hidd...
37  0.692946  BernoulliNB-alpha0.8binarizeNonefit_priorFalse...
36  0.692946  BernoulliNB-alpha0.8binarizeNonefit_priorTrue-...


In [10]:
vectorizers = [
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(stop_words='english')), '-tfidf'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(stop_words='english', ngram_range=(1,2))), '-tfidf-ngram'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(stop_words='english', ngram_range=(1,3))), '-tfidf-ngram3'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(stop_words='english', ngram_range=(1,4))), '-tfidf-ngram4'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(stop_words='english', ngram_range=(2,2))), '-tfidf-ngram22'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer( stop_words='english', ngram_range=(3,3))), '-tfidf-ngram33'),
]
model = ComplementNB
param_grid = {
    'alpha' : [0.8],
    'fit_prior' : [True],
    'norm' : [False]
}

params = {
    'ngram_range' : [(1,1),(1,2),(1,3),(1,4)],
    'max_features' : [100,200,400,800,1000,2000,4000,8000,10000,20000],
}

output_df = None
parameter_grid = ParameterGrid(params)
for p in parameter_grid:
    vectorizer = BagOfWordsTransformer(vectorizer=CountVectorizer(stop_words='english', **p))
    vectorizer_label = f'tfidf-{p["ngram_range"][0]}-{p["ngram_range"][1]}-{p["max_features"]}'
    df, df_tfidf, tr_old = load_and_clean_data('data/train.csv',vectorizer_output=f'models/vecs/svr--{vectorizer_label}.sav', tr=vectorizer, rename_dict={'tweet':'text', 'label':'output'})
    #df, df_tfidf, tr_old = load_and_clean_data('data/train2nonneg.csv',vectorizer_output=f'models/vecs/{vectorizer_label}.sav', tr=vectorizer, rename_dict={'clean_comment':'text', 'category':'output'})
    X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
    #X_train_balanced, y_train_balanced = balance_data(X_train, y_train)
    output, output_df = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model, param_grid, output_df ,data_label=vectorizer_label, verbose=verbose)
output_df.to_csv(f'out/results/ResultsComplementNBwithoutbalancing.csv')

Starting tuning of ComplementNB with data labeled: tfidf-1-1-100
Params 1/1: {'alpha': 0.8, 'fit_prior': True, 'norm': False}
[[0.99760305 0.00239695]
 [0.8289084  0.1710916 ]
 [0.67484247 0.32515753]
 ...
 [0.96965688 0.03034312]
 [0.5        0.5       ]
 [0.19182125 0.80817875]]
Outputdf: (1, 9)
Output: {'f1_score': 0.26130434782608697, 'tn': 5590, 'fp': 3319, 'fn': 79, 'tp': 601, 'roc_auc_score': 0.8056658666384952, 'average_precision_score': 0.22724014065712836, 'accuracy_score': 0.645635624152675, 'name': 'ComplementNB-alpha0.8fit_priorTruenormFalse-tfidf-1-1-100'}
Tuning ended
Starting tuning of ComplementNB with data labeled: tfidf-1-2-100
Params 1/1: {'alpha': 0.8, 'fit_prior': True, 'norm': False}
[[0.99710674 0.00289326]
 [0.81980337 0.18019663]
 [0.61741504 0.38258496]
 ...
 [0.96572898 0.03427102]
 [0.5        0.5       ]
 [0.17307218 0.82692782]]
Outputdf: (2, 9)
Output: {'f1_score': 0.2513931888544892, 'tn': 5353, 'fp': 3556, 'fn': 71, 'tp': 609, 'roc_auc_score': 0.793566

In [None]:
f1_win = 'SVR-C10epsilon0.01gamma0.001kernelrbf--tfidf-ngram'
acc_win = 'KNeighborsRegressor-leaf_size40n_jobs-1n_neighbors10weightsdistance--tfidf-ngram'

In [None]:
df, df_tfidf, tr_old = load_and_clean_data('data/train.csv',vectorizer_output=f'models/vecs/v1.sav', tr=vectorizer, rename_dict={'tweet':'text', 'label':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)
for model, param_grid in models:
    if verbose:
        print('=====================')
        print(f'=={model.__name__}==')
        print('=====================')

    # df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
    #X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
    # X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


    output, output_df = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model, param_grid, output_df ,data_label=vectorizer_label, verbose=verbose)

In [None]:
models = [
     ( LinearSVR, {
         'loss' : ['epsilon_insensitive',
                   'squared_epsilon_insensitive'],
         'C' : [100, 10, 1.0, 0.1, 0.001],
         'epsilon' : [0.1, 0.01, 0, 1,2],
         'fit_intercept' : [True, False],
     }),
    (LogisticRegression, {
        'penalty': ['none', 'l2', 'l1'],
        'C': [100,50,20, 10,5, 1.0, 0.1, 0.01, 0.001],
        'class_weight': [None,'balanced'],
        'n_jobs': [-1],
        'solver': ['saga'],
    }),
]

#df, df_tfidf, tr_old = load_and_clean_data('data/train.csv',vectorizer_output=f'models/vecs/{vectorizer_label}.sav', tr=vectorizer, rename_dict={'tweet':'text', 'label':'output'})
df, df_tfidf, tr_old = load_and_clean_data('data/train2nonneg.csv',vectorizer_output=f'models/vecs/-tfidf-ngram.sav', tr=TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1,2))), rename_dict={'clean_comment':'text', 'category':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)
output_df = None
for model, param_grid in models:
    if verbose:
        print('=====================')
        print(f'=={model.__name__}==')
        print('=====================')

    # df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
    #X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
    # X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


    output, output_df = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model, param_grid, output_df ,data_label='-tfidf-ngram', verbose=verbose)
    output_df.to_csv(f'out/results/MoreResults{model.__name__}.csv')

==LinearSVR==
Starting tuning of LinearSVR with data labeled: -tfidf-ngram
Params 1/100: {'C': 100, 'epsilon': 0.1, 'fit_intercept': True, 'loss': 'epsilon_insensitive'}
[ 0.79800515  1.1636997   0.62710614 ... -0.26990771  0.417331
  0.02827326]
Outputdf: (1, 10)
Output: {'f1_score': 0.5609243697478992, 'tn': 3123, 'fp': 1660, 'fn': 848, 'tp': 1602, 'roc_auc_score': 0.710916980632939, 'average_precision_score': 0.5415073430420267, 'accuracy_score': 0.653255910410618, 'name': 'LinearSVR-C100epsilon0.1fit_interceptTruelossepsilon_insensitive--tfidf-ngram', 'params_full': "{'C': 100, 'epsilon': 0.1, 'fit_intercept': True, 'loss': 'epsilon_insensitive'}"}
Params 2/100: {'C': 100, 'epsilon': 0.1, 'fit_intercept': True, 'loss': 'squared_epsilon_insensitive'}
[ 0.91620507  1.21989233  0.71095882 ... -0.1402503   0.40001187
 -0.12031582]
Outputdf: (2, 10)
Output: {'f1_score': 0.5813708260105449, 'tn': 3197, 'fp': 1586, 'fn': 796, 'tp': 1654, 'roc_auc_score': 0.7354345961675492, 'average_preci