In [1]:
import pickle
import os

from re import subn
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, average_precision_score, accuracy_score
import matplotlib.pyplot as plt

from analyzer import data_cleaner
from analyzer.data_transformation import TfidfDataTransformer, BagOfWordsTransformer, DataTransformer

from imblearn.under_sampling import TomekLinks, RandomUnderSampler, CondensedNearestNeighbour,EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor



In [2]:
# Run this cell if you want to ignore warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.linalg import LinAlgWarning
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=LinAlgWarning, module='sklearn')
warnings.simplefilter('ignore', category=ConvergenceWarning)


In [3]:
def load_and_clean_data(filepath, rename_dict=None, vectorizer_output='models/vectorizer.sav', tr=None, **kwargs) -> (pd.DataFrame, DataTransformer):
    # Load data
    df = pd.read_csv(filepath, **kwargs)

    if rename_dict is not None:
        df.rename(columns=rename_dict, inplace=True)
    #display(df.head(5))
    # Clean data
        # Remove @ mentions
    df['clean_text'] = np.vectorize(data_cleaner.clean_mentions)(df['text'])
        # Remove non alfabet chars
    df['clean_text'] = df['clean_text'].str.replace("[^a-zA-Z#]", " ")
        # Remove short words
    df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    # Transform
    #tr = TfidfDataTransformer()
    if tr is None:
        tr = BagOfWordsTransformer()
    # Stemming
    df['clean_text'] = tr.stemming(df['clean_text'])

    tr.vectorizer_fit(df['clean_text'])
    df_tfidf = tr.transform(df['clean_text'])
    pickle.dump(tr.vectorizer, open(vectorizer_output, 'wb'))


    return df, df_tfidf, tr

def balance_data(X, y, balancer = RandomUnderSampler(sampling_strategy='not minority',random_state=1337)) -> pd.DataFrame:
    X_balanced, y_balanced = balancer.fit_resample(X, y)
    return X_balanced, y_balanced



def fit_model(X, y, model_type, params, model_output='../models/model.sav', ):
    model = model_type(**params)
    model.fit(X,y)
    if model_output is not None:
        pickle.dump(model, open(model_output, 'wb'))
    return model

proba_models = set(['LogisticRegression', 'BernoulliNB', 'MLPClassifier', 'ComplementNB'])
def predict(model, data):
    model_name = type(model).__name__
    model_type = 'probabilistic' if model_name in proba_models else 'other'
    if model_name in proba_models:
        predictions = model.predict_proba(data)
    else:
        predictions = model.predict(data)
    return predictions, model_type

def metrics(predictions, true_values, name_prefix="", plot=True, model_class='probabilistic', output_df=None):

    if model_class == 'probabilistic':
        predictions_int =  predictions[:,1]>=0.5
        predictions = predictions[:,1]
    else:
        predictions_int = predictions >= 0.5
    # Calculate metrics
    tn, fp, fn, tp = confusion_matrix(true_values, predictions_int).ravel()
    metrics = {
        'f1_score': f1_score(true_values, predictions_int),
        #'confusion_matrix': confusion_matrix(true_values, predictions_int),
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'roc_auc_score': roc_auc_score(true_values, predictions),
        'average_precision_score': average_precision_score(true_values, predictions),
        'accuracy_score': accuracy_score(true_values, predictions_int),
        'name': name_prefix
    }
    # Plot?? mby
    if plot:
        ConfusionMatrixDisplay.from_predictions(true_values, predictions_int)
        plt.show()
    # Write to outputfile.
    if output_df is None:
        output_df = pd.DataFrame.from_records(metrics, index=[0])
    else:
        output_df = output_df.append(pd.DataFrame.from_dict(metrics, orient='index').T)
    with open(f'out/results/{name_prefix}_out.out', 'w') as f:
        f.write(str(metrics))
    return metrics, output_df

In [4]:
def clean_folders():
    for folder, end in [('models', '.sav'), ('out/results', '.out')]:
        filelist = [ f for f in os.listdir(folder) if f.endswith(end) ]
        for f in filelist:
            os.remove(os.path.join(folder, f))

In [5]:
def analyze_sentence(text, model_file, vectorizer_file):
    text = pd.Series([text])
    #text = np.vectorize(data_cleaner.clean_mentions)(text)
    # Remove non alfabet chars
    text = text.str.replace("[^a-zA-Z#]", " ")
    # Remove short words
    text = text.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    # Transform
    tr = TfidfDataTransformer(vectorizer_file)
    # Stemming
    text = tr.stemming(text)
    df_tfidf = tr.transform(text)
    model = pickle.load(open(f'{model_file}', 'rb'))
    return model.predict_proba(df_tfidf)


In [6]:
def __model_tuning__(X_train, y_train, X_valid, y_valid, model_type,params, output_df, data_label="", verbose=False, plot=False):
    output_name = f'{model_type.__name__}-$-{data_label}'
    _model_output = f'models/{output_name}.sav'
    outputs = []
    if verbose:
        print(f'Starting tuning of {model_type.__name__} with data labeled: {data_label}')
        print('===============')
    parameter_grid = ParameterGrid(params)
    for i, p in enumerate(parameter_grid):
        param_string = subn("[{}',:]","","".join(str(p).split()))[0]
        if verbose:
            print(f'Params {i+1}/{len(parameter_grid)}: {p}')
        model_output = _model_output.replace('$',param_string)
        model = fit_model(X_train, y_train, model_type, params=p, model_output=model_output)
        predictions, model_class = predict(model, X_valid)
        print(predictions)
        output, output_df = metrics(predictions, y_valid, output_name.replace('$',param_string),plot=plot, model_class=model_class, output_df=output_df)
        outputs.append(output)
        print(f'Outputdf: {output_df.shape}')
        if verbose:
            print(f'Output: {output}')
    if verbose:
        print(f'Tuning ended')
        print('===============')
    return outputs, output_df


In [10]:
clean_folders()

In [8]:
verbose = True
random_state = 1337

In [9]:
models = [
    ( KNeighborsRegressor, {
            'n_neighbors' : [3, 5, 10, 15, 20, 25, 30],
            'weights' : ['uniform', 'distance'],
            'leaf_size' : [ 40, 50, 60],
            'n_jobs' : [-1]
        }),
    ( SVR, {
        'kernel' : ['rbf'],
        'C' : [1, 10, 100, 1000],
        'gamma' : [0.001, 0.0001],
        'epsilon' : [0.1, 0.01, 0.001, 0.0001],
    }),
    (DecisionTreeRegressor,{
        'criterion' : ['squared_error', 'poisson'],
        'splitter' : ['best'],
        'max_depth' : [None, 10, 20, 30, 40, 50],
        'min_samples_split' : [2, 5],
    }),
    (LogisticRegression, {
        'penalty': ['none', 'l2'],
        'class_weight': [None,'balanced'],
        'n_jobs': [-1],
        'solver': ['newton-cholesky']
    }),
    (BernoulliNB, {
        'alpha' : np.arange(0.0, 1.0, 0.2),
        'fit_prior': [True, False],
        'binarize': [None]

    }),
    (MLPClassifier, {
        'hidden_layer_sizes': [(100,), (100, 100), (50, 50, 50), (50,)],
        'activation': ['identity', 'logistic'],
        'alpha': [0.0001, 0.001]
    }),
    (ComplementNB,{
        'alpha' : np.arange(0.0, 1.0, 0.2),
        'fit_prior': [True, False],
        'norm': [True, False],
    })


]

vectorizers = [
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=2000, stop_words='english')), '-tfidf'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1,2))), '-tfidf-ngram'),
    (TfidfDataTransformer(vectorizer=TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1,3))), '-tfidf-ngram3'),
    (BagOfWordsTransformer(vectorizer=CountVectorizer(max_features=2000, stop_words='english')), 'bow'),
    (BagOfWordsTransformer(vectorizer=CountVectorizer(max_features=2000, stop_words='english', ngram_range=(1,2))), '-bow-ngram'),
    (BagOfWordsTransformer(vectorizer=CountVectorizer(max_features=2000, stop_words='english', ngram_range=(1,3))), '-bow-ngram3'),
]

output_df = None
for vectorizer, vectorizer_label in vectorizers[1:2]:
    if verbose:
        print('=====================')
        print(f'=={vectorizer.__class__.__name__}==')
        print('=====================')
    df, df_tfidf, tr_old = load_and_clean_data('data/train.csv',vectorizer_output=f'models/vecs/{vectorizer_label}.sav', tr=vectorizer, rename_dict={'tweet':'text', 'label':'output'})
    X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
    X_train_balanced, y_train_balanced = balance_data(X_train, y_train)
    for model, param_grid in models[-1:]:
        if verbose:
            print('=====================')
            print(f'=={model.__name__}==')
            print('=====================')

       # df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
        #X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
       # X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


        output, output_df = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model, param_grid, output_df ,data_label=vectorizer_label, verbose=verbose)
output_df.to_csv(f'out/results/Results.csv')
    #output2 = __model_tuning__(X_train, y_train, X_valid, y_valid, model, param_grid, 'lin_reg_1' ,'unba', verbose=verbose)


==TfidfDataTransformer==
==ComplementNB==
Starting tuning of ComplementNB with data labeled: -tfidf-ngram
Params 1/20: {'alpha': 0.0, 'fit_prior': True, 'norm': True}
[[0.50007143 0.49992857]
 [0.5000663  0.4999337 ]
 [0.50009549 0.49990451]
 ...
 [0.50018483 0.49981517]
 [0.499957   0.500043  ]
 [0.49996394 0.50003606]]
Outputdf: (1, 9)
Output: {'f1_score': 0.23205434183954468, 'tn': 4774, 'fp': 4135, 'fn': 48, 'tp': 632, 'roc_auc_score': 0.8585297914204406, 'average_precision_score': 0.42576409482276995, 'accuracy_score': 0.5637709875899468, 'name': 'ComplementNB-alpha0.0fit_priorTruenormTrue--tfidf-ngram'}
Params 2/20: {'alpha': 0.0, 'fit_prior': True, 'norm': False}
[[9.99999943e-01 5.67151842e-08]
 [9.99999878e-01 1.21881015e-07]
 [1.00000000e+00 3.51772301e-11]
 ...
 [1.00000000e+00 5.41447121e-14]
 [3.86401387e-01 6.13598613e-01]
 [4.27468746e-01 5.72531254e-01]]
Outputdf: (2, 9)
Output: {'f1_score': 0.39214334941419715, 'tn': 7256, 'fp': 1653, 'fn': 111, 'tp': 569, 'roc_auc_sco

In [None]:
output_df.to_csv(f'out/results/Results.csv')

In [None]:
model1_grid= {
    'penalty': ['none', 'l2'],
    'class_weight': [None, 'balanced'],
    'n_jobs': [-1],
    'solver': ['newton-cholesky'],
    'max_iter': [500]
}
model1 = LogisticRegression
df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)

balancers = [TomekLinks(sampling_strategy='majority'), TomekLinks(sampling_strategy='not minority'),  RandomUnderSampler(), SMOTE(sampling_strategy='minority'),CondensedNearestNeighbour()]
results = []
for b in balancers[:3]:
    X_train_balanced, y_train_balanced = balance_data(X_train, y_train, b)
    output = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model1, model1_grid, 'lin_reg_1' ,'bal', verbose=False)

    results.append(f'Balancer {str(b)} : {max(output, key=lambda x: x["roc_auc_score"])}')

print('========')


In [21]:
for r in results:
    print(r)

Balancer TomekLinks(sampling_strategy='majority') : {'f1_score': 0.389544150462225, 'confusion_matrix': array([[7063, 1846],
       [  69,  611]], dtype=int64), 'roc_auc_score': 0.9374362013297856, 'average_precision_score': 0.6739797660178359}
Balancer TomekLinks(sampling_strategy='not minority') : {'f1_score': 0.389544150462225, 'confusion_matrix': array([[7063, 1846],
       [  69,  611]], dtype=int64), 'roc_auc_score': 0.9374362013297856, 'average_precision_score': 0.6739797660178359}
Balancer RandomUnderSampler() : {'f1_score': 0.24647364513734224, 'confusion_matrix': array([[4865, 4044],
       [  16,  664]], dtype=int64), 'roc_auc_score': 0.9288538358434629, 'average_precision_score': 0.6292684522647592}


In [14]:

test = 'bad bad i hate this ugly thingy libtard'
test2 = 'i like this its beautiful, nice job'
m = 'LogisticRegression-class_weightNonen_jobs-1penaltynonesolvernewton-cholesky-bal-1680105414.123568.sav'
analyze_sentence(test2, model_file=f'models/{m}', vectorizer_file='models/vectorizer.sav')


array([[0.73975098, 0.26024902]])

In [20]:
import nltk

df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


classifier = nltk.classify.SklearnClassifier(LogisticRegression())
# Transform two arrays (X_train_balanced, y_train_balanced) into a list of tuples
data = list(zip(X_train_balanced, y_train_balanced))
valid_data = list(zip(X_valid, y_valid))
classifier.train(data)
accuracy = nltk.classify.accuracy(classifier, valid_data)
print(f"{accuracy:.2%}")


AttributeError: 'int' object has no attribute 'items'

In [5]:
# Load Results.csv as dataframe
df = pd.read_csv('out/results/Results_complete.csv')
# Sort by roc_auc_score
df = df.sort_values(by=['roc_auc_score'], ascending=False)
# Print top 10
print(f'ROC AUC Score')
display(df[['roc_auc_score', 'name']].head(10))
# Sort by f1_score
df = df.sort_values(by=['f1_score'], ascending=False)
# Print top 10
print(f'F1 Score')
print(df[['f1_score', 'name']].head(1)['name'])

ROC AUC Score


Unnamed: 0,roc_auc_score,name
147,0.936441,ComplementNB-alpha0.8fit_priorFalsenormFalse--...
145,0.936441,ComplementNB-alpha0.8fit_priorTruenormFalse--t...
111,0.936394,BernoulliNB-alpha0.8binarizeNonefit_priorFalse...
110,0.936389,BernoulliNB-alpha0.8binarizeNonefit_priorTrue-...
143,0.936195,ComplementNB-alpha0.6000000000000001fit_priorF...
141,0.936195,ComplementNB-alpha0.6000000000000001fit_priorT...
109,0.936161,BernoulliNB-alpha0.6000000000000001binarizeNon...
108,0.936161,BernoulliNB-alpha0.6000000000000001binarizeNon...
146,0.935747,ComplementNB-alpha0.8fit_priorFalsenormTrue--t...
144,0.935747,ComplementNB-alpha0.8fit_priorTruenormTrue--tfidf


F1 Score
200    SVR-C10epsilon0.01gamma0.001kernelrbf--tfidf-n...
Name: name, dtype: object


In [None]:
f1_win = 'SVR-C10epsilon0.01gamma0.001kernelrbf--tfidf-ngram'
acc_win = 'KNeighborsRegressor-leaf_size40n_jobs-1n_neighbors10weightsdistance--tfidf-ngram'

In [None]:
df, df_tfidf, tr_old = load_and_clean_data('data/train.csv',vectorizer_output=f'models/vecs/v1.sav', tr=vectorizer, rename_dict={'tweet':'text', 'label':'output'})
X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)
for model, param_grid in models:
    if verbose:
        print('=====================')
        print(f'=={model.__name__}==')
        print('=====================')

    # df, df_tfidf, tr = load_and_clean_data('data/train.csv', rename_dict={'tweet':'text', 'label':'output'})
    #X_train, X_valid, y_train, y_valid = train_test_split(df_tfidf, df['output'],test_size=0.3,random_state=random_state)
    # X_train_balanced, y_train_balanced = balance_data(X_train, y_train)


    output, output_df = __model_tuning__(X_train_balanced, y_train_balanced, X_valid, y_valid, model, param_grid, output_df ,data_label=vectorizer_label, verbose=verbose)