In [90]:
import sys
sys.path.append('../..')
import pandas as pd
from src.preprocessing import Preprocessor
from src.feature_engineering import FeatureCreator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [91]:
preprocessing_config = {
    "1": ['lower', 'remove_punctuation', 'remove_links', 'remove_numbers',
    'translate_emoji', 'tokenize', 'remove_stopwords', 'stem', 'lemmatize', 
    'remove_short_words', 'remove_long_words', 'shorten_text'],

    "2": ['lower', 'remove_punctuation', 'remove_links', 'remove_numbers',
    'translate_emoji', 'tokenize', 'remove_stopwords', 'stem', 'lemmatize', 
    'remove_short_words', 'remove_long_words'],

    "3": ['lower', 'remove_punctuation', 'remove_links', 'remove_numbers',
    'translate_emoji', 'tokenize', 'remove_stopwords', 'stem', 'lemmatize', 
    'remove_short_words'],

    "4": ['lower', 'remove_punctuation', 'remove_links', 'remove_numbers',
    'translate_emoji', 'tokenize', 'remove_stopwords', 'stem', 'lemmatize'],

    "5": ['lower', 'remove_punctuation', 'remove_links', 'remove_numbers',
    'translate_emoji', 'tokenize', 'remove_stopwords', 'stem'],
    
    "6": ['lower', 'remove_punctuation', 'remove_links', 'remove_numbers',
    'translate_emoji', 'tokenize', 'remove_stopwords', 'lemmatize'],
}

In [92]:
models_config = {
    "naive_bayes": MultinomialNB(alpha=0.1),
    "logistic_regression": LogisticRegression(
                          C=3, max_iter=100, 
                          solver='sag',
                          random_state=0,
                          multi_class='multinomial')
}

In [93]:
df = pd.read_csv('../../data/prepared/prepared.csv', usecols=['text','corpus','class'])
df['corpus'] = df['corpus'].apply(lambda x: x[1:-1].replace("'", "").split(', '))
df['corpus'] = df['corpus'].apply(lambda x: ' '.join(x))
y = df['class']

for model_name, model in models_config.items():
    for min_df in [0.001, 0.01, 0.05]:
        for max_df in [0.85, 0.9, 1.0]:            
            vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)
            X = vectorizer.fit_transform(df['corpus'])

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            clf = models_config[model_name]
            clf.fit(X_train, y_train)
            print(f'Model: {model_name}, Accuracy: {clf.score(X_test, y_test).round(4)} min_df: {min_df}, max_df: {max_df}')

Model: naive_bayes, Accuracy: 0.9007 min_df: 0.001, max_df: 0.85
Model: naive_bayes, Accuracy: 0.9007 min_df: 0.001, max_df: 0.9
Model: naive_bayes, Accuracy: 0.9007 min_df: 0.001, max_df: 1.0
Model: naive_bayes, Accuracy: 0.861 min_df: 0.01, max_df: 0.85
Model: naive_bayes, Accuracy: 0.861 min_df: 0.01, max_df: 0.9
Model: naive_bayes, Accuracy: 0.861 min_df: 0.01, max_df: 1.0
Model: naive_bayes, Accuracy: 0.7939 min_df: 0.05, max_df: 0.85
Model: naive_bayes, Accuracy: 0.7939 min_df: 0.05, max_df: 0.9
Model: naive_bayes, Accuracy: 0.7939 min_df: 0.05, max_df: 1.0
Model: logistic_regression, Accuracy: 0.9341 min_df: 0.001, max_df: 0.85
Model: logistic_regression, Accuracy: 0.9341 min_df: 0.001, max_df: 0.9
Model: logistic_regression, Accuracy: 0.9341 min_df: 0.001, max_df: 1.0
Model: logistic_regression, Accuracy: 0.9149 min_df: 0.01, max_df: 0.85
Model: logistic_regression, Accuracy: 0.9149 min_df: 0.01, max_df: 0.9
Model: logistic_regression, Accuracy: 0.9149 min_df: 0.01, max_df: 1.0

In [None]:
result= []
for model_name, model in models_config.items():
    for min_df in [1, 3, 4, 5, 6, 8, 9, 10, 1e-07, 1e-06, 5e-06 , 1e-06, 5e-05, 1e-04, 0.001]:         
        vectorizer = TfidfVectorizer(min_df=min_df)
        X = vectorizer.fit_transform(df['corpus'])

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        clf = models_config[model_name]
        clf.fit(X_train, y_train)
        result.append([model_name, clf.score(X_test, y_test).round(5), min_df])

In [None]:
result.sort(key=lambda x: x[1], reverse=True)
result = pd.DataFrame(result, columns=['model', 'accuracy', 'min_df'])

In [None]:
result[result['model'] == 'naive_bayes'].sort_values(by='accuracy', ascending=False).iloc[:3]

Unnamed: 0,model,accuracy,min_df
15,naive_bayes,0.918,5e-06
16,naive_bayes,0.91734,3.0
17,naive_bayes,0.91703,4.0


In [None]:
result[result['model'] == 'logistic_regression'].sort_values(by='accuracy', ascending=False).iloc[:3]

Unnamed: 0,model,accuracy,min_df
0,linear_regression,0.94238,8.0
1,linear_regression,0.94223,6.0
2,linear_regression,0.94216,4.0


In [None]:
print('Cross validation')   
for model_name, model in models_config.items():
    for min_df in [4, 6, 8, 3, 5e-06]:      
        vectorizer = TfidfVectorizer(min_df=min_df)
        X = vectorizer.fit_transform(df['corpus'])

        clf = models_config[model_name]
        print(f'''Model: {model_name}, min_df: {min_df}
        Accuracy: {cross_val_score(clf, X, y, cv=5).mean().round(4)}''')

Cross validation
Model: naive_bayes, min_df: 4
        Accuracy: 0.9151
Model: naive_bayes, min_df: 6
        Accuracy: 0.9146
Model: naive_bayes, min_df: 8
        Accuracy: 0.9141
Model: naive_bayes, min_df: 3
        Accuracy: 0.9154
Model: naive_bayes, min_df: 5e-06
        Accuracy: 0.9157
Model: linear_regression, min_df: 4
        Accuracy: 0.9414
Model: linear_regression, min_df: 6
        Accuracy: 0.9414
Model: linear_regression, min_df: 8
        Accuracy: 0.9413
Model: linear_regression, min_df: 3
        Accuracy: 0.9412
Model: linear_regression, min_df: 5e-06
        Accuracy: 0.9407


In [83]:
def get_accuracy(procedure_options):
    procedure = preprocessing_config[procedure_options]
    preprocessing = Preprocessor(procedure=procedure)
    corpus = df['text'].apply(lambda x: preprocessing.transform(x))
    print('Preprocessing done')
    y = df['class'][corpus.astype(bool)].values
    corpus = corpus[corpus.astype(bool)]

    X = vectorizer.fit_transform(corpus.apply(lambda x: ' '.join(x)))
    print('Vectorization done')
    for model_name, model in models_config.items():
        clf = models_config[model_name]
        clf.fit(X_train, y_train)
        acc = cross_val_score(clf, X, y, cv=5).mean().round(4)
        print(f'Model: {model_name}, Procedure: {procedure_options}, Accuracy: {acc}')
        result.append([model_name, procedure_options, acc])

In [82]:
vectorizer = TfidfVectorizer(min_df=4)
result = []

In [84]:
get_accuracy('1')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing done
Vectorization done
Model: naive_bayes, Procedure: 1, Accuracy: 0.8999
Model: logistic_regression, Procedure: 1, Accuracy: 0.933


In [85]:
get_accuracy('2')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing done
Vectorization done
Model: naive_bayes, Procedure: 2, Accuracy: 0.9004
Model: logistic_regression, Procedure: 2, Accuracy: 0.9336


In [86]:
get_accuracy('3')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing done
Vectorization done
Model: naive_bayes, Procedure: 3, Accuracy: 0.9004
Model: logistic_regression, Procedure: 3, Accuracy: 0.9336


In [87]:
get_accuracy('4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing done
Vectorization done
Model: naive_bayes, Procedure: 4, Accuracy: 0.9123
Model: logistic_regression, Procedure: 4, Accuracy: 0.9409


In [88]:
get_accuracy('5')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing done
Vectorization done
Model: naive_bayes, Procedure: 5, Accuracy: 0.9124
Model: logistic_regression, Procedure: 5, Accuracy: 0.9409


In [89]:
get_accuracy('6')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing done
Vectorization done
Model: naive_bayes, Procedure: 6, Accuracy: 0.9149
Model: logistic_regression, Procedure: 6, Accuracy: 0.9413


In [94]:
result

[['naive_bayes', '1', 0.8999],
 ['logistic_regression', '1', 0.933],
 ['naive_bayes', '2', 0.9004],
 ['logistic_regression', '2', 0.9336],
 ['naive_bayes', '3', 0.9004],
 ['logistic_regression', '3', 0.9336],
 ['naive_bayes', '4', 0.9123],
 ['logistic_regression', '4', 0.9409],
 ['naive_bayes', '5', 0.9124],
 ['logistic_regression', '5', 0.9409],
 ['naive_bayes', '6', 0.9149],
 ['logistic_regression', '6', 0.9413]]