In [13]:
# COMP90051 Project 1 source code
# For Team 192

import numpy as np
import pandas as pd

from scipy import stats
from sklearn import preprocessing

# self-made external scripts
import ext_scipy

In [14]:
# Run-level settings shared by the cells below; the whole dict is also stored
# alongside the results when they are saved.
config = dict(
    ROW_USE='tweet',            # dataframe column used as the model's text input
    RANDOM_STATE=1569198,       # seed handed to the stochastic components
    COMMENT='On new data set',  # free-text note about this run
)


In [None]:
# Load the pre-cleaned train / test tables.
df_X_train = pd.read_csv('cleaned_train.csv')
df_X_test = pd.read_csv('cleaned_test.csv')

# Complete set of labels, collected before any training rows are dropped, so
# the incremental learner can be told about every class up front.
all_unique = df_X_train['label'].unique()

# Training rows without text in the selected column cannot be vectorized: drop them.
df_X_train = df_X_train[df_X_train[config['ROW_USE']].notna()]
X_train, y_train = df_X_train[config['ROW_USE']].to_numpy(), df_X_train['label'].to_numpy()

# Test rows must all be kept (one prediction per row), so a missing value in the
# selected column falls back to the raw 'tweet' text of the same row.
# fillna aligns on the index and assigns through the frame itself; the previous
# chained form `df[col][mask] = ...` triggers SettingWithCopyWarning and writes
# to a temporary (i.e. does nothing) when pandas copy-on-write is enabled.
# When ROW_USE is 'tweet' this is a no-op.
df_X_test[config['ROW_USE']] = df_X_test[config['ROW_USE']].fillna(df_X_test['tweet'])
X_test = df_X_test[config['ROW_USE']]

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, Normalizer, FunctionTransformer, StandardScaler
from sklearn.kernel_approximation import Nystroem, RBFSampler

# Keyword arguments meant for HashingVectorizer (n_features / alternate_sign
# are hashing options); one option per line so single entries are easy to toggle.
vectorizer_params = {
    'strip_accents': 'unicode',
    #token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
    'analyzer': 'word',
    #'max_df': 3.1e-6,
    'lowercase': True,
    'stop_words': 'english',
    'n_features': 2**15,
    'alternate_sign': False,
    # 'norm': None
}

# Keyword arguments for the vocabulary-based vectorizers (CountVectorizer /
# TfidfVectorizer): the vocabulary is capped at 32768 terms.
vectorizer_params_2 = {
    'strip_accents': 'unicode',
    #token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
    'analyzer': 'word',
    'max_features': 32768,
    'lowercase': True,
    'stop_words': 'english',
}

# Keyword arguments for LatentDirichletAllocation (online variational updates
# in mini-batches of 2000 documents, 1500 topics, seeded from config).
lda_params = dict(
    n_components=1500,
    learning_method='online',
    batch_size=2000,
    random_state=config['RANDOM_STATE'],
    n_jobs=1,
    verbose=2,
)


# Keyword arguments for the Nystroem kernel approximation (32 components,
# gamma fixed at 1, seeded from config).
nystroem_params = dict(
    random_state=config['RANDOM_STATE'],
    n_components=2**5,
    gamma=1,
)

In [17]:
#import sklearn.random_projection as rp

# Vectorize the text
# Plain bag-of-words counts; the TF-IDF re-weighting step is left disabled.
fp_vectorize = Pipeline(
    steps=[
        ('vc', CountVectorizer(**vectorizer_params_2)),
        #('tfidf', TfidfTransformer())
    ],
    verbose=True,
)

# put it altogether
# (dimensionality reduction is currently not chained on:
#  make_pipeline(fp_vectorize, fp_dim_reduce))
fp_all = fp_vectorize

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# using library from https://github.com/cjhutto/vaderSentiment

# sentiment
def get_vader_sentiment(X, **args):
    """Return the VADER compound sentiment score of each tweet as a column.

    Written for use inside a ``FunctionTransformer``.

    Parameters
    ----------
    X : iterable of str
        The data handed over by the pipeline. Used as the text source only
        when no ``tweets`` keyword is supplied.
    **args
        ``tweets`` (optional): iterable of raw tweet strings to score instead
        of ``X``; this is what ``kw_args`` injects in this notebook.

    Returns
    -------
    numpy.ndarray of shape (n_tweets, 1)
        The ``'compound'`` entry of ``polarity_scores`` for every tweet.
    """
    # Fall back to the pipeline input so the transformer also works when
    # kw_args was never set (previously a KeyError).
    tweets = args.get('tweets', X)
    analyzer = SentimentIntensityAnalyzer()
    scores = [analyzer.polarity_scores(tw)['compound'] for tw in tweets]
    return np.array(scores).reshape(-1, 1)

# length of tweet
def get_text_length(X, **args):
    """Return the character count of each tweet as a column.

    Written for use inside a ``FunctionTransformer``.

    Parameters
    ----------
    X : iterable of str
        The data handed over by the pipeline. Used as the text source only
        when no ``tweets`` keyword is supplied.
    **args
        ``tweets`` (optional): iterable of raw tweet strings to measure
        instead of ``X``; this is what ``kw_args`` injects in this notebook.

    Returns
    -------
    numpy.ndarray of shape (n_tweets, 1)
        ``len(tweet)`` for every tweet.
    """
    # Fall back to the pipeline input so the transformer also works when
    # kw_args was never set (previously a KeyError).
    tweets = args.get('tweets', X)
    return np.array([len(tw) for tw in tweets]).reshape(-1, 1)


# Feature pipeline: TF-IDF term weights side by side with two standardized
# scalar features (tweet length and VADER compound sentiment).
# The two scalar branches read their text from the 'tweets' keyword, which is
# injected through set_params(...kw_args=...) before fitting / transforming.
new_pipe = Pipeline(steps=[
    ('feats', FeatureUnion(transformer_list=[
        # Sparse text representation.
        ('main', Pipeline(steps=[
            #('vectorizer', HashingVectorizer(**vectorizer_params)),
            ('tfidf', TfidfVectorizer(**vectorizer_params_2)),
            #('svd', TruncatedSVD(n_components=300, random_state=config['RANDOM_STATE']))
        ])),
        # Character count of each tweet.
        ('length', Pipeline(steps=[
            ('lenf', FunctionTransformer(get_text_length, validate=False)),
            ('lennorm', StandardScaler()),
        ])),
        # Compound sentiment score of each tweet.
        ('sent', Pipeline(steps=[
            ('sentf', FunctionTransformer(get_vader_sentiment, validate=False)),
            ('sentnorm', StandardScaler()),
        ])),
    ])),
])

In [7]:
# From here on the combined feature pipeline is the active one.
fp_all = new_pipe

# Raw tweet text of the retained training rows; row-aligned with X_train
# because both come from the already-filtered df_X_train.
X_tweets = df_X_train['tweet'].to_numpy()
fp_all.set_params(
    feats__length__lenf__kw_args={'tweets': X_tweets},
    feats__sent__sentf__kw_args={'tweets': X_tweets},
)
X_train_transformed = fp_all.fit_transform(X_train)

In [None]:
# Alternative that was tried: fit the vocabulary on train + test text together.
#fp_all.fit(np.concatenate((X_train, X_test)))
# NOTE(review): this repeats the fit_transform of the previous cell on the same
# input, so in a top-to-bottom run the pipeline is fitted twice.
X_train_transformed = fp_all.fit_transform(X_train)

In [11]:
# Point the length / sentiment branches at the test tweets, then apply the
# already-fitted pipeline (transform only, no re-fit) to the test text.
X_tweets = df_X_test['tweet'].to_numpy()
fp_all.set_params(
    feats__length__lenf__kw_args={'tweets': X_tweets},
    feats__sent__sentf__kw_args={'tweets': X_tweets},
)
X_test_cv = fp_all.transform(X_test)

In [12]:
# Cache the transformed matrices on disk.
# Passing the path lets np.save open *and close* the file itself; the previous
# np.save(open(path, 'wb'), ...) form never closed the handle.
np.save('np_train_svd.npy', X_train_transformed)
np.save('np_test_svd.npy', X_test_cv)

# To reload instead of recomputing:
# NOTE(review): if the matrices are scipy sparse objects they are stored pickled,
# in which case np.load presumably needs allow_pickle=True — confirm before use.
#X_train_transformed = np.load(open('np_train_svd.npy', 'rb'))
#X_test_cv = np.load(open('np_test_svd.npy', 'rb'))

In [19]:
# Sanity check: (n_rows, n_features) of the transformed training matrix.
X_train_transformed.shape

(328932, 32768)

In [20]:
from sklearn.model_selection import train_test_split

# Validation accuracy after each training epoch is appended to this list.
accs = []

# Hold out 3.5% of the transformed training rows for validation (seeded split).
X_train_2, X_vald, y_train_2, y_vald = train_test_split(
    X_train_transformed,
    y_train,
    test_size=0.035,
    random_state=config['RANDOM_STATE'],
)

In [21]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split

# Perceptron: loss="perceptron", eta0=1, learning_rate="constant", penalty=None

# Active model: SGD-trained linear classifier with hinge loss and L2 penalty,
# wrapped in the project's incremental learner (fed 30000 rows per batch —
# presumably; see ext_scipy.IncrementalLearn).
inc = ext_scipy.IncrementalLearn(
    SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-6,
        warm_start=True,
        n_jobs=-1,
    ),
    batch_amount=30000,
)
# Alternatives that were tried:
#inc = ext_scipy.IncrementalLearn(SGDClassifier(n_jobs=-1, warm_start=True, loss="perceptron", eta0=1, learning_rate="constant", penalty=None), batch_amount=30000)
#inc = ext_scipy.IncrementalLearn(MultinomialNB(alpha=0.01), batch_amount=8000)
#inc = ext_scipy.IncrementalLearn(SGDClassifier(n_jobs=-1, warm_start=True, loss="log", alpha=1e-8), batch_amount=30000)

In [22]:
import time
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

def train_model_incremental(inc, epochs, X_train_use, y_train_use, X_vald, y_vald, accuracy_list, random_state=None):
    """Train ``inc`` for a number of epochs, optionally tracking validation accuracy.

    Each epoch reshuffles the training rows and calls ``inc.fit`` on them,
    passing the module-level ``all_unique`` as the full class list.

    Parameters
    ----------
    inc : object
        Incremental learner exposing ``fit(X, y, extra_text=..., classes=...)``
        and a ``model`` attribute with a ``predict`` method.
    epochs : int
        Number of passes over the training data; ``0`` leaves ``inc`` untouched.
    X_train_use, y_train_use
        Training features and labels (shuffled together each epoch).
    X_vald, y_vald
        Validation features and labels. Evaluation is skipped when either is ``None``.
    accuracy_list : list
        Mutated in place: one validation accuracy is appended per epoch. Its
        current contents are also passed to ``inc.fit`` as progress text.
    random_state : int or None, optional
        Base seed for the per-epoch shuffle. ``None`` (default) keeps the
        previous unseeded behaviour; with an integer, epoch ``i`` is shuffled
        with seed ``random_state + i`` so runs are repeatable while every
        epoch still sees a different order.

    Returns
    -------
    The same ``inc`` object that was passed in.
    """
    for epoch in range(epochs):
        epoch_seed = None if random_state is None else random_state + epoch
        X, y = shuffle(X_train_use, y_train_use, random_state=epoch_seed)
        inc.fit(X, y, extra_text='Progress: ' + str(accuracy_list), classes=all_unique)

        if X_vald is not None and y_vald is not None:
            epoch_preds = ext_scipy.predict_batch(X_vald, inc.model.predict, batch_amount=8000)
            accuracy_list.append(accuracy_score(y_vald, epoch_preds))
    return inc
        
def get_accuracy(model_pred_f, X_train_use, y_train_use, prop=0.08, batch_amount=5000):
    """Estimate accuracy on a random subsample of the given rows.

    Parameters
    ----------
    model_pred_f : callable
        Prediction function handed to ``ext_scipy.predict_batch``.
    X_train_use, y_train_use
        Features and labels to sample from; both are indexed with an integer
        index array.
    prop : float, optional
        Fraction of the rows to evaluate on (default 0.08).
    batch_amount : int, optional
        Forwarded to ``ext_scipy.predict_batch``.

    Returns
    -------
    float
        ``accuracy_score`` of the predictions on the sampled rows.
    """
    n_rows = X_train_use.shape[0]
    # Never ask for more rows than exist (required for sampling without replacement).
    n_sample = min(n_rows, int(n_rows * prop))
    # replace=False: np.random.choice draws WITH replacement by default, which
    # put duplicate rows into the evaluation sample.
    sample_tr = np.random.choice(n_rows, n_sample, replace=False)
    preds = ext_scipy.predict_batch(X_train_use[sample_tr], model_pred_f, batch_amount=batch_amount)

    return accuracy_score(y_train_use[sample_tr], preds)


#### NOTE:
# Perhaps use metrics.classification_report for more detailed report?

In [14]:
# Predict labels for the held-out validation rows through the project's
# batching helper (batch_amount=8000 presumably sets the chunk size).
# NOTE(review): read top to bottom, this cell comes before the cell that calls
# train_model_incremental, so it relies on inc.model having been fitted already.
vald_preds = ext_scipy.predict_batch(X_vald, inc.model.predict, batch_amount=8000)
# df_X_vald.iloc[np.nonzero(y_vald == vald_preds)]

'8000 / 12300 (0.6504, 25.559 s)'

In [18]:
# Validation accuracy. Arguments are given as (y_true, y_pred) to match the
# scikit-learn signature and the other calls in this notebook; accuracy is
# symmetric, so the value is the same as with the previous swapped order.
accuracy_score(y_vald, vald_preds)

0.8043089430894309

In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Only the last bare expression of a cell is displayed, so the three
# micro-averaged scores were previously computed and thrown away.
# Print each one explicitly.

# of all actual positives, which ones were correct
print('recall (micro):   ', recall_score(y_vald, vald_preds, average='micro'))

# of all predicted positive, which ones were correct
print('precision (micro):', precision_score(y_vald, vald_preds, average='micro'))

print('f1 (micro):       ', f1_score(y_vald, vald_preds, average='micro'))

# Per-class precision / recall / f1 breakdown.
print(classification_report(y_vald, vald_preds))

In [23]:
# Five passes over the training split; the validation accuracy measured after
# each pass is appended to accs.
inc = train_model_incremental(
    inc=inc,
    epochs=5,
    X_train_use=X_train_2,
    y_train_use=y_train_2,
    X_vald=X_vald,
    y_vald=y_vald,
    accuracy_list=accs,
)

# Final fit on the full training set, without validation:
#train_model_incremental(inc, 1, X_train_transformed, y_train, None, None, [])

'8000 / 11513 (0.6949, 32.717 s)'

## Saving results

In [None]:
import pickle

f_name = 'naive_bayes_results.pkl'

# train_acc was referenced below but never assigned in any visible cell
# (NameError on a fresh kernel). Estimate it on a random sample of the
# training split with the helper defined for that purpose.
train_acc = get_accuracy(inc.model.predict, X_train_2, y_train_2)

# Load the running list of experiment records, starting a new list when the
# results file does not exist yet. The file is this project's own output;
# never unpickle files from an untrusted source.
try:
    with open(f_name, 'rb') as results_in:
        res = pickle.load(results_in)
except FileNotFoundError:
    res = []

# One record per experiment: model, pipeline, learner settings and scores.
res.append({
    'model_name': type(inc.model).__name__,
    'model_params': inc.model.get_params(),
    'pipeline_params': fp_all.get_params()['steps'],
    'extra_params': inc.params_,
    'other': config,
    'accs': accs,
    'train_acc': train_acc,
})

#res[-1]['test_val'] = 0.20308

# Context manager guarantees the file is flushed and closed.
with open(f_name, 'wb') as results_out:
    pickle.dump(res, results_out)

In [13]:
import json

# Load previously recorded results; the context manager closes the file
# (the former json.load(open(...)) left the handle open).
with open('svm_results.json', 'r') as svm_results_file:
    vm_res = json.load(svm_results_file)

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23: use the standalone joblib package, falling back to the vendored copy
# only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

#joblib.dump(inc, 'svm_final.pkl') 
# NOTE: this replaces the in-memory `inc` with the persisted learner, so the
# predictions below come from the saved model rather than from the training
# cells above. joblib.load unpickles: only load files you created yourself.
inc = joblib.load('svm_final.pkl')

In [18]:
# Transform the test text with the fitted pipeline. The length / sentiment
# branches read whatever 'tweets' kw_args were set last, so the test tweets
# must have been injected via set_params before this cell runs.
X_test_final = fp_all.transform(X_test)

# Predict the test labels through the project's batching helper.
predictions = ext_scipy.predict_batch(
    X_test_final,
    inc.model.predict,
    batch_amount=10000,
    sparse_expand=False,
)
#predictions = model_use.predict(X_test_cv)

'30000 / 35437 (0.8466, 19.978 s)'

In [19]:
# Hand the test-set predictions to the project helper — presumably writes the
# submission file; see ext_scipy.save_predictions for the format and location.
ext_scipy.save_predictions(predictions)

In [16]:
# Sanity check: (n_rows, n_features) of the transformed test matrix.
# NOTE(review): the recorded output shows 80000 columns while the recorded
# training-matrix output shows 32768, so the saved outputs appear to come from
# different runs — re-run top to bottom and confirm the column counts match.
X_test_final.shape

(35437, 80000)