In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
import json

In [2]:
urlbase = '/home/henri/Documents/Lighthouse-lab/Databases/w8-d4-db/'
files = ['sad_tweets.json','smile_tweets.json']

sadTweets = []
happyTweets = []
for file in files:
    if file == 'sad_tweets.json':    
        for line in open(urlbase + file, 'r'):
            sadTweets.append(json.loads(line))
    elif file == 'smile_tweets.json':
        for line in open(urlbase + file, 'r'):
            happyTweets.append(json.loads(line))

In [3]:
sadDf = pd.DataFrame(sadTweets)
sadDf['type'] = 0
happyDf = pd.DataFrame(happyTweets)
happyDf['type'] = 1

In [4]:
df = pd.concat([sadDf, happyDf]).reset_index(drop=True)

In [5]:
#sadTweets[1:2]

In [6]:
df[['text','type']].head(2)

Unnamed: 0,text,type
0,RT @bwintv: mj : the gift that fans gave us he...,0
1,RT @chaedimple: she’s so cute :( \n#채영 #트와이스 h...,0


In [7]:
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True,lowercase=True,strip_accents='ascii',stop_words=stopset)

In [8]:
y = df.type
x = vectorizer.fit_transform(df.text)

In [9]:
print(y.shape,x.shape)

(14296,) (14296, 20719)


In [10]:
 X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [11]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB()

In [12]:
roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

0.8885729539295577

## simple NN

In [27]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [28]:
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[5000], 
                  embedding_dim=[50],
                  maxlen=[100])

In [32]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from keras.preprocessing.sequence import pad_sequences
# Main settings
epochs = 20
embedding_dim = 50
maxlen = 100
output_file = 'data/output.txt'

# Run grid search for each source (yelp, amazon, imdb)
for source, frame in df.groupby('source'):
    print('Running grid search for data set :', source)
    sentences = df['text'].values
    y = df['type'].values

    # Train-test split
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # Tokenize words
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_test = tokenizer.texts_to_sequences(sentences_test)

    # Adding 1 because of reserved 0 index
    vocab_size = len(tokenizer.word_index) + 1

    # Pad sequences with zeros
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

    # Parameter grid for grid search
    param_grid = dict(num_filters=[32, 64, 128],
                      kernel_size=[3, 5, 7],
                      vocab_size=[vocab_size],
                      embedding_dim=[embedding_dim],
                      maxlen=[maxlen])
    model = KerasClassifier(build_fn=create_model,
                            epochs=epochs, batch_size=10,
                            verbose=False)
    grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                              cv=4, verbose=1, n_iter=5)
    grid_result = grid.fit(X_train, y_train)

    # Evaluate testing set
    test_accuracy = grid.score(X_test, y_test)

    # Save and evaluate results
    prompt = input(f'finished {source}; write to file and proceed? [y/n]')
    if prompt.lower() not in {'y', 'true', 'yes'}:
        break
    with open(output_file, 'a') as f:
        s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nTest Accuracy : {:.4f}\n\n')
        output_string = s.format(
            source,
            grid_result.best_score_,
            grid_result.best_params_,
            test_accuracy)
        print(output_string)
        f.write(output_string)

Running grid search for data set : 
Fitting 4 folds for each of 5 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 88.3min finished


In [33]:
test_accuracy = grid.score(X_test, y_test)
test_accuracy

0.8125349879264832

In [34]:
grid.best_params_.items()

dict_items([('vocab_size', 18421), ('num_filters', 32), ('maxlen', 100), ('kernel_size', 3), ('embedding_dim', 50)])