In [None]:
# Use GPU option

# In Colab, go to Edit/Notebook Settings and choose the 'GPU' option before running this script

In [None]:
# Load data

# there are several ways to load data into Colab

# 1. Host your data to GitHub (up to 25MB) and use the url to the GitHub page
# e.g. df = pd.read_csv('https://raw.githubusercontent.com/junwang4/causal-language-use-in-science/master/data/pubmed_causal_language_use.csv') 

# 2. Host your data in your Google drive and then mount to your Google drive. You will be given an authorization code to finish the process
# e.g. the following code
# from google.colab import drive
# drive.mount('/drive')
# df = pd.read_csv('/drive/My Drive/train.tsv', sep='\t')

# 3. Upload your data to Colab Files. The uploaded file will be deleted when the session is disconnected. You will need to upload again after re-connect.
# e.g.   df = pd.read_csv('train.tsv', sep='\t')
# In this script we use method # 3

# We will use the Kaggle sentiment classification data
# https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('train.tsv', sep='\t')

In [None]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [None]:
# prepare train and test data
# since fine tuning a BERT model still requires a significant amount of time, 
# only 1000 training examples and 1000 test examples will be used for demo purpose.
# prior experiment shows LinearSVC's best accuracy (3-fold CV) is about 62-65% depending on vectorization options
# BERT should be able to outperform LinearSVC with far fewer training examples.

dff = df.sample(frac=1)
#train_size = int(0.01 * len(dff))
#df_train = dff[:train_size]
#df_test = dff[train_size:]
df_train = dff[:1000]
df_test = dff[-1000:]
print(df_train.shape)
print(df_test.shape)
X_train, y_train = df_train['Phrase'].values, df_train['Sentiment'].values
X_test, y_test = df_test['Phrase'].values, df_test['Sentiment'].values

(1000, 4)
(1000, 4)


In [None]:
# check the category distribution in the train and test set

import numpy as np
unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)))
unique, counts = np.unique(y_test, return_counts=True)
print(np.asarray((unique, counts)))


[[  0   1   2   3   4]
 [ 46 189 507 201  57]]
[[  0   1   2   3   4]
 [ 44 166 520 214  56]]


In [None]:
# build a LinearSVC model as a baseline comparison to the BERT model
# since LinearSVC is a linear model, we can print out its top features in each category to see whether the model learned something meaningful
# in this example, the top 10 features for the "very negative" category (category 0) is printed out
# you can see that some top features are not so negative
# with only 1000 training examples, LinearSVC achieved 0.529 accuracy on the 1000 test examples.

from sklearn.feature_extraction.text import CountVectorizer
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=2, stop_words='english')
X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=1, max_iter=2000)
svm_clf.fit(X_train_vec,y_train)
feature_ranks = sorted(zip(svm_clf.coef_[0], unigram_count_vectorizer.get_feature_names()))
very_negative_10 = feature_ranks[-10:]
print("Very negative words")
for i in range(0, len(very_negative_10)):
    print(very_negative_10[i])
print()

Very negative words
(0.9979363730230754, 'holes')
(1.0162234195409297, 'sluggish')
(1.063475708785675, 'ways')
(1.0695641896643968, 'lower')
(1.1080579068105074, 'feeling')
(1.1503862403816845, 'mess')
(1.155753511998471, 'project')
(1.180017839779986, 'labored')
(1.3358762226684777, 'ridiculous')
(1.6833128586405508, 'worst')



In [None]:
# to save some time, we are using a hold-out test to compare the LinearSVC and BERT models
# you can also try cross validation
# LinearSVC test accuracy
X_test_vec = unigram_count_vectorizer.transform(X_test)
y_pred = svm_clf.predict(X_test_vec)
svm_clf.score(X_test_vec,y_test)

0.529

In [None]:
# you can also try cross validation on the train set
# the result should not be far from the hold-out test
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
svm_pipe = Pipeline([('vect', unigram_count_vectorizer),('svm',LinearSVC(dual=True, max_iter=2000))])
scores = cross_val_score(svm_pipe,X_train,y_train,cv=3)
print(sum(scores)/len(scores))

0.500998003992016


In [None]:
# print confusion matrix and classification report

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4])
print(cm)
print()

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred, target_names=target_names))

[[  4  10  20  10   0]
 [  2  22 126  13   3]
 [  5  25 451  34   5]
 [  1  16 144  47   6]
 [  1   4  25  21   5]]

              precision    recall  f1-score   support

           0       0.31      0.09      0.14        44
           1       0.29      0.13      0.18       166
           2       0.59      0.87      0.70       520
           3       0.38      0.22      0.28       214
           4       0.26      0.09      0.13        56

    accuracy                           0.53      1000
   macro avg       0.36      0.28      0.29      1000
weighted avg       0.46      0.53      0.47      1000



In [None]:
# LinearSVC error analysis
# print out errors to check if any common patterns for further model improvement

# print out very positive examples that were predicted as negative
# you can print out different types of errors 
err_cnt = 0
for i in range(0, len(y_test)):
  if (y_test[i]==4 and y_pred[i]==1):
    print(X_test[i])
    err_cnt = err_cnt+1
print("errors:", err_cnt)

is a stunning new young talent in one of Chabrol 's most intense psychological mysteries
An odd drama set in the world of lingerie models and bar dancers in the Midwest that held my interest precisely because it did n't try to .
The Hours represents two of those well spent
imbued with passion and attitude
errors: 4


In [1]:
# install BERT sklearn wrapper written by charles9n
# check out the github page for fine tuning options and usage
# https://github.com/charles9n/bert-sklearn

!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .

'git' is not recognized as an internal or external command,
operable program or batch file.
The system cannot find the path specified.


In [None]:
# fine tune a BERT base uncased model
# since this wrapper has included vectorization using word embedding, no need to vectorize like in LinearSVC
# first the pre-trained BERT model will be loaded in
# then the training starts. 90% examples will be used as training examples and the other 10% as validation (parameter tuning)
# default setting is 3 epoch. Each epoch takes in some training data
from bert_sklearn import BertClassifier
model = BertClassifier()         # text/text pair classification
print(model)
model.fit(X_train, y_train)

Building sklearn text classifier...
BertClassifier(bert_config_json=None, bert_model='bert-large-uncased',
               bert_vocab=None, do_lower_case=None, epochs=3, eval_batch_size=8,
               fp16=False, from_tf=False, gradient_accumulation_steps=1,
               ignore_label=None, label_list=None, learning_rate=2e-05,
               local_rank=-1, logfile='bert_sklearn.log', loss_scale=0,
               max_seq_length=128, num_mlp_hiddens=500, num_mlp_layers=0,
               random_state=42, restore_file=None, train_batch_size=32,
               use_cuda=True, validation_fraction=0.1, warmup_proportion=0.1)



  0%|          | 0/231508 [00:00<?, ?B/s][A
 23%|██▎       | 52224/231508 [00:00<00:00, 427682.02B/s][A
100%|██████████| 231508/231508 [00:00<00:00, 898030.22B/s]


Loading bert-large-uncased model...



  0%|          | 0/1344997306 [00:00<?, ?B/s][A
  0%|          | 34816/1344997306 [00:00<1:19:32, 281818.74B/s][A
  0%|          | 243712/1344997306 [00:00<59:45, 375034.45B/s] [A
  0%|          | 1027072/1344997306 [00:00<42:54, 522125.09B/s][A
  0%|          | 4127744/1344997306 [00:00<30:14, 739113.89B/s][A
  1%|          | 8125440/1344997306 [00:00<21:19, 1045246.20B/s][A
  1%|          | 12499968/1344997306 [00:00<15:04, 1473833.51B/s][A
  1%|▏         | 16989184/1344997306 [00:00<10:42, 2068189.03B/s][A
  2%|▏         | 21593088/1344997306 [00:01<07:38, 2883745.31B/s][A
  2%|▏         | 24446976/1344997306 [00:01<05:38, 3899554.14B/s][A
  2%|▏         | 28874752/1344997306 [00:01<04:05, 5368168.22B/s][A
  2%|▏         | 32061440/1344997306 [00:01<03:07, 7009621.70B/s][A
  3%|▎         | 36502528/1344997306 [00:01<02:19, 9379000.91B/s][A
  3%|▎         | 40617984/1344997306 [00:01<01:46, 12206085.39B/s][A
  3%|▎         | 45151232/1344997306 [00:01<01:23, 15633238.7

In [None]:
model.save('bert-sentiment.model')

In [None]:
model.score(X_test, y_test)

HBox(children=(FloatProgress(value=0.0, description='Testing', max=125.0, style=ProgressStyle(description_widt…



Loss: 1.1000, Accuracy: 60.60%


60.6

In [None]:
y_pred = model.predict(X_test)

HBox(children=(FloatProgress(value=0.0, description='Predicting', max=125.0, style=ProgressStyle(description_w…




In [None]:
from sklearn.metrics import confusion_matrix
y_pred = svm_clf.predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4])
print(cm)
print()

from sklearn.metrics import classification_report
target_names = ['0','1','2','3','4']
print(classification_report(y_test, y_pred, target_names=target_names))

[[  4  10  20  10   0]
 [  2  22 126  13   3]
 [  5  25 451  34   5]
 [  1  16 144  47   6]
 [  1   4  25  21   5]]

              precision    recall  f1-score   support

           0       0.31      0.09      0.14        44
           1       0.29      0.13      0.18       166
           2       0.59      0.87      0.70       520
           3       0.38      0.22      0.28       214
           4       0.26      0.09      0.13        56

    accuracy                           0.53      1000
   macro avg       0.36      0.28      0.29      1000
weighted avg       0.46      0.53      0.47      1000



In [None]:
# BERT error analysis
err_cnt = 0
for i in range(0, len(y_test)):
  if (y_test[i]==4 and y_pred[i]==1):
    print(X_test[i])
    err_cnt = err_cnt+1
print("errors:", err_cnt)

since its poignancy hooks us completely
errors: 1


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model,X_train,y_train,cv=3)
print(sum(scores)/len(scores))

59.39742137346928
