# This notebook is meant to be run on [Google Colab](https://colab.research.google.com)

- Your data should be in your Google Drive
- The data should be in CSV format with no header row: the first column is the label and the second column is the text

In [19]:
import pandas as pd
# Display a sample of the formatted data. `names=` supplies the column names
# ('label', 'text') explicitly instead of taking them from the file.
# NOTE(review): this is a relative local path, not a Google Drive path — it
# presumably only resolves on the author's machine; confirm before running on Colab.
# NOTE(review): the saved output of this cell shows columns `cat1`/`text`, which
# does not match `names=` here — it looks stale from an earlier run; confirm.
pd.read_csv('../../data/formatted/formatted_cat1_text_sample.csv', names=['label', 'text'])

Unnamed: 0,cat1,text
0,143,عنوان ازمایشی متن ازمایشی


In [None]:
! pip install -U -q PyDrive
! git clone https://github.com/RasoulAM/bert_custom.git

In [7]:
import os

# Work from inside the cloned repository. The path is absolute (matching the
# BERT_BASE_DIR / DATA_DIR values below) so that re-running this cell does not
# fail once the working directory is already inside the repository.
os.chdir("/content/bert_custom")

# Locations inside the Colab VM; the shell commands in later cells read these
# through the environment ($BERT_BASE_DIR, $DATA_DIR, ...).
os.environ['BERT_BASE_DIR'] = '/content/bert_custom/multi_cased_L-12_H-768_A-12'
os.environ['DATA_DIR'] = '/content/bert_custom/persian'

# The three paths below are placeholders and must be edited before running.
# Later cells prefix them with "/content/gdrive/My Drive/", so they are given
# relative to the root of your Google Drive.
os.environ['TRAIN_DATA_PATH'] = 'path/to/train/data/in/drive.csv' # TODO path of your data in google drive
os.environ['TEST_DATA_PATH'] = 'path/to/test/data/in/drive.csv' # TODO path of your data in google drive
os.environ['OUTPUT_DIR'] = 'path/to/output/dir' # TODO path for the output in google drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Step 1: authenticate the Colab user (presumably an interactive prompt — confirm).
auth.authenticate_user()
# Step 2: hand the application-default credentials to PyDrive.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced by any of the cells visible here, which
# access Drive through the mount at /content/gdrive instead — confirm it is still needed.
drive = GoogleDrive(gauth)

In [None]:
# Mount Google Drive at /content/gdrive; the following cells read the data from,
# and write results to, "/content/gdrive/My Drive/...".
# The module is imported under the alias `myDrive` so it does not overwrite the
# `drive` PyDrive client created in the authentication cell.
from google.colab import drive as myDrive
myDrive.mount('/content/gdrive')

In [None]:
! git fetch cleanup
! git checkout cleanup
! mkdir persian
! wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
! unzip multi_cased_L-12_H-768_A-12.zip

# Copy all the data
! cp ../gdrive/My\ Drive/$TRAIN_DATA_PATH persian/train_file.csv
! cp ../gdrive/My\ Drive/$TRAIN_DATA_PATH persian/test_file.csv

# Copy part of the data
! head -1000 /content/gdrive/My\ Drive/$TRAIN_DATA_PATH > persian/train_file.csv
! head -200 /content/gdrive/My\ Drive/$TEST_DATA_PATH > persian/test_file.csv

In [None]:
from time import time
start_time = time()

! python run_classifier.py \
  --task_name=CLAS \
  --do_train=true \
  --do_eval=true \
  --do_predict=false \\
  --data_dir=$DATA_DIR \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=5e-5 \
  --num_train_epochs=2 \
  --output_dir=/content/gdrive/My\ Drive/$OUTPUT_DIR \
  --do_lower_case=False \
  --save_checkpoints_steps=5000 \
  --num_classes=10 \
  --classes=1,2,67,79,12,143,38,125,151,191

with open("/content/gdrive/My Drive/{}/time_eval.txt".format(os.environ['OUTPUT_DIR']), "w") as time_file:
  time_file.write(str(time() - start_time))