<a href="https://colab.research.google.com/github/RyanTokManMokMTM/NLP_Final/blob/main/nlp_getting_started.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Kaggle Compitition with NLP
## Natural Language Processing with Disaster Tweets

### load package

In [4]:
#install package
!pip install transformers



In [5]:
#package import here
from google.colab import drive
import pandas as pd
import nltk.corpus
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer,BertConfig,TFBertModel
import numpy as np
import keras

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### loading the data from csv

In [6]:
def dataLoader(csvPath):
  return pd.read_csv(csvPath,encoding="utf-8")

#### Define a function to remove the html url etc

In [7]:
#define a clean text function
def clean_texts(text):
  text = text.lower()
  #clean all not a-zA-z0-9 text
  textRe = re.compile("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
  return textRe.sub("",text)

#### need to clean all stop word with cleaning words list
#### stopword list with NL Toolkit stopword.word(english)

In [8]:
#clean all stop words in each sentance
def clean_stop_word(cleanTxt):
  return " ".join([word for word in cleanTxt.split() if not word in stopwordsList])

### Basic SGDClassifer(0.77)

In [9]:
def classifiedWithSGD(feature_x,label_y):
  #init the pieple for sgd
  sgd = Pipeline([
      ('vect', CountVectorizer()),
      ('tfidf',  TfidfTransformer()),
      ('nb', SGDClassifier()),
    ])
    #fit the data to model
  model = sgd.fit(feature_x, label_y)
  return model

### Tokenization
#### make the word as a vector of each sentence

In [10]:
#init CountVectorizer  
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None)
def tokenizationAndFitModel(featureSet):
  #fit the model with set
  return vectorizer.fit_transform(featureSet).toarray()

def tokenizationAndTranform(featureSet):
  #fit the model with set
  return vectorizer.transform(featureSet)


### Trying on Bert Model(0.57033),may need some more epoch and batch size

In [11]:
def generateModelArray(featureData,tokenizer,maxLength):
  ids = []
  masks = []
  tokens = []
  for text in featureData:
    token = tokenizer.encode(text,max_length=maxLength)
    padding_size = maxLength - len(token)
    ids.append(token + [0]*padding_size)
    masks.append([1]*len(token) + [0]*padding_size) #only token is 1 other padding to 0
    tokens.append([0]*maxLength) #all 0
  ids = np.array(ids)
  masks = np.array(masks)
  tokens = np.array(tokens)
  return ids, masks,tokens

In [35]:
BATCH_SIZE = 32
MAX_SIZE = 128
EPOCHS = 5

def createBertModel(featureData,labelData):
  tokenizer = BertTokenizer.from_pretrained("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt")
  config = BertConfig.from_pretrained("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json")
  bert_model = TFBertModel.from_pretrained("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5",config=config)

  train_ids, train_mask,train_token = generateModelArray(featureData,tokenizer,MAX_SIZE)

  #train Label to np array
  train_label = np.array(labelData)


  #model argument  and 3 input
  input_ids = keras.layers.Input(shape=(MAX_SIZE,),dtype='int32')
  attension_mask = keras.layers.Input(shape=(MAX_SIZE,),dtype='int32')
  token_type_ids = keras.layers.Input(shape=(MAX_SIZE,),dtype='int32')

  Bertmodel  = bert_model([input_ids,attension_mask,token_type_ids])
  lastStateOutPut = Bertmodel.last_hidden_state
  poolerOutput =  Bertmodel.pooler_output

  #model output layer
  denseOutput = keras.layers.Dense(units=1,activation="sigmoid")(poolerOutput)

  #creating model
  model = keras.models.Model(inputs=[input_ids,attension_mask,token_type_ids],outputs = denseOutput)

  #model compile
  model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])

  #training model
  #data set for train and test
  (x_train_ids,x_test_index,x_train_mask,x_test_mask,x_train_token,x_test_token,y_train,y_test)= train_test_split(train_ids,train_mask,train_token,train_label,test_size = 0.1,stratify=train_label, random_state=0)
  
  print(x_train_ids)
  #early_stopping
  training_early_stopping = keras.callbacks.EarlyStopping(patience=3,restore_best_weights=True)

  #fit model
  model.fit([x_train_ids,x_train_mask,x_train_token],y_train,epochs=EPOCHS,batch_size=BATCH_SIZE,validation_data=([x_test_index,x_test_mask,x_test_token],y_test),callbacks=[training_early_stopping])

  return model,tokenizer


In [13]:
# #predict the model
# y_predicted = model.predict([test_ids,test_mask,test_token],batch_size=32,verbose=1)

In [14]:
# y_predicted = y_predicted.ravel()

In [15]:
# y_predicted = (y_predicted >= 0.5).astype(int)

In [16]:
# def outputSubmission(submissionFile,outputCsv):
#   predict_submission = pd.read_csv("sample_submission.csv")
#   predict_submission["target"] = predict
#   predict_submission.to_csv("nlp_submission_sgd.csv",index = False)

# Main process is running here

In [17]:
def process_SGD(featureData,lableData,predictFeature):
  x_train,x_test,y_train,y_test = train_test_split(featureData,lableData,random_state = 0)
  model = classifiedWithSGD(x_train,y_train)

  #predict test data set
  predict = model.predict(x_test)
  #show predict report
  print(classification_report(y_test,predict))

  predictResult = model.predict(predictFeature)
  return predictResult

In [31]:
def process_RFC(featureData,lableData,predictFeature):
  x_train,x_test,y_train,y_test = train_test_split(featureData,lableData,random_state = 0)
  #need to tokenization word to vector
  feature_vector = tokenizationAndFitModel(x_train)
  
  #create the model
  RFC = RandomForestClassifier(n_estimators=100)

  #fit moodel with tokenization vector
  RFC = RFC.fit(feature_vector,y_train)
  test_vector = tokenizationAndTranform(x_test)
  # #predict 
  RFCPredict = RFC.predict(test_vector)

  #print predict report
  print(classification_report(y_test,RFCPredict))

  #predict the result with testSet
  test_setVector = tokenizationAndTranform(testSetFeature)
  RFCPredict = RFC.predict(test_setVector)
  return RFCPredict

In [36]:
def process_bert(featureData, labelData,predictFeature):
  model,tokenizer = createBertModel(featureData, labelData)

  #predict with model
  predict_ids,predict_masks,predict_tokens = generateModelArray(predictFeature,tokenizer)
  preicted_result = model.predict([predict_ids,predict_masks,predict_tokens],batch_size=64,verbose=1)
  preicted_result = preicted_result.ravel()
  preicted_result = (preicted_result >= 0.5).astype(int)
  return preicted_result

In [None]:
#main
if  __name__ == '__main__':
  #login to google drive
  drive.mount('/content/gdrive')

  train_data = dataLoader("./gdrive/MyDrive/NLP_dataset/train.csv")
  test_data = dataLoader("./gdrive/MyDrive/NLP_dataset/test.csv")
  submission_data = dataLoader("./gdrive/MyDrive/NLP_dataset/sample_submission.csv")

  #drop out the data we don't need it
  train_data.drop(['id', 'keyword', 'location'],axis=1)

  #split feature and label
  features = train_data["text"].tolist()
  label = train_data["target"].tolist()
  test_features = test_data["text"].tolist()

  #all clean training data set(stop words not clean yet)
  clean_feature_data = [] #for training
  clean_feature_test_data = [] #for testing
  for txt in features:
    clean_feature_data.append(clean_texts(txt))

  for txt in test_features:
    clean_feature_test_data.append(clean_texts(txt))
  
  #get stop word english list
  stopwordsList = stopwords.words("english")

  #remove all stopword from sentance
  all_clean_text = []
  all_clean_testSet_text = []
  for sentance in clean_feature_data:
    all_clean_text.append(clean_stop_word(sentance))
  for sentance in clean_feature_test_data:
    all_clean_testSet_text.append(clean_stop_word(sentance))

  #ged_result = process_SGD(all_clean_text,label,all_clean_testSet_text)
  #rfc_result = process_RFC(all_clean_text,label,all_clean_testSet_text)
  bert_reuslt = process_bert(all_clean_text,label,all_clean_testSet_text)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).




Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Truncation was not explicitly activ

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
[[  101 28997  4971 ...     0     0     0]
 [  101  5175 11629 ...     0     0     0]
 [  101  5926 20645 ...     0     0     0]
 ...
 [  101  4584  5343 ...     0     0     0]
 [  101  4575  2987 ...     0     0     0]
 [  101  2630  4330 ...     0     0     0]]
Epoch 1/5
  2/215 [..............................] - ETA: 2:26:49 - loss: 0.6737 - accurac