<a href="https://colab.research.google.com/github/Reichidad/Machine-Learning-2020-Spring-Class/blob/assignment11/assignment11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification using neural networks
20145822 김영현


## Decompress movie_review.zip

In [4]:
from zipfile import ZipFile

# NOTE(review): absolute Google Drive path — presumably only valid in a Colab
# session with Drive mounted; adjust when running elsewhere.
file_name = "/content/drive/My Drive/Colab Notebooks/data11/movie_review.zip"

# Bind the archive to a name that does not shadow the built-in zip(); a
# module-level `zip` variable would break every later zip(...) call in the notebook.
with ZipFile(file_name, 'r') as archive:
  # extract everything into the current working directory
  archive.extractall()
  print('Decompress Done')

Decompress Done


## Install CuPy

In [2]:
# Installs the CuPy package that is imported as `cp` further down.
# NOTE(review): the package name suggests the CUDA 10.1 build — confirm it matches the runtime's CUDA version.
!pip install cupy-cuda101



## Codes for Data Preprocessing

In [9]:
import numpy as np
import cupy as cp
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
nltk.download('wordnet')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split


# Load the extracted reviews; with no `encoding` argument load_files() hands back
# the raw file contents as bytes.
review_data = load_files(r"movie_review")
X, y = review_data.data, review_data.target

stemmer = WordNetLemmatizer()


def _clean_review(raw_review):
    """Normalize one raw review into a lower-cased, lemmatized, space-joined string.

    Args:
        raw_review: review text as bytes (as returned by load_files) or str.

    Returns:
        The cleaned review as a single string of space-separated tokens.
    """
    # Decode bytes properly. Taking str() of a bytes object yields its "b'...'"
    # repr, which turns newlines into a literal backslash + 'n' and glues a
    # stray 'n' onto the first word of every line.
    if isinstance(raw_review, bytes):
        document = raw_review.decode('utf-8', errors='replace')
    else:
        document = str(raw_review)

    # Remove all the special characters
    document = re.sub(r'\W', ' ', document)

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    # (the caret must be unescaped to anchor at the start of the text)
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization (split() also drops any leading/trailing whitespace)
    return ' '.join(stemmer.lemmatize(word) for word in document.split())


documents = [_clean_review(raw_review) for raw_review in X]

# Bag-of-words counts restricted to the 1500 most frequent terms that appear in
# at least 5 documents and in at most 70% of them, without English stop words.
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# Re-weight the raw counts with TF-IDF.
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

# First 70% of the samples for training, last 30% for testing (no reshuffling here).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2002 2002


## Codes for Training & Testing

In [None]:
num_train = len(X_train)
num_test = len(X_test)

# per-iteration history: one value is appended to each list by every training iteration
train_loss_list = []
train_accr_list = []
test_loss_list = []
test_accr_list = []

# number of input features per sample (columns of the TF-IDF matrix)
num_features = X_train.shape[1]

# theta initialization with normal distribution N(0, 1)
# Each matrix has one extra column for the bias input of its layer.
# Network: num_features -> 196 -> 49 -> 1 (a single sigmoid output for the binary label).
theta_u = cp.random.randn(196, num_features + 1)
theta_v = cp.random.randn(49, 197)
theta_w = cp.random.randn(1, 50)
# learning values
alpha = 0.09       # learning rate
lambda_reg = 7.5   # regularization strength


# fully connected calculation with bias(1)
def func_calc(theta_list, op_list):
  """Apply one fully connected layer, prepending the constant bias input 1.

  Args:
    theta_list: weight matrix of shape (n_out, n_in + 1); column 0 holds the bias weights.
    op_list: layer input with n_in values (NumPy or CuPy array, or a sequence).

  Returns:
    CuPy array with the n_out pre-activation values.
  """
  # Move both operands to the device explicitly: CuPy does not implicitly
  # convert NumPy arrays, so mixing np.insert() output with cp.matmul() fails.
  weights = cp.asarray(theta_list)
  inputs = cp.asarray(op_list).ravel()
  bias = cp.ones(1, dtype=inputs.dtype)
  return cp.matmul(weights, cp.concatenate((bias, inputs)))


# sigmoid calculation
def sigmoid(val):
  """Logistic function 1 / (1 + e^(-val)), evaluated element-wise with CuPy."""
  denominator = 1 + cp.exp(-val)
  return 1 / denominator


# derivative of the sigmoid
def d_sigmoid(val):
  """Derivative of the logistic function at `val`: sigmoid(val) * (1 - sigmoid(val))."""
  activation = sigmoid(val)
  complement = 1 - activation
  return activation * complement


# objective function
def ob_func(labels, results, num=None):
  """Regularized binary cross-entropy objective.

  Args:
    labels: ground-truth labels (0 or 1), one per sample.
    results: predicted probabilities (sigmoid outputs), one per sample.
    num: number of samples to average over; defaults to the number of results.

  Returns:
    Mean cross-entropy over the samples plus the regularization term from ob_func_reg().
  """
  targets = cp.asarray(labels)
  outputs = cp.asarray(results)
  if num is None:
    num = outputs.size

  # keep probabilities strictly inside (0, 1) so log() never receives 0
  eps = 1e-12
  outputs = cp.clip(outputs, eps, 1 - eps)

  cross_entropy = -targets * cp.log(outputs) - (1 - targets) * cp.log(1 - outputs)
  return cp.sum(cross_entropy) / num + ob_func_reg()


# additional loss term from regularization
def ob_func_reg():
  """Regularization term of the objective, computed from the three weight matrices.

  Returns lambda_reg * (mean(theta_u^2) + mean(theta_v^2) + mean(theta_w^2))
  divided by twice the total number of weights.

  NOTE(review): each matrix is already averaged by cp.mean and the sum is then
  divided by the total weight count again — confirm this double normalization
  is intended and consistent with the gradient term in g_d_reg.
  """
  mean_sq_u, mean_sq_v, mean_sq_w = [
      cp.mean(theta**2) for theta in (theta_u, theta_v, theta_w)
  ]
  total_weights = theta_u.size + theta_v.size + theta_w.size
  penalty = mean_sq_u + mean_sq_v + mean_sq_w
  return lambda_reg * penalty / (2 * total_weights)


# additional gradient-descent term from regularization
def g_d_reg(theta):
  """Regularization part of the gradient for one weight matrix.

  Returns lambda_reg * theta divided by the number of entries in `theta`.
  """
  scaled = lambda_reg * theta
  return scaled / theta.size


# main function for 1 iteration
def _add_bias_column(batch):
  """Return the 2-D `batch` with a leading column of ones (the bias input of a layer)."""
  ones = cp.ones((batch.shape[0], 1), dtype=batch.dtype)
  return cp.concatenate((ones, batch), axis=1)


def _forward_batch(features):
  """Forward-propagate a (n_samples, n_features) batch through the three sigmoid layers.

  Returns the bias-augmented input of each layer (x_b, y_b, z_b) and the network
  output h_sigmoid as a flat array with one probability per sample.
  """
  x_b = _add_bias_column(features)
  y_b = _add_bias_column(sigmoid(cp.matmul(x_b, theta_u.T)))
  z_b = _add_bias_column(sigmoid(cp.matmul(y_b, theta_v.T)))
  h_sigmoid = sigmoid(cp.matmul(z_b, theta_w.T)).ravel()
  return x_b, y_b, z_b, h_sigmoid


def _accuracy_percent(h_sigmoid, labels):
  """Percentage of samples whose thresholded output (>= 0.5 -> 1, else 0) equals the label."""
  predictions = (h_sigmoid >= 0.5).astype(labels.dtype)
  return int(cp.count_nonzero(predictions == labels)) * 100 / labels.size


# main function for 1 iteration
def train_once():
  """Run one full-batch training iteration and record train/test metrics.

  Side effects:
    - appends one value to each of train_loss_list, train_accr_list,
      test_loss_list and test_accr_list (all measured with the weights
      as they were before this iteration's update);
    - updates theta_u, theta_v and theta_w in place with one gradient-descent step.

  Raises:
    ValueError: if theta_w does not describe a single-output layer.
  """
  global theta_u, theta_v, theta_w

  if theta_w.shape[0] != 1:
    raise ValueError("theta_w must have exactly one row (single sigmoid output), got shape %s"
                     % (theta_w.shape,))

  # --------------------------
  # training code
  # --------------------------
  # move the data to the device explicitly (no-op when it already lives there);
  # CuPy does not accept NumPy arrays implicitly
  train_x = cp.asarray(X_train)
  train_y = cp.asarray(y_train)

  # forward-propagation over the whole training set at once
  x_b, y_b, z_b, h_sigmoid = _forward_batch(train_x)

  # store train_loss & train_accuracy
  train_loss_list.append(ob_func(train_y, h_sigmoid))
  train_accr_list.append(_accuracy_percent(h_sigmoid, train_y))

  # gradient descent with back-propagation
  # output layer: d(cross-entropy)/d(h_sigmoid) * sigmoid'(h) simplifies to (h_sigmoid - label)
  delta_h = (h_sigmoid - train_y).reshape(-1, 1)
  theta_w_next = cp.matmul(delta_h.T, z_b)

  # second hidden layer: drop the bias column of theta_w, then apply sigmoid'(z)
  z_sigmoid = z_b[:, 1:]
  delta_z = cp.matmul(delta_h, theta_w[:, 1:]) * z_sigmoid * (1 - z_sigmoid)
  theta_v_next = cp.matmul(delta_z.T, y_b)

  # first hidden layer: drop the bias column of theta_v, then apply sigmoid'(y)
  y_sigmoid = y_b[:, 1:]
  delta_y = cp.matmul(delta_z, theta_v[:, 1:]) * y_sigmoid * (1 - y_sigmoid)
  theta_u_next = cp.matmul(delta_y.T, x_b)

  # --------------------------
  # testing code
  # --------------------------
  test_x = cp.asarray(X_test)
  test_y = cp.asarray(y_test)

  # forward-propagation only in testing
  _, _, _, test_h_sigmoid = _forward_batch(test_x)

  # store test_loss & test_accuracy
  test_loss_list.append(ob_func(test_y, test_h_sigmoid))
  test_accr_list.append(_accuracy_percent(test_h_sigmoid, test_y))

  # update theta: averaged data gradient plus the regularization gradient
  theta_u -= (alpha * (theta_u_next/num_train + g_d_reg(theta_u)))
  theta_v -= (alpha * (theta_v_next/num_train + g_d_reg(theta_v)))
  theta_w -= (alpha * (theta_w_next/num_train + g_d_reg(theta_w)))
# run 3500 training iterations, reporting the latest metrics after each one
for iteration in range(3500):
  train_once()
  print(iteration)
  print("train / test loss :", train_loss_list[-1], test_loss_list[-1])
  print("train / test accr :", train_accr_list[-1], test_accr_list[-1])
# the counter ends at the number of completed iterations (3500)
iteration += 1