# Midterm Exam

## Import Library

In [8]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Preprocessing

In [9]:
all_class = ["CE", "ENV", "BME", "PE", "METAL", "ME", "EE", "CPE", "OPTIC", "NANO", "CHE", "MATENG", "AGRI", "EDU", "IE", "SAFETY", "MATH", "MATSCI" ]

In [10]:
import json

with open('train_for_student.json') as json_file:
  data = json.load(json_file)

In [20]:
df = pd.DataFrame(columns=["ID", "Title", "Abstract"] + all_class)

In [31]:
for idx, (key, value) in enumerate(data.items()):
  row = {
    "ID": key,
    "Title": value["Title"],
    "Abstract": value["Abstract"]
  }

  for cls in all_class:
    row[cls] = 1 if cls in value["Classes"] else 0
  
  df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

df

Unnamed: 0,ID,Title,Abstract,CE,ENV,BME,PE,METAL,ME,EE,...,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
0,001,Activated carbon derived from bacterial cellul...,© 2019 Elsevier B.V.Activated carbon derived f...,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
1,002,The algorithm of static hand gesture recogniti...,© Springer International Publishing AG 2018.Te...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,003,Alternative Redundant Residue Number System Co...,© 2018 IEEE.Residue number system (RNS) is a n...,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,004,Comparative study of wax inhibitor performance...,© Published under licence by IOP Publishing Lt...,0,0,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,005,Undrained lower bound solutions for end bearin...,"© 2019 John Wiley & Sons, Ltd.The undrained be...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,450,A portable USB-controlled potentiostat for pap...,© 2018 IEEEThis paper presents a portable and ...,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
904,451,Literature reviews on applying artificial inte...,Copyright © 2019 for this paper by its authors...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
905,452,A multi-parameterized water quality prediction...,© 2019 The authors and IOS Press. All rights r...,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
906,453,Semantic Segmentation on Medium-Resolution Sat...,© 2018 IEEE.Semantic Segmentation is a fundame...,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0


In [33]:
import nltk
# Uncomment to download "stopwords"
nltk.download("stopwords")
from nltk.corpus import stopwords

def text_preprocessing(s):
    """
    - Lowercase the sentence
    - Change "n't" to "not"
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"n't", " not", s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

[nltk_data] Downloading package stopwords to /Users/titor/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [34]:
df_clean = df.copy()
df_clean["Title"] = df["Title"].apply(text_preprocessing)
df_clean["Abstract"] = df["Abstract"].apply(text_preprocessing)

df_clean

Unnamed: 0,ID,Title,Abstract,CE,ENV,BME,PE,METAL,ME,EE,...,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
0,001,activated carbon derived bacterial cellulose u...,2019 elsevier b v activated carbon derived bac...,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
1,002,algorithm static hand gesture recognition usin...,springer international publishing ag 2018 tech...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,003,alternative redundant residue number system co...,2018 ieee residue number system rns number rep...,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,004,comparative study wax inhibitor performance po...,published licence iop publishing ltd petroleum...,0,0,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,005,undrained lower bound solutions end bearing ca...,2019 john wiley sons ltd undrained bearing cap...,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,450,portable usb controlled potentiostat paper bas...,2018 ieeethis paper presents portable inexpens...,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
904,451,literature reviews applying artificial intelli...,copyright 2019 paper authors use permitted cre...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
905,452,multi parameterized water quality prediction m...,2019 authors ios press rights reserved paper p...,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
906,453,semantic segmentation medium resolution satell...,2018 ieee semantic segmentation fundamental ta...,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0


In [37]:
X = df_clean['Title'] + ' ' + df_clean['Abstract']
y = df_clean.drop(['ID', 'Title', 'Abstract'], axis=1).values

print(f"X: ${X}")
print(f"y: ${y}")

X: $0      activated carbon derived bacterial cellulose u...
1      algorithm static hand gesture recognition usin...
2      alternative redundant residue number system co...
3      comparative study wax inhibitor performance po...
4      undrained lower bound solutions end bearing ca...
                             ...                        
903    portable usb controlled potentiostat paper bas...
904    literature reviews applying artificial intelli...
905    multi parameterized water quality prediction m...
906    semantic segmentation medium resolution satell...
907    reducing defects pillar stamping part automoti...
Length: 908, dtype: object
y: $[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]


In [38]:
X_tokenized = [x.split() for x in X]

In [39]:
from collections import Counter

vocab = Counter(word for sentence in X_tokenized for word in sentence)
vocab_size = len(vocab)
print(vocab_size)

8622


In [41]:
word_to_idx = {word: i+1 for i, (word, _) in enumerate(vocab.items())}
print(word_to_idx)



In [42]:
X_encoded = [[word_to_idx[word] for word in sentence] for sentence in X_tokenized]
print(X_encoded)

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 16, 17, 25, 7, 26, 27, 9, 11, 16, 17, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 42, 46, 47, 21, 48, 49, 50, 51, 52, 53, 54, 55, 7, 35, 56, 57, 9, 10, 58, 30, 59, 32, 18, 16, 17, 60, 50, 23, 21, 48, 24, 16, 17, 61, 9, 10, 62, 11, 63, 62, 64, 63, 65, 66, 67, 68, 9, 10, 69, 28, 70, 32, 71, 72, 73, 74, 75, 76, 58, 77, 70, 59, 32, 78, 24, 16, 17, 7, 64, 79, 71, 80, 81, 82, 83, 24, 16, 17, 84, 85, 86, 87, 88, 7, 11, 89, 90, 9, 27], [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 108, 112, 113, 114, 115, 116, 117, 36, 118, 119, 105, 120, 121, 93, 94, 122, 123, 124, 108, 125, 112, 126, 127, 128, 129, 130, 131, 132, 133, 134, 93, 94, 95, 135, 127, 96, 136, 137, 138, 139, 140, 141, 97, 98, 99, 142, 98, 93, 94, 95, 115, 143, 144, 93, 145, 146, 147, 148, 149, 93, 94, 121, 150, 151, 137, 99, 149, 144, 92, 93, 145], [15

In [44]:
max_length = max(len(sentence) for sentence in X_encoded)
X_padded = np.array([sentence + [0] * (max_length - len(sentence)) for sentence in X_encoded])

## Train Test Split

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

## Making model

In [56]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class TextClassifier(nn.Module):
  def __init__(self, vocab_size, embed_dim, num_class):
    super(TextClassifier, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.fc = nn.Linear(embed_dim * max_length, num_class)

  def forward(self, x):
    embed = self.embedding(x).view(x.size(0), -1)
    output = self.fc(embed)
    return torch.sigmoid(output)

In [57]:
embedding_dim = 50
output_dim = y.shape[1]
model = TextClassifier(vocab_size+1, embedding_dim, output_dim)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

ImportError: cannot import name 'is_sparse_any' from 'torch._subclasses.meta_utils' (/opt/homebrew/Caskroom/miniforge/base/envs/datasci/lib/python3.9/site-packages/torch/_subclasses/meta_utils.py)

## Train model

In [None]:
epochs = 5
batch_size = 1
dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_train).long(), torch.from_numpy(y_train).float())
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
  for x, y in tqdm(loader):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
  
  print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")