#Description

This source code makes use of Google Colab and Google Drive. We first built a logistic regression model for text classification, which gave us the highest overall accuracy. Afterwards, we attempted to build a visual model using PyTorch but only managed to obtain a very low accuracy of 0.2. Finally, we used the visual model's predictions to place each test item into one of the three broad categories (mobile, beauty and fashion), and then used three separate text classification models, one per broad category, to classify the item into its sub-category. However, this combined approach was slightly less accurate than the linear text classification model.

In [0]:
from google.colab import drive

# Mount Google Drive; later cells read and write under '/content/gdrive/My Drive/'.
drive.mount('/content/gdrive/')

#Text Model

In [0]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import nltk

In [0]:
# Load the training CSV and keep only the two columns the text model uses.
# NOTE(review): this is an absolute local Windows path, while the visual-model
# cells read '/content/gdrive/My Drive/Data/train.csv' — confirm which location
# is intended.
trainDF = pd.read_csv('C:/Users/alyta/Downloads/year 2/year 2 sem 2/NDSC/dataset/train.csv')
col = ['title', 'Category']
trainDF = trainDF[col]

print(trainDF.shape)

In [0]:
#read test data
# NOTE(review): absolute local Windows path; other cells read from
# '/content/gdrive/My Drive/Data/' — confirm which location is intended.
test = pd.read_csv('C:/Users/alyta/Downloads/year 2/year 2 sem 2/NDSC/dataset/test.csv')
print(test.shape)
# test_x is taken from the title column as loaded, i.e. before any
# normalize_corpus call made in later cells.
test_x=test["title"]

#prediction
# Empty (itemid, Category) frame that prediction rows are appended to later.
results = pd.DataFrame(columns = ["itemid", "Category"])

In [0]:
import re
from nltk.tokenize.toktok import ToktokTokenizer
# Fetch the NLTK stopword corpus and create the tokenizer used by remove_stopwords.
nltk.download('stopwords')
tokenizer = ToktokTokenizer()

# English stopword list with 'no' and 'not' taken out, so those two words are
# not filtered from the text.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

# Porter stemming of every whitespace-separated word
def simple_stemmer(text):
    """Return `text` with each whitespace-delimited word replaced by its Porter stem.

    The stemmed words are joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)
  
def remove_stopwords(text, is_lower_case=False):
    """Tokenize `text`, drop tokens found in the global `stopword_list`, re-join with spaces.

    Args:
        text: string to filter.
        is_lower_case: when true, tokens are compared to the stopword list as
            they are; otherwise each token is lower-cased for the comparison
            only (the kept token retains its original case).

    Returns:
        The surviving tokens joined by single spaces.
    """
    stripped_tokens = [tok.strip() for tok in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [tok for tok in stripped_tokens if tok not in stopword_list]
    else:
        kept = [tok for tok in stripped_tokens if tok.lower() not in stopword_list]
    return ' '.join(kept)

# strip everything except letters, whitespace and (optionally) digits
def remove_special_characters(text, remove_digits=False):
    """Delete every character of `text` that is not an ASCII letter or whitespace.

    Args:
        text: string to clean.
        remove_digits: when false (default) the digits 0-9 are kept as well;
            when true they are removed too.

    Returns:
        The cleaned string. Removed characters are deleted, not replaced by
        spaces, so existing whitespace is left untouched.
    """
    # The upper bound must be 'Z': the range 'A-z' also covers the ASCII
    # characters between 'Z' and 'a' ( [ \ ] ^ _ ` ), which would then survive.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def normalize_corpus(corpus):
    """Normalize every document in `corpus` and return the results as a list.

    Per document, in this order: lower-case; replace runs of CR / LF (and,
    because of the character class used, '|') with one space; stem each word
    with simple_stemmer; pad the characters { } . ( ) ! with spaces; strip
    everything but letters and whitespace via
    remove_special_characters(remove_digits=True); squeeze repeated spaces;
    drop stopwords via remove_stopwords.
    """
    newline_run = re.compile(r'[\r|\n|\r\n]+')
    # Inside the class, "(-)" is a range from '(' to ')', so only
    # { } . ( ) ! are matched — a literal '-' is not.
    isolate_chars = re.compile(r'([{.(-)!}])')
    space_run = re.compile(' +')

    def _normalize(doc):
        doc = newline_run.sub(' ', doc.lower())
        doc = simple_stemmer(doc)
        doc = isolate_chars.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits=True)
        doc = space_run.sub(' ', doc)
        return remove_stopwords(doc, is_lower_case=True)

    return [_normalize(doc) for doc in corpus]
  
trainDF['title']=normalize_corpus(trainDF['title'])
test['title']=normalize_corpus(test['title'])
# Re-bind test_x to the normalized column. It was taken from test["title"]
# before this cell ran, and depending on the pandas version that earlier Series
# may still hold the raw titles after the column assignment above, which would
# leave the test features preprocessed differently from the training features.
test_x = test['title']

In [0]:
#stratified sampling
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, random_state=25, test_size=0.1)

# n_splits=1, so the splitter yields exactly one (train, validation) index pair
train_index, valid_index = next(sss.split(X=trainDF.index.values, y=trainDF['Category'].values))

# Slice titles and labels for each side and renumber the rows from 0
train_x = trainDF.iloc[train_index]['title'].reset_index(drop=True)
train_y = trainDF.iloc[train_index]['Category'].reset_index(drop=True)
valid_x = trainDF.iloc[valid_index]['title'].reset_index(drop=True)
valid_y = trainDF.iloc[valid_index]['Category'].reset_index(drop=True)

print(valid_x.shape)
print(train_x.shape)

In [0]:
# word level tf-idf
# The vocabulary (at most 5000 terms) is learned from every title in trainDF;
# the fitted vectorizer then encodes the train, validation and test titles.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000).fit(trainDF['title'])
xtrain_tfidf, xvalid_tfidf, xtest_tfidf = (
    tfidf_vect.transform(titles) for titles in (train_x, valid_x, test_x)
)

In [0]:
# ngram level tf-idf
# Bi- and tri-gram features (at most 1000), with the vocabulary learned from all
# titles in trainDF.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=1000)
tfidf_vect_ngram.fit(trainDF['title'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# Kept under its own name: storing this in `xtest_tfidf` would replace the
# word-level test matrix with n-gram vectors from a different vocabulary, which
# do not match a classifier trained on xtrain_tfidf.
xtest_tfidf_ngram = tfidf_vect_ngram.transform(test_x)

In [0]:
# load the pre-trained word-embedding vectors
# NOTE(review): nothing is loaded here — embeddings_index stays an empty dict,
# so every lookup in the loop below returns None and embedding_matrix remains
# all zeros.
embeddings_index = {}

# create a tokenizer fitted on every title in trainDF
token = text.Tokenizer()
token.fit_on_texts(trainDF['title'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors (length 70)
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

test_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_x), maxlen=70)

# create token-embedding mapping: one 300-wide row per word index, plus row 0
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [0]:
def train_model_textual(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit `classifier` on the training features and return its validation accuracy.

    Args:
        classifier: object exposing fit(X, y) and predict(X).
        feature_vector_train: training feature matrix.
        label: training labels.
        feature_vector_valid: validation feature matrix.
        is_neural_net: when true, predict() output is reduced with
            argmax(axis=-1) before scoring.
        valid_label: true labels for `feature_vector_valid`. When omitted, the
            notebook-level `valid_y` is used, which is what this function
            always did before the parameter existed.

    Returns:
        Accuracy of the predictions against the validation labels.
    """
    if valid_label is None:
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score takes (y_true, y_pred)
    return metrics.accuracy_score(valid_label, predictions)
  

def score_model(classifier, feature_vector_train, label, feature_vector_test,results):
    """Fit `classifier` on the training data and return its predictions for the test features.

    `results` is accepted but not used by this function.
    """
    classifier.fit(feature_vector_train, label)
    return classifier.predict(feature_vector_test)
  
def generate_result(prediction,results):
    """Return `results` extended with one (itemid, Category) row per test item.

    Args:
        prediction: sequence of predicted categories, positionally aligned with
            the rows of the notebook-level `test` frame.
        results: DataFrame the new rows are added after; it is not modified.

    Returns:
        A new DataFrame holding the rows of `results` followed by the new rows.

    Raises:
        ValueError: if `prediction` and `test` differ in length.
    """
    # Use the `prediction` argument (the body used to read a global named
    # `predictions`, silently ignoring what the caller passed in).
    if len(prediction) != len(test):
        raise ValueError(
            "prediction has {} entries but test has {} rows".format(len(prediction), len(test)))

    # Build all rows in one go instead of appending to the frame row by row.
    new_rows = pd.DataFrame({'itemid': test['itemid'].values, 'Category': list(prediction)})
    return pd.concat([results, new_rows], ignore_index=True)

In [0]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model_textual(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

In [0]:
#train on linear model
predictions= score_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf,results)

In [0]:
result=generate_result(predictions,results)

In [0]:
result.to_csv('C:/Users/alyta/Downloads/year 2/year 2 sem 2/NDSC/dataset/predictions_linear.csv', index = False)

#Visual Model

In [0]:
%%capture
!unzip /content/gdrive/'My Drive'/Data/mobile_image.zip

In [0]:
%%capture
!unzip /content/gdrive/'My Drive'/Data/fashion_image.zip

In [0]:
%%capture
!unzip /content/gdrive/'My Drive'/Data/beauty_image.zip

In [0]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image
from mpl_toolkits.axes_grid1 import ImageGrid

%matplotlib inline

In [0]:
import torch
import torchvision
import PIL
from torch.utils.data import DataLoader, Dataset

from torch.autograd import Variable
import torchvision.models as models
import torchvision.transforms as transforms

from torch import nn

In [0]:
from imgaug import augmenters as iaa
import imgaug as ia

In [0]:
!pip install tensorboardX

In [0]:
import time
import copy
import math
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import log_loss
from tensorboardX import SummaryWriter

###Define necessary functions

In [0]:
#This function builds the preprocessing applied to validation images
def get_test_time_transform(scale_size, crop_size):
	"""Compose the deterministic pipeline: Resize(scale_size), CenterCrop(crop_size), ToTensor, Normalize.

	The normalization mean/std are fixed per-channel constants (presumably the
	ImageNet statistics expected by the pretrained models — confirm).
	"""
	tfm = torchvision.transforms
	normalize = tfm.Normalize(
		mean=[0.485, 0.456, 0.406],
		std=[0.229, 0.224, 0.225]
	)
	steps = [
		tfm.Resize(scale_size),
		tfm.CenterCrop(crop_size),
		tfm.ToTensor(),
		normalize,
	]
	return tfm.Compose(steps)

In [0]:
#This function builds the preprocessing (with light augmentation) applied to training images
def get_train_time_transform_simple(scale_size, crop_size):
	"""Compose the training pipeline: Resize, RandomCrop, RandomHorizontalFlip, ToTensor, Normalize.

	The normalization mean/std are fixed per-channel constants (presumably the
	ImageNet statistics expected by the pretrained models — confirm).
	"""
	tfm = torchvision.transforms
	normalize = tfm.Normalize(
		mean=[0.485, 0.456, 0.406],
		std=[0.229, 0.224, 0.225]
	)
	steps = [
		tfm.Resize(scale_size),
		tfm.RandomCrop(crop_size),
		tfm.RandomHorizontalFlip(),
		tfm.ToTensor(),
		normalize,
	]
	return tfm.Compose(steps)

In [0]:
def recover_image(im, writer=False):
	"""Undo the per-channel normalization of one image tensor and clip it to [0, 1].

	Args:
		im: tensor whose first axis is the 3-element channel axis; it is
			converted with .numpy() and permuted to put that axis last.
		writer: when true, the result is permuted back so the channel axis is
			first again; otherwise the channel axis stays last.

	Returns:
		A numpy array with values in [0, 1].
	"""
	channel_mean = np.array([0.485, 0.456, 0.406])
	channel_std = np.array([0.229, 0.224, 0.225])

	channels_last = im.numpy().transpose((1, 2, 0))
	restored = np.clip(channel_std * channels_last + channel_mean, 0, 1)

	return restored.transpose((2, 0, 1)) if writer else restored

In [0]:
#This function trains the visual model
def train_model_visual(model, device, dataloaders, criterion, optimizer, scheduler=None, num_epochs=25, in_notebook=True):
  """Train `model` and keep the weights from the epoch with the lowest validation loss.

  Args:
    model: network to train; called as model(inputs).
    device: device each batch's 'image' and 'category' tensors are moved to.
    dataloaders: dict with 'train' and 'val' loaders; every batch is indexed
      with the keys 'image' and 'category'.
    criterion: loss, called as criterion(outputs, labels).
    optimizer: optimizer stepped once per training batch.
    scheduler: optional; its step() is called at the start of each training phase.
    num_epochs: number of epochs to run.
    in_notebook: use tqdm_notebook progress bars when true, plain tqdm otherwise.

  Returns:
    (model, history): the model with the best weights loaded, and a DataFrame
    with one row per epoch holding train_loss, train_acc, val_loss and val_acc.
  """
  since = time.time()
  # initialize a progress bar
  tqdm_func = tqdm_notebook if in_notebook else tqdm
  writer = SummaryWriter()

  # initialize the best model
  best_model_wts = copy.deepcopy(model.state_dict())
  best_acc = 0.0
  best_log_loss = math.inf

  # per-epoch metrics, one list per phase
  train_performance_history = []
  val_performance_history = []

  for epoch in tqdm_func(range(num_epochs)):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
      if phase == 'train':
        # set model to training mode
        if scheduler is not None:
          scheduler.step()
        model.train()
        num_batch_per_epoch = int(
          np.ceil(len(dataloaders['train'].dataset) * 1.0 / dataloaders['train'].batch_size))
      else:
        # set model to evaluation mode
        model.eval()
        num_batch_per_epoch = int(
          np.ceil(len(dataloaders['val'].dataset) * 1.0 / dataloaders['val'].batch_size))

      running_loss = 0.0
      running_corrects = 0

      # Iterate over data.
      for batch_idx, batch_data in tqdm_func(enumerate(dataloaders[phase]), total=num_batch_per_epoch):

        writer_step = num_batch_per_epoch * epoch + (batch_idx + 1)

        inputs = batch_data['image'].to(device)
        labels = batch_data['category'].to(device)
        # log the first image of the batch, de-normalized, to TensorBoard
        img = recover_image(inputs[0].cpu(), writer=True)

        writer.add_image('image/transform', img, writer_step)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward
        # track history if only in train
        with torch.set_grad_enabled(phase == 'train'):
          # Get model outputs and calculate loss
          outputs = model(inputs)
          loss = criterion(outputs, labels)

          _, preds = torch.max(outputs, 1)

          # backward + optimize only if in training phase
          if phase == 'train':
            loss.backward()
            optimizer.step()
            writer.add_scalar('loss/loss', loss.item(), writer_step)

        # statistics (loss is weighted by batch size so the epoch mean is per-sample)
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)

      epoch_loss = running_loss / len(dataloaders[phase].dataset)
      epoch_acc = (running_corrects.double() / len(dataloaders[phase].dataset)).item()
      print('Epoch {} {} Loss: {:.4f} Acc: {:.4f}'.format(epoch, phase, epoch_loss, epoch_acc))

      if phase == 'train':
        # Record training metrics for the training phase only; appending them
        # for every phase would file the validation numbers under train_* too
        # and duplicate each epoch in the merged history.
        train_performance_history.append({
          'epoch': epoch,
          'train_loss': epoch_loss,
          'train_acc': epoch_acc
        })
        writer.add_scalar('accuracy/train_acc', epoch_acc, writer_step)
      else:
        val_performance_history.append({
          'epoch': epoch,
          'val_loss': epoch_loss,
          'val_acc': epoch_acc
        })

        writer.add_scalar('loss/val_loss', epoch_loss, writer_step)
        writer.add_scalar('accuracy/val_acc', epoch_acc, writer_step)
        # deep copy the model whenever the validation loss improves
        if epoch_loss < best_log_loss:
          best_log_loss = epoch_loss
          best_acc = epoch_acc
          best_model_wts = copy.deepcopy(model.state_dict())

    print()
  time_elapsed = time.time() - since
  print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
  print('Best validation Loss: {:4f} Acc: {:.4f}'.format(best_log_loss, best_acc))

  # process the full history: one row per epoch, train and val columns side by side
  full_performance_history = pd.DataFrame(train_performance_history).merge(
    pd.DataFrame(val_performance_history), on='epoch', how='left')

  # load best model weights
  model.load_state_dict(best_model_wts)

  writer.close()
  return model, full_performance_history

In [0]:
#This function freezes every parameter of the model when feature extraction (transfer learning) is used
def set_parameter_requires_grad(model, feature_extract):
	"""Set requires_grad=False on all of `model`'s parameters if `feature_extract` is truthy.

	When `feature_extract` is falsy the model is left untouched.
	"""
	if not feature_extract:
		return
	for weight in model.parameters():
		weight.requires_grad = False

In [0]:
#This function initializes the model based on the name passed to it, only resnet is tested
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
	"""Build a torchvision ResNet whose final layer outputs `num_classes` scores.

	Args:
		model_name: one of 'resnet18', 'resnet34', 'resnet50', 'resnet101',
			'resnet152'.
		num_classes: output size of the replacement `fc` layer.
		feature_extract: passed to set_parameter_requires_grad; when truthy the
			backbone parameters are frozen before the new `fc` layer is attached,
			so only that layer remains trainable.
		use_pretrained: forwarded as `pretrained=` to the torchvision constructor.

	Returns:
		The constructed model.

	Raises:
		ValueError: if `model_name` is not one of the supported names.
	"""
	supported_names = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')
	if model_name not in supported_names:
		raise ValueError(
			"Invalid model name {!r}; expected one of {}".format(model_name, supported_names))

	# Look the constructor up by name and call only that one. Building a dict of
	# already-constructed models would instantiate (and, when pretrained, fetch
	# weights for) all five networks just to use one of them.
	model_ft = getattr(models, model_name)(pretrained=use_pretrained)
	set_parameter_requires_grad(model_ft, feature_extract)

	# Replace the classification head; the new layer's parameters are trainable.
	num_ftrs = model_ft.fc.in_features
	model_ft.fc = nn.Linear(num_ftrs, num_classes)

	return model_ft

In [0]:
#This function draws a fixed-size sample (without replacement) from one category
def undersample(category, sample_size, sample_df):
  """Return `sample_size` randomly chosen rows of `sample_df` whose 'Category' equals `category`.

  Sampling is without replacement.
  """
  in_category = sample_df['Category'] == category
  return sample_df[in_category].sample(n=sample_size)

In [0]:
#This function draws a fixed-size sample (with replacement) from one category
def oversample(category, sample_size, sample_df):
  """Return `sample_size` rows drawn with replacement from the rows of `sample_df`
  whose 'Category' equals `category`.
  """
  in_category = sample_df['Category'] == category
  return sample_df[in_category].sample(n=sample_size, replace=True)

###Define necessary class

In [0]:
class ProductCategoryDataset(Dataset):
  """Dataset of product images described by a CSV file.

  Each item is a dict holding the (optionally transformed) RGB image under
  'image', the row's itemid under 'id' and, when `train` is true, the row's
  Category under 'category'.
  """

  def __init__(self, csv_file, root_dir, transform=None, train=True):
    """
    Args:
      csv_file (string): Path to the csv file with annotations.
      root_dir (string): Directory the image paths are joined onto.
      transform (callable, optional): Applied to the image of every sample.
      train (bool): When true, samples also carry the 'category' label.
    """
    self.train = train
    self.transform = transform
    self.root_dir = root_dir
    self.data = pd.read_csv(csv_file)

  def __len__(self):
    """Number of rows in the CSV."""
    return len(self.data)

  def __getitem__(self, idx):
    """Load the image for row `idx` and return the sample dict described on the class."""
    row = self.data.iloc[idx]

    # Some image_path values lack the extension; add it so the file can be opened
    image_path = row['image_path']
    if not image_path.endswith("jpg"):
      image_path = image_path + ".jpg"
    image = Image.open(os.path.join(self.root_dir, image_path)).convert('RGB')

    data_sample = {'image': image, 'id': row['itemid']}
    if self.train:
      data_sample['category'] = row['Category']

    if self.transform:
      data_sample['image'] = self.transform(image)

    return data_sample

###Explore Train Data

In [0]:
train_labels = pd.read_csv('/content/gdrive/My Drive/Data/train.csv')
print(train_labels.shape)
display(train_labels.head())

In [0]:
category_count = train_labels.groupby('Category', as_index=False).agg({'itemid': 'count'}).rename(
    columns={'itemid': 'count'}).sort_values('count', ascending=False).reset_index(drop=False)
display(category_count.head())
print(category_count.shape)

In [0]:
#Display total number of data by category
sns.set_style('whitegrid')
plt.figure(figsize=(16, 8))
_ = sns.barplot(x=np.arange(category_count.shape[0]), y=category_count['count'].values, color='lightblue')
_ = plt.xticks(range(category_count.shape[0]), category_count['Category'].values, rotation=90)

In [0]:
category_count_asc = train_labels.groupby('Category', as_index=False).agg({'itemid': 'count'}).rename(
    columns={'itemid': 'count'}).sort_values('count', ascending=True).reset_index(drop=False)
display(category_count_asc.head(10))

###Under sampling of majority categories and oversampling of minority categories

In [0]:
# Resample every category to the same number of rows. Categories in
# OVERSAMPLED_CATEGORIES are drawn with replacement (oversample); all others are
# drawn without replacement (undersample).
SAMPLES_PER_CATEGORY = 400
OVERSAMPLED_CATEGORIES = {40, 50, 52, 54, 55, 56, 57}

balanced_parts = []
for category_id in range(58):
  resample = oversample if category_id in OVERSAMPLED_CATEGORIES else undersample
  balanced_parts.append(resample(category_id, SAMPLES_PER_CATEGORY, train_labels))

# Parts are concatenated in ascending category order (0..57).
train_balanced = pd.concat(balanced_parts)

In [0]:
print(train_balanced.shape)
display(train_balanced.head())

In [0]:
category_count = train_balanced.groupby('Category', as_index=False).agg({'itemid': 'count'}).rename(
    columns={'itemid': 'count'}).sort_values('count', ascending=False).reset_index(drop=False)
display(category_count.head())
print(category_count.shape)

In [0]:
sns.set_style('whitegrid')
plt.figure(figsize=(16, 8))
_ = sns.barplot(x=np.arange(category_count.shape[0]), y=category_count['count'].values, color='lightblue')
_ = plt.xticks(range(category_count.shape[0]), category_count['Category'].values, rotation=90)

###Data Preparation

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit

####Stratified sampling for train and validation set

In [0]:
sss = StratifiedShuffleSplit(n_splits=1, random_state=25, test_size=0.1)

In [0]:
# Renumber rows so positional and label indexing agree before splitting
train_balanced = train_balanced.reset_index(drop=True)

# n_splits=1, so the splitter yields exactly one (train, validation) index pair
train_index, valid_index = next(
    sss.split(X=train_balanced.index.values, y=train_balanced['Category'].values))

train_train_df = train_balanced.iloc[train_index].reset_index(drop=True)
train_valid_df = train_balanced.iloc[valid_index].reset_index(drop=True)

print(train_train_df.shape)
print(train_valid_df.shape)

In [0]:
plt.figure(figsize=(16, 6))
sns.countplot(x=train_train_df['Category'].values, order=range(58), color='lightblue')
_ = plt.xticks([])
_ = plt.title('train')

In [0]:
plt.figure(figsize=(16, 6))
# Pass the values explicitly as `x=`, as the training-set plot does; newer
# seaborn versions treat the first positional argument as `data`.
sns.countplot(x=train_valid_df['Category'].values, order=range(58), color='lightgreen')
_ = plt.xticks([])
_ = plt.title('validation')

In [0]:
train_train_df.to_csv('/content/gdrive/My Drive/Data/processed_data/train_train_dataset.csv', index=False)
train_valid_df.to_csv('/content/gdrive/My Drive/Data/processed_data/train_valid_dataset.csv', index=False)

###Train Data Preparation

In [0]:
# set the path and directory for train and validation
train_dataset_csv = '/content/gdrive/My Drive/Data/processed_data/train_train_dataset.csv'
val_dataset_csv = '/content/gdrive/My Drive/Data/processed_data/train_valid_dataset.csv'
train_root_dir = '/content'
test_root_dir = '/content'

In [0]:
model_name = 'resnet152'
input_size = 224

# Number of classes in the dataset
num_classes = 58

# Batch size for training
batch_size = 32

# Number of epochs to train for 
num_epochs = 10

# Flag for feature extracting. When False, we finetune the whole model, 
# when True we only update the reshaped layer params
# Here, it is set as True as we are making use of transfer learning
feature_extract = True

crop_size = input_size
# we use (scale_size: 256, crop_size: 224)
# for resize and cropping
scale_size = 256

In [0]:
# transforms settings
train_transform_simple = get_train_time_transform_simple(scale_size=scale_size, crop_size=crop_size)
# NOTE(review): the evaluation transform receives crop_size (224) as its
# scale_size, whereas the config cell states "(scale_size: 256, crop_size: 224)"
# — confirm whether scale_size=scale_size was intended.
test_transform = get_test_time_transform(scale_size=crop_size, crop_size=crop_size)

In [0]:
# datasets with transforms
train_dataset_simple = ProductCategoryDataset(
    csv_file=train_dataset_csv, root_dir=train_root_dir, transform=train_transform_simple)
val_dataset = ProductCategoryDataset(
    csv_file=val_dataset_csv, root_dir=train_root_dir, transform=test_transform)

In [0]:
# data loaders
train_data_loader_simple = DataLoader(
    train_dataset_simple, batch_size=batch_size, shuffle=True, num_workers=4)
val_data_loader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# data loaders in dict
data_loaders_simple = {
    'train': train_data_loader_simple,
    'val': val_data_loader
}

In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('you are using device: ', device)

###Initialize Model

In [0]:
# Initialize the model for this run
model_ft = initialize_model(
    model_name=model_name, num_classes=num_classes, feature_extract=feature_extract, 
    use_pretrained=True)

In [0]:
# Print the model we just instantiated
print(model_ft)

In [0]:
# Move the model onto the selected device
model_ft = model_ft.to(device)

# Decide which parameters the optimizer receives. When fine-tuning, that is
# every parameter of the model. With the feature-extract method only the
# parameters whose requires_grad is True (the freshly created layer) are
# collected into a list. Either way, the trainable parameter names are printed.
params_to_update = model_ft.parameters()
print("Params to learn:")
trainable_named_params = [
    (name, param) for name, param in model_ft.named_parameters() if param.requires_grad == True
]
if feature_extract:
    params_to_update = [param for _name, param in trainable_named_params]
for name, _param in trainable_named_params:
    print("\t",name)

In [0]:
# Define loss type
logloss_criterion = torch.nn.CrossEntropyLoss()

# Only parameters of final layer are being optimized as we are making use of transfer learning
adam_optimizer = torch.optim.Adam(params_to_update, lr=0.0005, weight_decay=0.0001)

###Training

In [0]:
# model training
trained_model, full_performance_history = train_model_visual(
    model=model_ft, device=device, dataloaders=data_loaders_simple, 
    criterion=logloss_criterion, optimizer=adam_optimizer, scheduler=None, 
    num_epochs=num_epochs, in_notebook=True)

In [0]:
# Save the trained model
torch.save(trained_model.state_dict(), '/content/gdrive/My Drive/Data/model/resnet152_fe_adam_10epoch_simple_0.0005.pt')

In [0]:
display(full_performance_history.head())

###Prepare Model for Prediction

In [0]:
# Initialize the model for this run
model = initialize_model(
    model_name=model_name, num_classes=num_classes, feature_extract=feature_extract, 
    use_pretrained=True)

In [0]:
# The prediction model is never moved off the CPU in this section, so map the
# saved tensors to the CPU while loading; this also lets the checkpoint be read
# on a machine without a GPU.
model.load_state_dict(torch.load(
    '/content/gdrive/My Drive/Data/model/resnet152_fe_adam_10epoch_simple_0.0005.pt',
    map_location='cpu'))
# Switch to inference mode.
model.eval()

###Explore sample submission

In [0]:
sample_submission = pd.read_csv('/content/gdrive/My Drive/Data/data_info_val_sample_submission.csv')

print(sample_submission.shape)
display(sample_submission.head())

###Explore Test Data Set

In [0]:
test_labels = pd.read_csv('/content/gdrive/My Drive/Data/test.csv')
print(test_labels.shape)
display(test_labels.head())

In [0]:
test_samples = test_labels.sample(16)
display(test_samples.head())

###Prepare Test Data Set

In [0]:
# Preprocessing applied to each test image before prediction: resize, convert to
# a tensor, normalize per channel.
# NOTE(review): unlike get_test_time_transform there is no CenterCrop step here,
# so test images are not cropped to a fixed square — confirm this is intended.
min_img_size = 224  # The min size, as noted in the PyTorch pretrained models doc, is 224 px.
transform_pipeline = transforms.Compose([transforms.Resize(min_img_size),
                                         transforms.ToTensor(),
                                         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                              std=[0.229, 0.224, 0.225])])

###Predict

In [0]:
#Reset results to accumulate predictions
results = pd.DataFrame(columns = ["itemid", "Category"])

In [0]:
# Predict a category for every test image and collect the rows in a list; the
# frame is built once at the end instead of being re-created on every append.
prediction_rows = []
with torch.no_grad():  # inference only: no gradients needed
  for i in range(0, len(test_labels.index)):
    image_path = test_labels['image_path'][i]
    if(image_path[-3:] != "jpg"):
        image_path += ".jpg"
    # Force 3 channels, as the training Dataset does; grayscale or RGBA files
    # would otherwise not match the 3-channel Normalize step.
    img = Image.open('/content/' + image_path).convert('RGB')
    img = transform_pipeline(img)
    img = img.unsqueeze(0)  # Insert the new axis at index 0 i.e. in front of the other axes/dims.
    prediction = model(img)  # Returns a Tensor of shape (batch, num class labels)
    prediction = prediction.numpy().argmax()  # Our prediction will be the index of the class label with the largest value.
    prediction_rows.append({'itemid': test_labels['itemid'][i], 'Category': prediction})

results = pd.concat(
    [results, pd.DataFrame(prediction_rows, columns=["itemid", "Category"])],
    ignore_index=True)

In [0]:
# Write the visual model's predictions (read back by the combined text/visual section).
results.to_csv('/content/gdrive/My Drive/Data/predictions.csv', index = False)

#Text with Visual Model

In [0]:
# Reset results to accumulate predictions
results = pd.DataFrame(columns = ["itemid", "Category"])

In [0]:
import re
from nltk.tokenize.toktok import ToktokTokenizer
# Fetch the NLTK stopword corpus and create the tokenizer used by remove_stopwords.
nltk.download('stopwords')
tokenizer = ToktokTokenizer()

# English stopword list with 'no' and 'not' taken out, so those two words are
# not filtered from the text.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

# Porter stemming of every whitespace-separated word
def simple_stemmer(text):
    """Return `text` with each whitespace-delimited word replaced by its Porter stem.

    The stemmed words are joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)
  
def remove_stopwords(text, is_lower_case=False):
    """Tokenize `text`, drop tokens found in the global `stopword_list`, re-join with spaces.

    Args:
        text: string to filter.
        is_lower_case: when true, tokens are compared to the stopword list as
            they are; otherwise each token is lower-cased for the comparison
            only (the kept token retains its original case).

    Returns:
        The surviving tokens joined by single spaces.
    """
    stripped_tokens = [tok.strip() for tok in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [tok for tok in stripped_tokens if tok not in stopword_list]
    else:
        kept = [tok for tok in stripped_tokens if tok.lower() not in stopword_list]
    return ' '.join(kept)

# strip everything except letters, whitespace and (optionally) digits
def remove_special_characters(text, remove_digits=False):
    """Delete every character of `text` that is not an ASCII letter or whitespace.

    Args:
        text: string to clean.
        remove_digits: when false (default) the digits 0-9 are kept as well;
            when true they are removed too.

    Returns:
        The cleaned string. Removed characters are deleted, not replaced by
        spaces, so existing whitespace is left untouched.
    """
    # The upper bound must be 'Z': the range 'A-z' also covers the ASCII
    # characters between 'Z' and 'a' ( [ \ ] ^ _ ` ), which would then survive.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def normalize_corpus(corpus):
    """Normalize every document in `corpus` and return the results as a list.

    Per document, in this order: lower-case; replace runs of CR / LF (and,
    because of the character class used, '|') with one space; stem each word
    with simple_stemmer; pad the characters { } . ( ) ! with spaces; strip
    everything but letters and whitespace via
    remove_special_characters(remove_digits=True); squeeze repeated spaces;
    drop stopwords via remove_stopwords.
    """
    newline_run = re.compile(r'[\r|\n|\r\n]+')
    # Inside the class, "(-)" is a range from '(' to ')', so only
    # { } . ( ) ! are matched — a literal '-' is not.
    isolate_chars = re.compile(r'([{.(-)!}])')
    space_run = re.compile(' +')

    def _normalize(doc):
        doc = newline_run.sub(' ', doc.lower())
        doc = simple_stemmer(doc)
        doc = isolate_chars.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits=True)
        doc = space_run.sub(' ', doc)
        return remove_stopwords(doc, is_lower_case=True)

    return [_normalize(doc) for doc in corpus]
  
# NOTE(review): the same two assignments appear in the text-model section's
# preprocessing cell. If that cell has already run in this kernel, the titles
# are normalized (stemmed, filtered) a second time here — confirm this is intended.
trainDF['title']=normalize_corpus(trainDF['title'])
test['title']=normalize_corpus(test['title'])

In [0]:
#split into 3 broad groups by Category id: <=16 beauty, 17..30 fashion, >=31 mobile
category_ids = trainDF["Category"]
beauty = trainDF[category_ids <= 16]
fashion = trainDF[(category_ids >= 17) & (category_ids <= 30)]
mobile = trainDF[category_ids >= 31]

# sanity check: the three groups together account for every training row
print((len(beauty.index) + len(fashion.index) + len(mobile.index)) == len(trainDF.index))

In [0]:
beauty_x=beauty['title']
beauty_y=beauty['Category']

fashion_x=fashion['title']
fashion_y=fashion['Category']

mobile_x=mobile['title']
mobile_y=mobile['Category']

In [0]:
# word level tf-idf for beauty, fashion and mobile

def _fit_title_tfidf(titles):
    """Fit a word-level TF-IDF vectorizer (max 5000 terms) on `titles`; return (vectorizer, matrix)."""
    vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    vectorizer.fit(titles)
    return vectorizer, vectorizer.transform(titles)

#beauty
beauty_tfidf_vect, beauty_tfidf = _fit_title_tfidf(beauty_x)

#fashion
fashion_tfidf_vect, fashion_tfidf = _fit_title_tfidf(fashion_x)

#mobile
mobile_tfidf_vect, mobile_tfidf = _fit_title_tfidf(mobile_x)

In [0]:
#fit a classifier and hand back whatever its fit() returns
def train_model(classifier, feature_vector_train, label):
    """Fit `classifier` on (feature_vector_train, label) and return the result of fit()."""
    return classifier.fit(feature_vector_train, label)

def predict(model,title):
    """Return `model.predict(title)` unchanged."""
    return model.predict(title)

In [0]:
#train model for 3 cat:
beauty_model=train_model(linear_model.LogisticRegression(),beauty_tfidf , beauty_y)
fashion_model=train_model(linear_model.LogisticRegression(),fashion_tfidf, fashion_y)
mobile_model=train_model(linear_model.LogisticRegression(),mobile_tfidf, mobile_y)

In [0]:
visual_predictions = pd.read_csv('/content/gdrive/My Drive/Data/predictions.csv')

def _text_model_for(broad_category):
  """Return the (vectorizer, classifier) pair for the broad group a visual prediction falls in."""
  if broad_category <= 16:
    return beauty_tfidf_vect, beauty_model      # beauty
  if 17 <= broad_category <= 30:
    return fashion_tfidf_vect, fashion_model    # fashion
  return mobile_tfidf_vect, mobile_model        # mobile

# Row i of visual_predictions is paired with row i of `test`: the visual
# prediction only selects which group's text model classifies the title.
combined_rows = []
for i in range(0, len(visual_predictions.index)):
  vectorizer, classifier = _text_model_for(visual_predictions["Category"][i])
  title = test["title"][i]
  # predict() returns a 1-element array; take the element before converting
  prediction = int(predict(classifier, vectorizer.transform([title]))[0])
  combined_rows.append({'itemid': test['itemid'][i], 'Category': prediction})

# Build the frame once instead of appending to it row by row.
results = pd.concat(
    [results, pd.DataFrame(combined_rows, columns=["itemid", "Category"])],
    ignore_index=True)

In [0]:
# Write the combined text/visual predictions.
# NOTE(review): absolute local Windows path — confirm the intended output location.
results.to_csv('C:/Users/alyta/Downloads/year 2/year 2 sem 2/NDSC/dataset/predictions_new.csv', index = False)