In [3]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from sklearn.cross_decomposition import PLSRegression

In [4]:
#np.load("train_des.npy")

In [5]:
#np.load("test_des.npy")

In [6]:
num_train_file = 10000
num_test_file = 2000

In [7]:
def strip_punctuation(data):
    for i in range(len(data)):
        data[i]=''.join([letter for letter in data[i] if letter not in punctuation])
    return data

def lowercase(data):
    return [x.lower() for x in data]

stop_words = set(stopwords.words('english'))
def strip_stop_words(data):
    for i in range(len(data)):
        word_tokens = word_tokenize(data[i])
        filtered_sentence = []
        for word in word_tokens:
            if word not in stop_words:
                filtered_sentence.append(word)
        data[i] = " ".join(filtered_sentence)
    return data

def lemmatizer(data):
    lemmatizer = WordNetLemmatizer()
    for i in range(len(data)):
        word_list = data[i].split(' ')
        temp = []
        for word in word_list:
            temp.append(lemmatizer.lemmatize(word, 'v'))
        data[i] = " ".join(temp)
    return data

In [8]:
def generate_desc_dict(path, num_file):
    
    desc_dict = {}
    for i in range(num_file):
        filename = path + str(i) +'.txt'
        file_number = (filename.split('/')[-1]).split('.')[0]
        
        # Read files.
        file = open(filename, "r")
        contents = [line.rstrip("\n") for line in file]
        
        # Lowercase all of the words.
        contents = lowercase(contents)
        
        # Strip punctuation
        contents = strip_punctuation(contents)
        
        # Strip the stop words
        contents = strip_stop_words(contents)
        
        # Lemmatization of all the words
        contents = lemmatizer(contents)
        
        desc_dict[file_number] = contents
    
    return desc_dict

# Generate train/test description dictionaries
train_desc_dict = generate_desc_dict('./descriptions_train/', num_train_file)
test_desc_dict = generate_desc_dict('./descriptions_test/', num_test_file)

In [9]:
def generate_unique_indexed_desc_dict(desc_dict):
    
    indexed_desc_dict = {}
    desc_list = []
    for file_num, descriptions in desc_dict.items():
        for desc in descriptions:
            words = desc.split()
            for word in words:
                desc_list.append(word)
                
    desc_list = list(set(desc_list))
    
    index = 0
    for desc in desc_list:
        indexed_desc_dict[desc] = index
        index += 1
        
    return indexed_desc_dict

In [10]:
indexed_desc_dict = generate_unique_indexed_desc_dict(train_desc_dict)
print(len(indexed_desc_dict))

7658


In [11]:
def generate_desc_vectors(desc_dict, num_file):
    desc_vector = np.zeros((num_file, len(indexed_desc_dict.keys())))
    
    for file_num, descs in desc_dict.items():
        for desc in descs:
            words = desc.split()
            for word in words:                
                if word in indexed_desc_dict.keys():
                    desc_vector[int(file_num), int(indexed_desc_dict[word])] += 1
                
    return pd.DataFrame(desc_vector, columns = indexed_desc_dict.keys())

In [12]:
train_desc_vectors = generate_desc_vectors(train_desc_dict, num_train_file)
test_desc_vectors = generate_desc_vectors(test_desc_dict, num_test_file)
print(train_desc_vectors.shape)

(10000, 7658)


In [13]:
def generate_tag_dict(file_path, num_file):
    
    tags_dict = {}
    sub_category = []
    for i in range(num_file):
        filename = file_path + str(i) +'.txt'
        file_number = (filename.split('/')[-1]).split('.')[0]
        
        # Read files.
        file = open(filename, "r")
        tags = [line.rstrip("\n") for line in file]
        
        temp = []
        for tag in tags:
            tag = tag.split(':')
            temp.append(tag[1])
        tags_dict[file_number] = temp
            
    return tags_dict

In [14]:
train_tags_dict = generate_tag_dict('./tags_train/', num_train_file)
test_tags_dict = generate_tag_dict('./tags_test/', num_test_file)

In [15]:
def generate_unique_indexed_tag_dict(tag_dict):
    
    indexed_tag_dict = {}
    tag_list = []
    for tags in tag_dict.values():
        for tag in tags:
            tag_list.append(tag)
                
    tag_list = list(set(tag_list))
    
    index = 0
    for tag in tag_list:
        indexed_tag_dict[tag] = index
        index += 1
        
    return indexed_tag_dict

In [16]:
indexed_tag_dict = generate_unique_indexed_tag_dict(train_tags_dict)
print(len(indexed_tag_dict))

80


In [22]:
def generate_tags_vector(tags_dict, num_file):
    tag_vector = np.zeros((num_file, len(indexed_tag_dict.keys())))
    
    for file_num, tags in tags_dict.items():
        for tag in tags:
            if tag in indexed_tag_dict.keys():
                tag_vector[int(file_num), int(indexed_tag_dict[tag])] += 1
                
    return pd.DataFrame(tag_vector, columns = indexed_tag_dict.keys())

In [23]:
train_tag_vectors = generate_tags_vector(train_tags_dict, num_train_file)
test_tag_vectors = generate_tags_vector(test_tags_dict, num_test_file)
print(test_tag_vectors.shape)

(2000, 80)


In [24]:
from sklearn import preprocessing
train_normalized_desc_vector = train_desc_vectors.div(train_desc_vectors.sum(axis=1), axis=0)
test_normalized_desc_vector = test_desc_vectors.div(test_desc_vectors.sum(axis=1), axis=0)

In [26]:
print(train_normalized_desc_vector)
print(test_normalized_desc_vector)

      plushes  macbook  angrily  flex  structure  pistol  minute  household  \
0         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
1         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
2         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
3         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
4         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
5         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
6         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
7         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
8         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
9         0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
10        0.0      0.0      0.0   0.0   0.000000     0.0     0.0        0.0   
11        0.0      0.0      0.0   0.0   0.000000    

In [29]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC

def SVM(train, trainLabel, test):
    classifier = OneVsRestClassifier(LinearSVC(random_state=0))
    classifier.fit(train, trainLabel)
    predictions = classifier.predict(test)
    
    return predictions

# Perform SVM
test_predic = SVM(train_normalized_desc_vector.values, train_tag_vectors.values, test_normalized_desc_vector.values)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [32]:
print(test_predic[1])


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [30]:
from sklearn.neighbors import NearestNeighbors

def kNN(tag_vectors, k, predic):
    neigh = NearestNeighbors(n_neighbors= k , algorithm='kd_tree').fit(tag_vectors)
    dist, idx = neigh.kneighbors(predic)
    return dist, idx

# Get top 20
dist, predic_idx = kNN(test_tag_vectors, 20, test_predic)

In [31]:
print(predic_idx)
print(predic_idx.shape)

[[1743  292 1231 ...  949  250  594]
 [ 833  589 1529 ...  784  948  758]
 [ 445  649 1866 ... 1030 1033  953]
 ...
 [ 104  199 1312 ...  731  152  429]
 [1218 1660 1535 ...   57   63   33]
 [ 926 1430  653 ...  346  405  976]]
(2000, 20)


In [None]:
import csv

# Create top 20 dictionary
result_dic = {}
for i in range(2000):
    desc_id = str(i) + '.txt'
    temp = []
    images = predic_idx[i]
    for image in images:
        temp.append(str(image) + '.jpg')
    top_20 = " ".join(temp)
    result_dic[desc_id] = top_20

# Write csv file using the dictionary above
w = csv.writer(open("./submission/submission_svm.csv", "w", newline=''))
w.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
for key, val in result_dic.items():
    w.writerow([key, val])