In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import datetime
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy import sparse
import pickle
from nltk import ngrams

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


<p style='font-size:20px'><b> Featurization </b></p>
<p>

So far we have the following data format for the input (X) & output (Y) for training the model:

- Input (X): Single string containing space separated tokens generated after processing the raw information in Title & Body.
- Output (Y): Single string containing space separated tags after shortlisting the top 500 most frequent tags.
    
This still needs to be encoded as numerical features before an ML model can be applied. To do so, we will use the following featurization techniques:
    
A) Input (X): Tf-Idf transformation: 

- For each datapoint, a feature value is computed for each of its tokens.
- Each token is assigned the value: (term frequency) x (inverse document frequency)
    - term frequency: Number of times the token occurs in the datapoint
    - inverse document frequency: A (log-scaled) inverse of the number of datapoints containing the token, so tokens that appear in many datapoints receive a lower weight.
- The tokenizer for title3_body & code will be different:
    - title3_body: word ngrams with a range of (1, 3) words
    - code: char ngrams with a range of (1, 3) characters
        
B) Output (Y): One-hot encoding:

- We will create 500 binary target vectors, one per target tag, each indicating whether that tag is present in a given datapoint
- It will then have a shape of (n x m) where:
    - n: Number of datapoints in train/test data
    - m: Number of target tags to be predicted(=500)

This will allow us to train models for this task of multi-label classification of predicting all the relevant tags that could be present in the data.            
    
</p>

In [2]:
# Read the processed train, CV & test splits that will be featurized below.
# The paths are listed in the same order as the names they are unpacked into.
split_paths = (
    'dataset/processed_train.csv',
    'dataset/processed_cv.csv',
    'dataset/processed_test.csv',
)
train_data, cv_data, test_data = map(pd.read_csv, split_paths)

In [3]:
# TF-IDF featurization of the Title3_Body column for the train, cv & test splits


def split_on_space(document):
    """Tokenize a document by splitting its string form on single spaces."""
    return str(document).split(" ")


start_time = datetime.datetime.now()

# Word n-grams (1 to 3 tokens), vocabulary capped at 150000 features.
tf_idf_vect = TfidfVectorizer(
    tokenizer=split_on_space,
    ngram_range=(1, 3),
    max_features=150000,
    norm="l2",
    smooth_idf=True,
    sublinear_tf=False,
)

# The vectorizer is fitted on the training split only; cv & test are
# transformed with the vocabulary and IDF weights learned from train.
train_title3_body = tf_idf_vect.fit_transform(train_data.Title3_Body)
cv_title3_body, test_title3_body = (
    tf_idf_vect.transform(split_frame.Title3_Body)
    for split_frame in (cv_data, test_data)
)

print("Block execution time: ", datetime.datetime.now() - start_time)

Block execution time:  0:00:44.226823


In [4]:
# TF-IDF featurization of the Code column for the train, cv & test splits

start_time = datetime.datetime.now()

# Character n-grams (1 to 3 characters), vocabulary capped at 50000 features.
tf_idf_vect2 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    max_features=50000,
    norm="l2",
    smooth_idf=True,
    sublinear_tf=False,
)

# Fit on the training split only, then reuse the fitted vectorizer for cv & test.
train_code = tf_idf_vect2.fit_transform(train_data.Code)
cv_code, test_code = (
    tf_idf_vect2.transform(split_frame.Code)
    for split_frame in (cv_data, test_data)
)

print("Block execution time: ", datetime.datetime.now() - start_time)

Block execution time:  0:00:59.810566


In [5]:
# Merging the Tf-Idf vectorized forms of title3_body & code columns (column-wise) to build the final input X data.
# format="csr" is requested explicitly: when no format is given, the format returned by sparse.hstack
# depends on the inputs and the scipy version and can be COO, which does not support row indexing/slicing.
# CSR keeps the merged matrices directly usable for row-wise access (e.g. sampling or batching datapoints).

train_x = sparse.hstack((train_title3_body, train_code), format="csr")
cv_x = sparse.hstack((cv_title3_body, cv_code), format="csr")
test_x = sparse.hstack((test_title3_body, test_code), format="csr")

In [6]:
# Print (n_datapoints, n_features) for the train, cv & test input matrices;
# the feature count (second value) should be identical across the three splits.

for feature_matrix in (train_x, cv_x, test_x):
    print(feature_matrix.shape)


(100000, 200000)
(25000, 200000)
(25000, 200000)


In [7]:
# Convert shortlisted tags (y-label) into a binary indicator (one-hot style) encoding to build the final output Y data
# CountVectorizer is used for its sparse matrix representation, which optimizes space


def split_tags(tag_string):
    """Return the list of tags contained in a space separated tag string.

    Non-string values (e.g. the NaN that pandas.read_csv produces for an empty
    cell by default) yield no tags, so such a datapoint gets an all-zero target
    row instead of a spurious "nan" label.
    """
    if not isinstance(tag_string, str):
        return []
    # split() without an argument drops the empty tokens that split(" ")
    # would emit for leading, trailing or repeated spaces.
    return tag_string.split()


start_time = datetime.datetime.now()

# binary=True caps every entry at 1, so a tag repeated within a datapoint
# still produces a 0/1 indicator rather than a count.
count_vect = CountVectorizer(analyzer=split_tags, binary=True)

# The tag vocabulary is learned from the training split only; cv & test are
# encoded against that same vocabulary.
train_y = count_vect.fit_transform(train_data.Tags_shortlist)
cv_y = count_vect.transform(cv_data.Tags_shortlist)
test_y = count_vect.transform(test_data.Tags_shortlist)

print("Block execution time: ", datetime.datetime.now() - start_time)

Block execution time:  0:00:00.529947


In [8]:
# Persist the train, cv & test inputs (x) and outputs (y) so the modelling step can load them
#### Imp: Careful not to overwrite data ####

# file_names and var are kept in the same order so each matrix is paired with its own file name.
file_names = ['train_x', 'train_y', 'cv_x', 'cv_y', 'test_x', 'test_y']
var = [train_x, train_y, cv_x, cv_y, test_x, test_y]

for file_name, matrix in zip(file_names, var):
    # Each sparse matrix is written to dataset/<file_name>.npz
    sparse.save_npz("dataset/" + file_name + ".npz", matrix)


In [9]:
# Saving the vocab captured while fitting CountVectorizer for output tags
#### Imp: Careful not to overwrite data ####

# The context manager closes the file handle once the dump finishes (or fails),
# instead of leaving an open handle behind in the long-lived notebook kernel.
with open("models/vocab.pickle", "wb") as vocab_file:
    pickle.dump(count_vect.vocabulary_, vocab_file)
