In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
import re
# Fetch the NLTK 'punkt' and 'stopwords' packages (a no-op re-check when they are already installed)
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import datetime
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy import sparse
import pickle
from nltk import ngrams

[nltk_data] Downloading package punkt to C:\Users\Tejas
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Tejas
[nltk_data]     Chavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<p style='font-size:20px'><b> Featurization </b></p>
<p>

This notebook applies the same featurization steps that were used on the earlier 1.5L-point dataset to the larger 5L dataset.  
 
</p>

In [2]:
# Read the pre-processed train, CV and test splits that are going to be featurized
split_csv_paths = (
    'dataset 5L/processed_train.csv',
    'dataset 5L/processed_cv.csv',
    'dataset 5L/processed_test.csv',
)
train_data, cv_data, test_data = map(pd.read_csv, split_csv_paths)

In [3]:
# Word-level TF-IDF features for the Title3_Body column (1- to 3-grams, capped at 200,000 features).
# The vectorizer is fitted on the train split only; the CV and test splits are transformed with
# the vocabulary and IDF weights learned from train.
# max_features (or alternatively min_df, e.g. 0.00009) is the hyperparameter to tune here.


def split_on_space(text):
    """Tokenize a document by splitting its string form on single spaces.

    Behaves exactly like ``lambda x: str(x).split(" ")`` (consecutive spaces
    therefore still yield empty-string tokens). It is a named module-level
    function rather than a lambda so that the fitted vectorizer stays
    picklable: pickle cannot serialize lambda functions.
    """
    return str(text).split(" ")


start_time = datetime.datetime.now()

tf_idf_vect = TfidfVectorizer(max_features=200000, smooth_idf=True, norm="l2",
                              tokenizer=split_on_space, sublinear_tf=False, ngram_range=(1, 3))

train_title3_body = tf_idf_vect.fit_transform(train_data.Title3_Body)
cv_title3_body = tf_idf_vect.transform(cv_data.Title3_Body)
test_title3_body = tf_idf_vect.transform(test_data.Title3_Body)

print("Block execution time: ", datetime.datetime.now() - start_time)



Block execution time:  0:03:05.183698


In [4]:
# Character-level TF-IDF features for the Code column (1- to 3-character n-grams,
# capped at 50,000 features). Fitted on the train split only, then applied to CV and test.

start_time = datetime.datetime.now()

tf_idf_vect2 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    max_features=50000,
    norm="l2",
    smooth_idf=True,
    sublinear_tf=False,
)

train_code = tf_idf_vect2.fit_transform(train_data.Code)
cv_code, test_code = (tf_idf_vect2.transform(split.Code) for split in (cv_data, test_data))

elapsed = datetime.datetime.now() - start_time
print("Block execution time: ", elapsed)

Block execution time:  0:03:38.097581


In [5]:
# Concatenate the word-level (Title3_Body) and character-level (Code) TF-IDF matrices column-wise
# into a single feature matrix per split.
# format="csr" pins the result format: when it is omitted SciPy chooses the format itself
# (COO on older releases), and COO matrices cannot be row-sliced.
train_x = sparse.hstack((train_title3_body, train_code), format="csr")
cv_x = sparse.hstack((cv_title3_body, cv_code), format="csr")
test_x = sparse.hstack((test_title3_body, test_code), format="csr")

In [6]:
# Sanity check: show (rows, features) of the stacked TF-IDF matrix for train, CV and test

for stacked_features in (train_x, cv_x, test_x):
    print(stacked_features.shape)


(315915, 250000)
(78979, 250000)
(80883, 250000)


In [7]:
# Convert the shortlisted tags (y-label) into a sparse label matrix with one column per tag.
# CountVectorizer is used because it returns a sparse matrix, which saves space; each entry is
# the number of times the tag occurs in the row (0/1 as long as a tag is not repeated in a row).
# The tag vocabulary is learned from the train split only and reused for CV and test.
# NOTE(review): str() turns a missing value into the token "nan" — presumably every row has at
# least one shortlisted tag; confirm against the preprocessing step.


def split_tags(tags):
    """Split a space-separated tag string into the list of individual tags.

    Behaves exactly like ``lambda x: str(x).split(" ")``. It is a named
    module-level function rather than a lambda so that the fitted
    CountVectorizer stays picklable: pickle cannot serialize lambda functions.
    """
    return str(tags).split(" ")


start_time = datetime.datetime.now()

count_vect = CountVectorizer(analyzer=split_tags)
train_y = count_vect.fit_transform(train_data.Tags_shortlist)
cv_y = count_vect.transform(cv_data.Tags_shortlist)
test_y = count_vect.transform(test_data.Tags_shortlist)

print("Block execution time: ", datetime.datetime.now() - start_time)

Block execution time:  0:00:01.388831


In [8]:
# Persist the train/CV/test inputs and outputs so they can be used for building models
#### Imp: Careful not to overwrite data ####

var = [train_x, train_y, cv_x, cv_y, test_x, test_y]
file_names = ['train_x', 'train_y', 'cv_x', 'cv_y', 'test_x', 'test_y']

for matrix_name, matrix in zip(file_names, var):
    # Each sparse matrix is written to "dataset 5L/<name>.npz"
    sparse.save_npz("dataset 5L/" + matrix_name + ".npz", matrix)


In [9]:
# Saving the vocab (term -> column index mapping) captured while fitting CountVectorizer
#### Imp: Careful not to overwrite data ####

# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving it open until it happens to be garbage-collected
with open("models 5L/vocab_5L.pickle", "wb") as vocab_file:
    pickle.dump(count_vect.vocabulary_, vocab_file)
