In [1]:
import pandas as pd
import sys
import pickle
from fasttext import load_model
import gensim
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import collections
import numpy as np

In [4]:
# Load the IAB news CSV (the later cells read its 'Text' and 'Category' columns).
# Forward slashes keep the relative path valid on Windows as well as on POSIX
# systems, where a backslash is an ordinary file-name character.
data = pd.read_csv("../Dataset3/IAB_News_tokenized.csv")

In [5]:
# Number of rows in the loaded frame.
data.shape[0]

280000

In [8]:
# Remove pandas' column-width cap (a global display option).
pd.set_option('display.max_colwidth', None)

# Print the first five texts with leading/trailing whitespace removed.
for article in data['Text'].iloc[:5]:
    print(article.strip())

new delhi indian men hockey team medal less rudder less commonwealth game performance bring chief coach sjoerd marijne scanner hockey india schedule performance review meeting later week also read cwg indian men hockey team lose england bronze play offit learn senior player team lead skipper manpreet singh pr sreejesh rupinder pal singh meet top hi official tuesday present explanation performance first time year since india fail win single medal hockey cwg go well national federation say win shy away take tough decision review meeting accord hockey india official player coach performance also scrutinise corrective measure take need three important tournament champion trophy asian game season end world cup line year performance gold coast definitely acceptable par say least provide facility team fail deliver big event hi official say one medal favourites look perform low ranked team like wale pakistan new zealand england india particularly poor draw pakistan concede equaliser seven seco

In [5]:
# Index labels of the rows that contain at least one missing value
# (an empty Index means the frame has no nulls).
rows_with_missing = data.isna().any(axis="columns")
data.loc[rows_with_missing].index

Index([], dtype='int64')

In [6]:
def find_unique_words(strings):
    """Return the set of distinct whitespace-separated tokens across `strings`.

    Each text is split on its own and folded into one running set, so the
    corpus is never concatenated into a single huge string nor expanded into
    one list holding every token.

    Parameters
    ----------
    strings : iterable of str
        Texts to scan (for example a pandas Series of documents).

    Returns
    -------
    set of str
        Every distinct token produced by ``str.split()``, i.e. splitting on
        runs of whitespace. Empty input yields an empty set.

    Raises
    ------
    TypeError
        If an element of `strings` is not a ``str``.
    """
    unique_words = set()
    for text in strings:
        # Calling str.split through the class makes a non-string element
        # (e.g. a NaN float) fail with TypeError instead of being duck-typed.
        unique_words.update(str.split(text))
    return unique_words

# Collect the distinct tokens of the 'Text' column and report how many there are.
unique_words = find_unique_words(data["Text"])
vocabulary_size = len(unique_words)
print(vocabulary_size)

483567


In [7]:
# Load the pre-trained word vectors from the word2vec-format .vec file.
# Forward slashes keep the relative path valid on Windows as well as on POSIX
# systems, where a backslash is an ordinary file-name character.
vectorizer = gensim.models.KeyedVectors.load_word2vec_format('../Vectorizer/FastText-300d-1M.vec')

In [8]:
# Build the Keras word -> integer-id vocabulary from every text in `data`.
# NOTE(review): this is fitted on the full dataset, before the train/val/test
# split performed in the later cells, so the vocabulary also contains words
# that occur only in validation/test texts - confirm this is intended.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(data.loc[:, 'Text'])

In [9]:
# + 1 to account for the padding token, which gets a row of its own.
num_tokens = len(tokenizer.word_index) + 1

# Take the embedding width from the loaded vectors instead of hard-coding it,
# so the matrix always matches whatever vector file was loaded.
embedding_dim = vectorizer.vector_size

# Matrix of zeroes of size: vocabulary x embedding dimension.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

# Fill in the pre-trained vector of every vocabulary word the vectorizer
# knows; rows of words it does not know are left all-zero.
for word, i in tokenizer.word_index.items():
    if vectorizer.has_index_for(word):
        # Row assignment already copies the values into the matrix, so no
        # explicit .copy() of the looked-up vector is needed.
        embedding_matrix[i] = vectorizer[word]

In [10]:
# Sanity check: show the first 50 components of the row stored for 'great'.
great_row = tokenizer.word_index['great']
print(embedding_matrix[great_row][:50])

[-0.0129 -0.0311  0.0133  0.0051 -0.0395 -0.0044 -0.0218 -0.0483  0.021
  0.0186 -0.0313  0.0012  0.0194 -0.0124  0.0116 -0.0149  0.0489  0.0029
  0.0437 -0.0069 -0.0129  0.0165 -0.0162  0.0322  0.0181 -0.01    0.0173
 -0.0312  0.0552 -0.0006 -0.0004 -0.0177  0.0048 -0.0616  0.0065 -0.0015
  0.0203 -0.0142 -0.0047  0.0054  0.0096  0.0071 -0.0081 -0.0085 -0.0088
  0.0129  0.0017 -0.0259  0.0174  0.0354]


In [11]:
# Persist the embedding matrix as a pickle.
# Forward slashes keep the relative path valid on Windows as well as on POSIX
# systems, where a backslash is an ordinary file-name character.
with open('../Dataset3/embeddingMatrixDS3.pkl', 'wb') as f:
    pickle.dump(embedding_matrix, f)

In [12]:
# Hold out 15% of the rows as the test set; the seed makes the split repeatable.
# Features are the 'Text' Series, labels the 'Category' column as a NumPy array.
X_train, X_test, y_train, y_test = train_test_split(
    data['Text'],
    np.array(data['Category']),
    test_size=0.15,
    random_state=42,
)

In [13]:
# Carve the validation set out of the training portion: 20% of what is left
# after the test split. X_train / y_train are rebound to the smaller training
# part, so re-running this cell on its own would shrink them again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
)

In [14]:
# Per-category label counts for the train, validation and test splits, in that order.
for split_labels in (y_train, y_val, y_test):
    print(collections.Counter(split_labels))

Counter({'food and drinks': 19189, 'real estate': 19149, 'family and relationships': 19073, 'news and politics': 19041, 'sports': 19031, 'style and fashion': 19003, 'arts and culture': 19002, 'hobbies and interests': 18981, 'business and finance': 18977, 'healthy living': 18954})
Counter({'news and politics': 4888, 'style and fashion': 4799, 'hobbies and interests': 4795, 'sports': 4761, 'arts and culture': 4755, 'family and relationships': 4752, 'business and finance': 4737, 'healthy living': 4717, 'real estate': 4710, 'food and drinks': 4686})
Counter({'healthy living': 4329, 'business and finance': 4286, 'arts and culture': 4243, 'hobbies and interests': 4224, 'sports': 4208, 'style and fashion': 4198, 'family and relationships': 4175, 'real estate': 4141, 'food and drinks': 4125, 'news and politics': 4071})


In [15]:
# Encode each split's texts as lists of integer word ids using the fitted tokenizer.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_val, X_test)
)

In [16]:
# Peek at the first encoded validation text (a list of integer word ids).
first_val_sequence = X_val_seq[0]
print(first_val_sequence)

[13, 67, 67, 143, 52, 2668, 2651, 413, 1147, 195, 6, 331, 330, 542, 6992, 20506, 1555, 38, 104, 4491, 20, 5362, 360, 1149, 6089, 8, 218, 10024, 298, 1685, 1472, 2367, 1839, 3124, 858, 10727, 203, 106918, 721, 517, 2071, 50141, 183, 491, 6992, 3495, 711, 858, 203, 3124, 442, 79, 47, 1473, 1839, 21, 3124, 9, 850, 149, 6992, 20506, 2755, 2651, 1, 357, 7946, 2, 1, 6992, 20506, 885, 850, 151, 9, 50536, 297, 1547, 17994, 381, 1190, 6851, 228, 6992, 20506, 3853, 16, 90, 309, 2071, 101, 907, 7, 896, 54, 1147, 195, 330, 542, 6992, 20506, 1]


In [17]:
# Map the first five ids of that validation sequence back to their words.
leading_ids = X_val_seq[0][:5]
[tokenizer.index_word[token_id] for token_id in leading_ids]

['new', 'delhi', 'delhi', 'chief', 'minister']

In [18]:
# Decode the first validation sequence back to text and show its first 300 characters.
decoded_first_val = tokenizer.sequences_to_texts([X_val_seq[0]])[0]
decoded_first_val[:300]

'new delhi delhi chief minister arvind kejriwal monday urge centre take step ensure security kashmiri pandits wake last week killing government servant community rahul bhat get job clerk special employment package migrant gun terrorist inside tehsil office chadoora town central kashmir budgam distric'

In [19]:
# Fixed length (in tokens) every encoded text is brought to.
MAX_NEWS_LEN = 500


def _pad_to_max_news_len(sequences):
    """Pad `sequences` at the end so each one has length MAX_NEWS_LEN.

    NOTE(review): `truncating` is left at the Keras default, which the Keras
    docs give as 'pre' - i.e. over-long texts would keep their last
    MAX_NEWS_LEN tokens. Confirm that is the intended behaviour.
    """
    return keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=MAX_NEWS_LEN, padding='post'
    )


X_train_padded = _pad_to_max_news_len(X_train_seq)
X_val_padded = _pad_to_max_news_len(X_val_seq)
X_test_padded = _pad_to_max_news_len(X_test_seq)

In [20]:
# Persist every split as its own pickle: <SPLITS_DIR>/<name>.pkl.
# Forward slashes keep the relative path valid on Windows as well as on POSIX
# systems, where a backslash is an ordinary file-name character.
SPLITS_DIR = '../Dataset3/Dataset3Splits'

# File stem -> object to store; features are the padded id arrays, labels the
# category arrays produced by the splits.
splits_to_save = {
    'X_train': X_train_padded,
    'X_val': X_val_padded,
    'X_test': X_test_padded,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}

for split_name, split_array in splits_to_save.items():
    with open(f'{SPLITS_DIR}/{split_name}.pkl', 'wb') as f:
        pickle.dump(split_array, f)