In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from keras.preprocessing.text import Tokenizer


Using TensorFlow backend.


Load 10K raw file and output textual features for classification using sklearn and keras.  
Inputs: 
- data/10k/10k_raw.pickle: a pandas pickle file containing gvkey, fyear, and MD&A section 

Outputs:
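- data/10k/10k_index.csv: an index file of gvkey-fyear pairs in the same row order as the raw file, so rows of the feature arrays can be matched back to their filings
- data/10k/X_keras_unigram.npy: unigram word-id sequences for keras (one sequence per row of the raw file)
- data/10k/word_map.pickle: a dictionary that maps words to their integer ids (the tokenizer's `word_index`)
- data/10k/X_tfidf.npy: TF-IDF document-term matrix produced by the same tokenizer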

# load 10k raw text

In [16]:
# Emit a progress message, then unpickle the raw 10-K DataFrame.
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so only load this pickle if it comes from a trusted source.
raw_pickle_path = "data/10k/10k_raw.pickle"
print("Loading raw text data")
sec_10k = pd.read_pickle(raw_pickle_path)

Loading raw text data


Save gvkey-fyear indices

In [17]:
# Keep just the gvkey and fyear columns, in the DataFrame's current row order.
index_columns = ['gvkey', 'fyear']
sec10k_index = sec_10k.loc[:, index_columns]
# to_csv writes the DataFrame's own index as the first column by default.
sec10k_index.to_csv("data/10k/10k_index.csv")

# use keras to pre-process text

In [18]:
# Vocabulary cap handed to the tokenizer: lower-frequency words are dropped.
max_features = 5000
# Extract the MD&A text column as a plain Python list, one entry per row.
mda_text_list = sec_10k.loc[:, 'mda_text'].tolist()
# Drop the DataFrame reference so its memory can be reclaimed.
# NOTE: after this, any cell that reads `sec_10k` (including this one) cannot
# be re-run without reloading the raw pickle first.
del sec_10k

In [19]:
# Build the word vocabulary from the raw MD&A text.
# The vocabulary limit is passed positionally because its keyword was renamed
# between Keras releases (`nb_words` in Keras 1, `num_words` in Keras 2, where
# `nb_words` only survives as a deprecated alias). It is the first parameter
# in both, so this form runs on either without a deprecation warning.
tokenizer = Tokenizer(max_features)
# Scan every document once to collect the word counts that define the ids.
tokenizer.fit_on_texts(mda_text_list)

In [1]:
# tokenizer.word_index  # a dictionary that maps each word to its integer id

In [20]:
# Convert each document into its sequence of integer word ids; words outside
# the tokenizer's vocabulary limit are dropped from the sequences.
sequences = tokenizer.texts_to_sequences(mda_text_list)
# Documents differ in length, so X is stored as a 1-D object array holding one
# Python list per document. Filling it element by element (instead of calling
# np.array on the nested list) avoids the ValueError that NumPy >= 1.24 raises
# for ragged nested sequences, and keeps X 1-D even if every document happens
# to have the same length.
X = np.empty(len(sequences), dtype=object)
for row, seq in enumerate(sequences):
    X[row] = seq

In [21]:
# Save the tokenized sequences to disk.
# NOTE(review): if X is an object array of variable-length lists, np.save
# pickles it and reloading requires np.load(..., allow_pickle=True).
np.save("data/10k/X_keras_unigram.npy", X)
# np.save("data/10k/X_keras_unigram_20000.npy", X)
# Save the word -> integer id mapping to disk. The `with` block closes (and
# therefore flushes) the file handle deterministically; the previous inline
# open(...) left the handle open until garbage collection.
with open("data/10k/word_map.pickle", 'wb') as word_map_file:
    pickle.dump(tokenizer.word_index, word_map_file)
# pickle.dump(tokenizer.word_index, file=open("data/10k/word_map_20000.pickle", 'wb'))
# save the trained tokenizer
# pickle.dump(tokenizer, file=open("data/10k/tokenizer.pickle", 'wb'))
# pickle.dump(tokenizer, file=open("data/10k/tokenizer_20000.pickle", 'wb'))

In [22]:
# Vectorize the same documents with the fitted tokenizer, this time as a
# document-by-word matrix using the tokenizer's 'tfidf' weighting mode.
X_tfidf = tokenizer.texts_to_matrix(mda_text_list, mode='tfidf')

In [23]:
# Ensure X_tfidf is a NumPy array without duplicating it: np.asarray returns
# an existing ndarray as-is, whereas np.array would allocate a second full
# copy of the dense matrix.
X_tfidf = np.asarray(X_tfidf)

In [24]:
# Write the TF-IDF matrix to disk in NumPy's .npy format.
np.save(file="data/10k/X_tfidf.npy", arr=X_tfidf)