In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the transcripts and the id_to_row mapping, stored in the format
# outlined in FormTranscripts.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('data/transcripts.pickle', 'rb') as transcripts_file:
    transcripts = pickle.load(transcripts_file)
with open('data/id_to_row.pickle', 'rb') as id_to_row_file:
    id_to_row = pickle.load(id_to_row_file)

In [3]:
# Sanity check: print one loaded record (index 1300) to eyeball its structure.
# Per the output below it starts with a company name and a timestamp string,
# followed by a list of statement paragraph strings.
print(transcripts[1300])

['Cummins, Inc.', '2017-02-09 15:24:33', ['good day ladies gentlemen welcome q4 2016 cummins earnings conference call time participants mode later conduct session instructions follow time would like turn call mark smith vice president finance operations please go ahead', 'thank good morning everyone welcome teleconference today discuss cummins results fourth quarter 2016 joining today chairman chief executive officer tom linebarger chief financial officer pat ward president chief operating officer rich freeland start please note information hear given today consist statements within meaning securities exchange act 1934 statements express forecast expectations hopes beliefs intentions strategies regarding future actual future results could differ materially projected', 'statements number risks uncertainties information regarding risks uncertainties available disclosure statement slide deck filings sec particularly risk factors section recently filed annual report form subsequently filed

In [4]:
# Look up the id_to_row entry keyed by the timestamp string of the record
# printed above. NOTE(review): the meaning of the returned pair is not
# shown in this notebook -- confirm against FormTranscripts.
print(id_to_row['2017-02-09 15:24:33'])

(7378, 91406)


In [5]:
# Build the corpus used to fit the vectorizer: for each transcript, every
# statement paragraph (element 2) followed by the first field of every
# Q&A entry (element 3).
corpus = []
for idx in range(len(transcripts)):
    record = transcripts[idx]
    corpus.extend(record[2])
    corpus.extend(entry[0] for entry in record[3])

In [6]:
# Learn the vocabulary from the full corpus using CountVectorizer's defaults.
# fit() returns the estimator itself, so chaining is equivalent; the bare
# name on the last line keeps the cell displaying the estimator's repr.
vectorizer = CountVectorizer().fit(corpus)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
# Rebuild every transcript record with its text replaced by count vectors:
# [element 0, element 1, [statement vectors], [(Q&A vector, Q&A second field), ...]]
vectorized_transcripts = []
for idx in range(len(transcripts)):
    record = transcripts[idx]
    # Each paragraph is transformed on its own, yielding one single-row
    # sparse matrix per paragraph.
    statement_vecs = [vectorizer.transform([paragraph]) for paragraph in record[2]]
    # Only the first field of a Q&A entry is vectorized; the second field is
    # carried over unchanged alongside its vector.
    qna_vecs = [(vectorizer.transform([entry[0]]), entry[1]) for entry in record[3]]
    vectorized_transcripts.append([record[0], record[1], statement_vecs, qna_vecs])

In [8]:
# Spot-check the vectorized form of record 1300 (the same index inspected
# earlier); each text entry should now be a 1 x vocabulary-size sparse matrix.
print(vectorized_transcripts[1300])

['Cummins, Inc.', '2017-02-09 15:24:33', [<1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 31 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 55 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 45 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 47 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 53 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 53 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 51 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.int64'>'
	with 57 stored elements in Compressed Sparse Row format>, <1x65352 sparse matrix of type '<class 'numpy.

In [15]:
# Persist the vectorized transcripts to disk as a pickle file.
with open('embeddings/vectorized_transcripts.pickle', 'wb') as out_file:
    pickle.dump(vectorized_transcripts, out_file)

In [None]:
# Round-trip check: read the pickle back in under a separate name so it can
# be compared with the in-memory vectorized_transcripts.
with open('embeddings/vectorized_transcripts.pickle', 'rb') as in_file:
    test_vectorized_transcripts_load = pickle.load(in_file)

In [None]:
# Print record 1300 from the reloaded data to compare it with the
# vectorized_transcripts[1300] output shown earlier.
print(test_vectorized_transcripts_load[1300])