# Feature Encoding

The following script formats articles by features for linear regression (first attempt).  
It takes in a list of features and a set of articles, converts to lowercase and creates an encoded matrix (dense)  
While this isn't the cleverest method, it provides a usable input for setting up our initial linear regression code.

### Limitations:
* Proper Nouns should keep their capitals
* Punctuation/Stemming etc not incorporated
* Bi-grams not accommodated
* Could be converted to space matrix
* No log function incorporated at this point
    


In [1]:
#importing libraries
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import os
from pathlib import Path

In [2]:
#import pyscripter

#relevant_nbs = ['FeatureSelection.ipynb']
#relevant_nbs = pyscripter.nb_to_py(relevant_nbs)
#print("y print 2x?")
#print(relevant_nbs)
#pyscripter.import_scripts(['FeatureSelection.py'])


In [11]:
def loadData():
    DATA_DIR = "Data"
    FEATURES_DIR = os.path.join(DATA_DIR, "retailFeatureSet.csv")
    ARTICLES_DIR = os.path.join(DATA_DIR, "retailarticles YTD (new)_merged.csv")
    
    fts = pd.read_csv(FEATURES_DIR)
    fts = fts.drop(['Unnamed: 0'], axis=1)
    fts.columns = ['index']
    fts['index'] = list(map(lambda x: x.strip(), fts['index']))
    arts = pd.read_csv(ARTICLES_DIR)
    artText = arts.iloc[:,5]
    data = {'fts':fts, 'artText': artText}
    return data

In [23]:
def binEncoding(data):
    print("Binary Encoding")
    fts = data['fts']
    artText = data['artText']
    
    df_rows = []
    tokenizer = RegexpTokenizer(r'\w+')

    for art in artText:
        if type(art) == str: 
            body = art.lower()
            #body = clean_file_text(body)
            art_words = tokenizer.tokenize(body)
            df_rows.append([1 if word in art_words else 0 for word in fts['index']])
    X = pd.DataFrame(df_rows, columns = fts['index'])
    return X

In [5]:
def tfEncoding(data):
    print("tf Encoding")
    fts = data['fts']
    artText = data['artText']
    
    tf_rows = []
    for art in artText:
        body = art.lower()
        body = body.split()
        wordsCounter = Counter(body)
        tf_rows.append([wordsCounter[word] if word in wordsCounter else 0 for word in fts['index']])
    X = pd.DataFrame(tf_rows, columns = fts['index'])  
    return X

In [6]:
def tfidfEncoding(data):
    print("tifidf Encoding")
    fts = data['fts']

    # Base calculations
    binX = binEncoding(data)
    tfX = tfEncoding(data)
    
    # Calculate idf
    df_row = [binX[word].sum() for word in fts['index']]
    idf = [1/(df+1) for df in df_row]
    #transpose list (not the cleverest method)
    idf_row = []
    idf_row.append(idf)
    idf_list = pd.DataFrame(idf_row, columns = fts['index'])
    
    # Extract term frequencies
    tf = tfX.values
    # Set up loop to multiply each article (row) by the idf per term (col)
    tf_idf = []
    r, c = tf.shape
    for art in range(0,r):
        tf_idf.append(tf[art]*idf)
    tf_idf = pd.DataFrame(tf_idf, columns = fts['index'])
    X = tf_idf
    return X

In [7]:
def encoding(encodeType, **kwargs):
    # 0 for Binary Encoding
    # 1 for Term Frequency Encoding
    # 2 for TF-IDF Encoding
    # If you'd like to save as csv, use "csv = True"
        
    # Load up data
    data = loadData()
    
    # Run corresponding encoding type and pass data
    options = {0 : binEncoding,
                1 : tfEncoding,
                2 : tfidfEncoding,}
    
    X = options[encodeType](data)
    
    # Save as csv file in CLASSIFICATION data folder =)
    if ('csv' in kwargs) and (kwargs['csv']):
        
        # File path for this file
        file_name = options[encodeType].__name__ + '.csv'
        thispath = Path().absolute()
        OUTPUT_DIR = os.path.join(thispath.parent.parent, "Classification", "Data", file_name)
        print(OUTPUT_DIR)
        # if the following line throws an error, use the line after to save in same folder
        pd.DataFrame.to_csv(X, path_or_buf=OUTPUT_DIR)
        #pd.DataFrame.to_csv(X, path_or_buf=file_name)
    
    # Return Panda DataFrame
    return X
    


def main(): # Stuff to do when run from the command line    
    encoding(0, csv = True)
    pass  
 


In [10]:
#testcell

X = encoding(2, csv=True)
X.head()

0       Way back in 2015 shortly after Donald Trump an...
1       Lowes Cos under pressure to match the performa...
2       The activist investor targeting Lowes Cos beli...
3       Bill Ackman said he sees significant upside fo...
4       While millennials might be all grown up a lot ...
5       After the strongest holidayshopping season in ...
6       As newly launched derivatives contracts draw p...
7       Im a sucker I admit it One of my failings is b...
8       RECENT decades have not been particularly good...
9       At just seven feet and seven inches wide the a...
10      Add Urban Outfitters Inc to the list of retail...
11      Americans credit card debt has just hit a dist...
12      Programming note Money Stuff will be off tomor...
13      American Tire Distributors Inc seeking to bols...
14      Retailer Gap Inc has named Neil Fiske as presi...
15      Elf Beauty plummeted 30 percent the most ever ...
16      Wall Street is commending Micron Technology In...
17      A few 

AttributeError: 'float' object has no attribute 'lower'

In [18]:
print(data['artText'][176])

Saving for retirement is hard enough but another difficult challenge is making sure the money lasts Invest your nest egg conservatively and you might not be able to stretch the money out especially if you live longer than you expect Putting more in the stock market could keep the pile growingbut once you stop adding money a streak of bad returns could decimate your stake and leave you little chance to recover For years many retirement experts have pointed to a potential solution annuities In its simplest form buying an annuity involves taking part of your savings and handing it over to an insurance company The insurer then pays out a guaranteed income for the rest of your life In theory putting at least part of retirement savings into an annuity can make sure you always have some income no matter what the markets do Even so annuities gained a bad reputation in some circles Many annuities are really investment products combined with insurance with the option of creating a stream of annu

In [14]:
data = loadData()


In [24]:
binEncoding(data)

Binary Encoding
1087
231
521
673
826
464
288
865
948
369
205
915
2795
591
391
836
536
1204
834
899
286
364
232
479
771
374
520
755
256
220
341
527
336
564
259
315
503
412
1152
479
76
244
928
263
445
286
1189
709
608
639
1045
97
271
371
1256
1043
998
526
465
2937
796
354
218
875
609
428
585
2146
695
326
450
1584
747
637
368
795
766
885
208
390
938
486
638
531
1103
608
410
469
648
4536
290
296
628
4401
245
1220
1015
1272
434
1026
387
171
1795
834
944
405
198
998
993
848
709
834
86
458
909
617
218
1007
264
86
740
2680
1356
385
611
1003
751
460
324
224
781
527
284
989
746
636
826
442
367
504
740
439
364
840
576
540
885
253
594
662
282
811
608
675
559
818
1131
428
1131
280
308
578
761
86
489
714
918
86
86
86
240
864
86
86
86
86
858
86
86
822
823
331
598
1214
1467
86
231
455
661
362
136
224
539
583
823
314
191
852
776
806
157
368
405
822
223
541
507
793
722
637
109
649
469
399
1534
407
706
935
972
928
549
777
508
209
241
901
351
856
561
419
873
1032
670
334
432
366
402
454
1233
422
604
1061


1855
997
1918
1141
370
589
1058
1171
920
215
732
878
1227
340
926
848
3860
787
2445
214
341
313
298
298
876
298
1028
680
2540
2540
2540
213
460
499
283
917
301
285
2726
285
2367
180
271
2909
510
999
298
254
693
461
268
322
358
266
213
289
291
284
1022
849
325
2163
271
308
220
982
1123
467
888
202
2183
272
159
586
518
3505
858
1048
452
3764
759
455
942
1059
318
284
259
461
562
301
1104
327
1162
375
1582
326
903
207
253
847
555
503
1346
1222
768
747
747
918
918
635
5408
1231
927
789
2179
466
945
1015
543
2015
511
2226
1525
2720
933
755
460
670
906
808
718
620
451
1082
239
512
512
611
545
821
753
456
4766
1046
560
991
540
675
474
546
1066
762
213
251
609
86
556
956
147
234
234
913
921
829
917
764
973
815
885
1290
759
711
908
509
723
983
617
623
982
786
591
931
1070
632
258
665
451
718
845
582
611
946
2436
795
197
867
158
316
254
167
481
1006
766
846
992
884
866
771
573
668
990
218
903
698
2973
673
861
851
708
929
656
348
719
773
236
819
160
256
2096
2096
2157
398
509
403
674
933
291
438
6

index,0,1,retailers,stores,brands,sales,gap,retail,store,brand,...,two,rate,larger,korea,mean,amp,issue,real,tariffs,building
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
1,0,0,0,1,0,1,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,0,1,1,0,0,1,1,1,0,0,...,0,0,1,0,0,1,0,0,0,0
3,0,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
5,0,1,1,1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9,0,1,0,0,0,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,1
