# Feature Encoding

The following script formats articles by features for linear regression (first attempt).  
It takes in a list of features and a set of articles, converts to lowercase and creates an encoded matrix (dense)  
While this isn't the cleverest method, it provides a usable input for setting up our initial linear regression code.

### Limitations:
* Proper Nouns should keep their capitals
* Punctuation/Stemming etc not incorporated
* Bi-grams not accommodated
* Could be converted to space matrix
* No log function incorporated at this point
    


In [63]:
#importing libraries
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import os
from pathlib import Path

In [64]:
#import pyscripter

#relevant_nbs = ['FeatureSelection.ipynb']
#relevant_nbs = pyscripter.nb_to_py(relevant_nbs)
#print("y print 2x?")
#print(relevant_nbs)
#pyscripter.import_scripts(['FeatureSelection.py'])


In [65]:
def loadData():
    DATA_DIR = "Data"
    FEATURES_DIR = os.path.join(DATA_DIR, "retailFeatureSet.csv")
    ARTICLES_DIR = os.path.join(DATA_DIR, "retailarticles YTD (new)_merged.csv")
    
    fts = pd.read_csv(FEATURES_DIR)
    for col in fts.columns:
        if not (col.strip() == 'target_group'):
            fts = fts.drop([col], axis = 1)
    fts.columns = ['index']
    fts['index'] = list(map(lambda x: x.strip(), fts['index']))
    arts = pd.read_csv(ARTICLES_DIR)
    artText = arts.iloc[:,5]
    data = {'fts':fts, 'artText': artText}
    return data

In [100]:
def binEncoding(data):
    print("Binary Encoding")
    fts = data['fts']
    artText = data['artText']
    
    df_rows = []
    tokenizer = RegexpTokenizer(r'\w+')

    for art in artText:
        if type(art) == str: 
            body = art.lower()
            #body = clean_file_text(body)
            art_words = tokenizer.tokenize(body)
            df_rows.append([1 if word in art_words else 0 for word in fts['index']])
        else:
            df_rows.append([0 for word in fts['index']])
    X = pd.DataFrame(df_rows, columns = fts['index'].values)
    return X

In [101]:
def tfEncoding(data):
    print("tf Encoding")
    fts = data['fts']
    artText = data['artText']
    
    tf_rows = []
    for art in artText:
        if type(art) == str:
            body = art.lower()
            body = body.split()
            wordsCounter = Counter(body)
            tf_rows.append([wordsCounter[word] if word in wordsCounter else 0 for word in fts['index']])
        else:
            tf_rows.append([0 for word in fts['index']])
    X = pd.DataFrame(tf_rows, columns = fts['index'].values)  
    return X

In [68]:
data = loadData()
fts = data['fts']
fts['index'].values

array(['0', '1', 'stores', 'retailers', 'brands', 'company', 'store',
       'sales', 'brand', 'gap', 'retail', 'economic', 'companys',
       'online', 'government', 'shares', 'google', 'since', 'however',
       'economy', 'employers', 'reported', 'asked', 'markets', 'law',
       'gt', 'instead', 'finance', 'largely', 'committee', 'age', 'area',
       'consumer', 'quarter', '2020', 'data', 'harassment', 'comes',
       'inflation', 'steel', 'rich', 'bad', 'budget', 'jobs', 'risks',
       'software', 'cant', 'reason', 'levels', 'revenue', 'provide',
       'york', 'male', 'increases', 'pension', 'show', 'opportunities',
       'policies', 'governments', 'treasury', 'come', 'less',
       'university', 'minister', 'amount', 'elections', 'lead', 'goldman',
       'economist', 'administration', 'customers', 'tend', 'spent',
       'womens', 'already', 'policy', 'smaller', 'follow', 'work', 'step',
       'western', 'role', 'monetary', 'cities', 'profits', 'announced',
       'market',

In [80]:
def tfidfEncoding(data):
    print("tifidf Encoding")
    fts = data['fts']

    # Base calculations
    binX = binEncoding(data)
    tfX = tfEncoding(data)
    
    # Calculate idf
    df_row = [binX[word].sum() for word in fts['index']]
    idf = [1/(df+1) for df in df_row]
    #transpose list (not the cleverest method)
    idf_row = []
    idf_row.append(idf)
    idf_list = pd.DataFrame(idf_row, columns = fts['index'])
    
    # Extract term frequencies
    tf = tfX.values
    # Set up loop to multiply each article (row) by the idf per term (col)
    tf_idf = []
    r, c = tf.shape
    for art in range(0,r):
        tf_idf.append(tf[art]*idf)
    tf_idf = pd.DataFrame(tf_idf, columns = fts['index'])
    X = tf_idf
    return X

In [97]:
print("tifidf Encoding")
data = loadData()
fts = data['fts']

# Base calculations
binX = binEncoding(data)
tfX = tfEncoding(data)

# Calculate idf
#df_row = [binX[word].sum() for word in fts['index']]
#idf = [1/(df+1) for df in df_row]



tifidf Encoding
tf Encoding


In [99]:
for word in fts['index']:
    print(tfX[word].sum())

34
1130
597
476
512
3619
365
1513
426
5461
639
2565
723
769
3391
1000
621
3392
1201
2434
545
840
595
2083
897
600
583
922
399
493
608
452
683
1018
425
2783
434
889
1370
396
527
486
1348
1227
577
383
595
578
682
860
668
845
624
392
525
892
382
650
829
606
1401
2452
1009
1006
474
435
618
409
602
783
706
382
439
470
1383
2068
458
388
2547
426
358
696
638
537
474
650
4555
3813
1015
459
433
777
591
3953
430
689
1234
1223
840
596
942
2042
486
919
424
360
523
455
731
3488
3573
772
883
478
466
675
1022
579
803
787
1692
683
2024
589
812
1683
1346
721
854
403
540
798
473
471
3068
1854
1220
941
487
738
2244
382
535
963
548
429
1185
379
500
6862
452
1506
514
467
577
436
1256
447
541
794
680
993
1052
449
859
415
477
397
4421
2821
675
1584
500
422
350
357
1548
1676
423
821
643
539
362
741
949
1131
388
574
517
1332
1269
352
1454
673
409
433
758
1613
419
733
508
367
1202
515
1553
487
352
627
601
1714
362
529
367
601
414
505
1342
2096
1083
409
1165
1669
2923
1387
589
443
462
383
5434
566
404
4339
351
4

In [81]:
def encoding(encodeType, **kwargs):
    # 0 for Binary Encoding
    # 1 for Term Frequency Encoding
    # 2 for TF-IDF Encoding
    # If you'd like to save as csv, use "csv = True"
        
    # Load up data
    data = loadData()
    
    # Run corresponding encoding type and pass data
    options = {0 : binEncoding,
                1 : tfEncoding,
                2 : tfidfEncoding,}
    
    X = options[encodeType](data)
    
    # Save as csv file in CLASSIFICATION data folder =)
    if ('csv' in kwargs) and (kwargs['csv']):
        
        # File path for this file
        file_name = options[encodeType].__name__ + '.csv'
        thispath = Path().absolute()
        OUTPUT_DIR = os.path.join(thispath.parent.parent, "Classification", "Data", file_name)
        # if the following line throws an error, use the line after to save in same folder
        pd.DataFrame.to_csv(X, path_or_buf=OUTPUT_DIR)
        #pd.DataFrame.to_csv(X, path_or_buf=file_name)
    
    # Return Panda DataFrame
    return X
    


def main(): # Stuff to do when run from the command line    
    encoding(0, csv = True)
    pass  
 


In [104]:
#testcell

X = encoding(0, csv=True)
X.head()

Binary Encoding
C:\Users\Padmanie\Documents\GitHub\Capstone\Classification\Data\binEncoding.csv


Unnamed: 0,0,1,stores,retailers,brands,company,store,sales,brand,gap,...,october,competition,political,major,theyre,agency,review,ms,following,measure
0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0
1,0,0,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
