In [1]:
#Get features (stops words removed) by tokenizing corpus - no stemming in baseline
#Binary encoding
#Assign target group 
#Use mutual information to get final feature set
#baseline

In [13]:
import os
import re
from pathlib import Path
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.feature_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import datasets
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm

In [14]:
# Testing Feature Selection
import nltk

## Download Resources
nltk.download("vader_lexicon")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

from nltk.stem import *

# download required resources
nltk.download("wordnet")

# we'll compare two stemmers and a lemmatizer
lrStem = LancasterStemmer()
sbStem = SnowballStemmer("english")
prStem = PorterStemmer()
wnLemm = WordNetLemmatizer()
def wnLemm_v(word):
    wnLemm = WordNetLemmatizer()
    word = wnLemm.lemmatize(word, 'v')
    return word

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Padmanie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Padmanie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Padmanie\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Padmanie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Padmanie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def importData():
    #Import Labelled Data
    DATA_DIR = "Data"
    thispath = Path().absolute()
    #dtype = {"index": str, "title": str, "description": str, "url": str, "date": str, "Retail Relevance": str, "Economy Relevant": str, "Market moving": str}
    RET_ARTICLES = os.path.join(DATA_DIR, "Labelled_Articles_.xlsx")
    
    df = pd.read_excel(RET_ARTICLES)

    try:
        df.head()
    except:
        pass
    return df

In [5]:
def assignStopWords(): 
    #Stop_words list Options
    #Variation 1: added stop words starting at 'one'
    stop_words = stopwords = [
        # dates/times
        "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december", "jan", "feb","mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "morning", "evening",
        # symbols that don't separate a sentence
        '$','“','”','’','—',
        # specific article terms that are useless
        "read", "share", "file", "'s","i", "photo", "percent","s", "t", "inc.", "corp", "group", "inc", "corp.", "source", "bloomberg", "cnbc","cnbcs", "cnn", "reuters","bbc", "published", "broadcast","york","msnbc",
        # other useless terms
        "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "co", "inc", "com", "theyve", "theyre", "theres", "heres", "didnt", "wouldn", "couldn", "didn","nbcuniversal","according", "just", "us", "ll", "times"#,
        # etc
        "from","the", "a", "with", "have", "has", "had", "having", "hello", "welcome", "yeah", "wasn", "today", "etc", "ext","definitely", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "while", "of", "said", "by", "for", "about", "into", "through", "during", "before", "after", "to", "from", "in", "out", "with", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "just", "don", "now", "will"
        ]
    #from nltk.corpus import stopwords
    #stop_words = set(stopwords.words('english'))
    #print(stop_words)
    return stop_words

In [6]:
def corpus_count_words(df, stop_words, text_col = 'content', normalizer=None):
    tokenizer = RegexpTokenizer(r'\w+')
    word_counter = Counter()
    for row in df.itertuples(index=True, name='Pandas'):
            attribute = str((row, text_col))
            file_words = tokenizer.tokenize(attribute)
            #keep lowercased words that are not stop words as features
            file_wordsNS = [word.lower() for word in file_words if not word.lower() in stop_words]
            # remove words that are numbers
            file_wordsN = [word for word in file_wordsNS if not word.isnumeric()]
            #remove words with a word length less than 4 (i.e. 1-3)
            file_wordsF = [word for word in file_wordsN if not len(word)<4]
            
            #stem
            if normalizer:
                file_wordsF = [normalizer(word) for word in file_wordsF]
            
            word_counter.update(file_wordsF)
    return word_counter

In [7]:
#Binary encoding for features, also appends retail target group
def binary_encode_features(newsarticles, top_words, text_col = 'content', normalizer=None):
    tokenizer = RegexpTokenizer(r'\w+')
    df_rows = []
    for row in tqdm(newsarticles.itertuples(index=True, name='Pandas')):
            attribute = str((row, text_col))
            file_words = tokenizer.tokenize(attribute)
            if normalizer:
                file_words = [normalizer(word) for word in file_words]
            df_rows.append([1 if word.lower() in file_words else 0 for word in top_words])      
    X = pd.DataFrame(df_rows, columns = top_words)
    
    return X

In [8]:
def mutualInformation(B_Encoding, y, top_words): 
    #Estimate mutual information for a discrete target variable.
    #Mutual information (MI) [1] between two random variables is a non-negative value, which measures the dependency between the variables.
    #It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.
    featureVals= mutual_info_classif(B_Encoding, y, discrete_features='auto', n_neighbors=3, copy=True, random_state=None)
    
    np.asarray(featureVals)

    Temp= pd.DataFrame(featureVals, columns = ['MI_Values'])
 
    Final = Temp.assign(target_group = top_words)
    
    Highest_Features = Final.nlargest(10000, 'MI_Values')
    
    return Highest_Features

In [9]:
def selectFeatures(text_col = 'content', **kwargs):
    df = importData()
    df.columns
    stop_words = assignStopWords()
    
    if ('norm' in kwargs):
        norm = kwargs['norm']
        normalizers = {'lrStem' : lrStem.stem,
                       'sbStem' : sbStem.stem,
                       'prStem' : prStem.stem,
                       'wnLemm' : wnLemm.lemmatize,
                       'wnLemm-v':wnLemm_v,
                       'baseline':None
                      }
        normalizer = normalizers[norm]
    
    #Select subset of orig data
    #print(df.head(2))
    df1 = df[[text_col,'market_moving']]    
    news_cnt = corpus_count_words(df1, stop_words, text_col = text_col, normalizer = normalizer)
    
    print("starting Binary Encoding")
    num_features = 1000
    top_words = [word for (word, freq) in news_cnt.most_common(num_features)]
    B_Encoding = binary_encode_features(df1, top_words, text_col = text_col, normalizer = normalizer)
    print(B_Encoding.head())
    y = df['market_moving']
    B_Encoding.assign(target_group=y)
      
    print("Finished Bin Encoding. Collecting Highest Features")
    Highest_Features = mutualInformation(B_Encoding, y, top_words)
    Highest_Features = pd.DataFrame(Highest_Features)
    
    # Save as csv file in DATACOLLECTION data folder (bc it's needed for encoding script)
    if ('csv' in kwargs) and (kwargs['csv']):
        
        # File path for this file
        file_name = norm + text_col + 'FeatureSet.csv'
        thispath = Path().absolute()
        OUTPUT_DIR = os.path.join(thispath, "Data", file_name)
        
        # if the following line throws an error, use the line after to save in same folder
        pd.DataFrame.to_csv(Highest_Features, path_or_buf=OUTPUT_DIR)
        #pd.DataFrame.to_csv(Highest_Features, path_or_buf=file_name)
    
    print(Highest_Features)
    return Highest_Features

In [10]:
def main():
    HF = selectFeatures(csv = True, )
    return HF

In [11]:
nrms = ['wnLemm-v', 'lrStem', 'sbStem', 'prStem', 'wnLemm']
txtcols = ['title', 'content']

for txtcol in txtcols:
    #for nrm in nrms:
    nrm = nrms[4]
    print(txtcol + ': ' + nrm)
    HF = selectFeatures(text_col = txtcol, norm = 'baseline', csv=True, )
    

title: wnLemm
starting Binary Encoding


3330it [00:02, 1131.04it/s]


   title  index  pandas  market_moving  amazon  tech  stocks  apple  wall  \
0      1      0       0              1       0     0       0      0     0   
1      1      0       0              1       0     0       0      0     0   
2      1      0       0              1       0     0       0      0     0   
3      1      0       0              1       0     0       0      0     0   
4      1      0       0              1       0     0       0      0     0   

   street   ...     summer  simple  warehouse  teams  atul  gawande  every  \
0       0   ...          0       0          0      0     0        0      0   
1       0   ...          0       0          0      0     0        0      0   
2       0   ...          0       0          0      0     0        0      0   
3       0   ...          0       0          0      0     0        0      0   
4       0   ...          0       0          0      0     0        0      0   

   cctv  script  playing  
0     0       0        0  
1     0       

3330it [00:49, 67.62it/s]


   year  company  content  amazon  would  companies  market  like  people  \
0     1        1        1       0      1          0       0     1       1   
1     1        0        1       0      1          0       1     0       0   
2     1        1        1       0      1          0       0     0       1   
3     1        1        1       1      1          1       0     1       1   
4     0        0        1       1      1          1       0     0       1   

   says   ...     leaving  continues  opening  shop  german  expects  pace  \
0     0   ...           0          0        0     0       0        0     0   
1     1   ...           0          0        0     0       0        0     0   
2     0   ...           0          0        0     0       0        0     0   
3     0   ...           0          0        0     0       0        0     0   
4     0   ...           0          0        0     0       0        0     0   

   faster  review  friends  
0       0       0        1  
1       0 

In [12]:
Highest_Features = main()

# Let Paddy know the code is done:
import winsound
duration = 1000  # millisecond
freq = 440  # Hz
winsound.Beep(freq, duration)
winsound.Beep(600, 500)
winsound.Beep(freq, duration)
winsound.Beep(600, 500)

UnboundLocalError: local variable 'normalizer' referenced before assignment

In [None]:
featureSet = pd.DataFrame(Highest_Features['target_group'])
    
# Save as csv file in DATACOLLECTION data folder (bc it's needed for encoding script)


# File path for this file
file_name = 'retailFeatureSet.csv'
thispath = Path().absolute()
OUTPUT_DIR = os.path.join(thispath, "Data", file_name)

# if the following line throws an error, use the line after to save in same folder
pd.DataFrame.to_csv(featureSet, path_or_buf=OUTPUT_DIR)

In [None]:
import matplotlib.pyplot as plt
plt.plot(Highest_Features['MI_Values'].values)
plt.ylabel('MI Score')
plt.axis([0, 250, 0, 0.16])
plt.show()

In [None]:
Highest_Features['MI_Values'].values

In [None]:
Highest_Features