In [1]:
#Get features (stops words removed) by tokenizing corpus - no stemming in baseline
#Binary encoding
#Assign target group 
#Use mutual information to get final feature set

In [2]:
import os
import re
from pathlib import Path
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.feature_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import datasets
import matplotlib
import matplotlib.pyplot as plt

In [3]:
def importData():
    #Import Labelled Data
    DATA_DIR = "Data"
    thispath = Path().absolute()
    RET_ARTICLES = os.path.join(DATA_DIR, "retailarticles YTD (new)_merged.csv")

    df = pd.read_csv(RET_ARTICLES, encoding= "ISO-8859-1")

    try:
        df.head()
    except:
        pass
    return df

In [4]:
def assignStopWords(): 
    #Stop_words list Options
    stop_words = {'audio','i', 'me', 'us', 'my','myself','we','our','ours', 'ourselves','you', 'your', 'yours', 'yourself', 'yourselves','he',	 'him',	 'his',	 'himself',	 'she',	 'her',	 'hers',	 'herself',	 'it',	 'its',	 'itself',	 'they','them','their', 'theirs', 'themselves', 'what', 'which', 'who','whom', 'this', 'that', 'these', 'those',	 'am',	 'is',	 'are',	 'was',	 'were',	 'be',	 'been',	 'being',	 'have',	 'has',	 'had',	 'having',	 'do',	 'does',	 'did',	 'doing',	 'a',	 'an',	 'the',	 'and',	 'but',	 'if',	 'or',	 'because',	 'as',	 'until',	 'while',	 'of',	 'at',	 'by',	 'for',	 'with',	 'about',	 'into',	 'through',	 'during',	 'before',	 'after',	 'to',	 'from','up','down','in','out','on','off','over',	 'under',	 'again',	 'further',	 'then',	 'once',	 'here',	 'there',	 'when',	 'where',	 'why',	 'how',	 'all',	 'any',	 'both',	 'each',	 'few',	 'more',	 'most',	 'other',	 'some',	 'such',	 'no',	 'nor',	 'not',	 'only','own','same', 'so','than', 'too','very','s','t','can', 'will', 'just','don','should', 'now'}

    #from nltk.corpus import stopwords
    #stop_words = set(stopwords.words('english'))
    #print(stop_words)
    return stop_words

In [5]:
def corpus_count_words(df, stop_words):
    tokenizer = RegexpTokenizer(r'\w+')
    word_counter = Counter()
    for row in df.itertuples(index=True, name='Pandas'):
            attribute = str((row, 'content'))
            file_words = tokenizer.tokenize(attribute)
            #keep lowercased words that are not stop words as features
            file_wordsNS = [word.lower() for word in file_words if not word.lower() in stop_words]
            word_counter.update(file_wordsNS)
    return word_counter

In [6]:
# news_cnt = corpus_count_words(df1,stop_words)

In [7]:
# news_cnt.most_common(30)

In [8]:
#Binary encoding for features, also appends retail target group
def binary_encode_features(newsarticles, top_words):
    tokenizer = RegexpTokenizer(r'\w+')
    df_rows = []
    for row in newsarticles.itertuples(index=True, name='Pandas'):
            attribute = str((row, 'content'))
            file_words = tokenizer.tokenize(attribute)
            df_rows.append([1 if word.lower() in file_words else 0 for word in top_words])      
    X = pd.DataFrame(df_rows, columns = top_words)
    
    return X

In [9]:
def mutualInformation(B_Encoding, y, top_words): 
    #Estimate mutual information for a discrete target variable.
    #Mutual information (MI) [1] between two random variables is a non-negative value, which measures the dependency between the variables.
    #It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.
    featureVals= mutual_info_classif(B_Encoding, y, discrete_features='auto', n_neighbors=3, copy=True, random_state=None)
    
    np.asarray(featureVals)

    Temp= pd.DataFrame(featureVals, columns = ['MI_Values'])
 
    Final = Temp.assign(target_group = top_words)
    
    Highest_Features = Final.nlargest(250, 'MI_Values')
    
    return Highest_Features

In [10]:
#print(featureVals)

In [11]:
#Final.head()

In [15]:
def selectFeatures(**kwargs):
    df = importData()
    stop_words = assignStopWords()
    
    #Select subset of orig data
    df1 = df[['content','Retail Relevance']]    
    news_cnt = corpus_count_words(df1, stop_words)
    
    num_features = 1000
    top_words = [word for (word, freq) in news_cnt.most_common(num_features)]
    B_Encoding = binary_encode_features(df1, top_words)
    y = df['Retail Relevance']
    B_Encoding.assign(target_group=y)
      
    
    Highest_Features = mutualInformation(B_Encoding, y, top_words)
    Highest_Features = pd.DataFrame(Highest_Features)
    
    # Save as csv file in DATACOLLECTION data folder (bc it's needed for encoding script)
    if ('csv' in kwargs) and (kwargs['csv']):
        
        # File path for this file
        file_name = 'retailFeatureSet.csv'
        thispath = Path().absolute()
        OUTPUT_DIR = os.path.join(thispath, "Data", file_name)
        
        # if the following line throws an error, use the line after to save in same folder
        pd.DataFrame.to_csv(Highest_Features, path_or_buf=OUTPUT_DIR)
        #pd.DataFrame.to_csv(Highest_Features, path_or_buf=file_name)
    
    print(Highest_Features)
    return Highest_Features

In [16]:
def main():
    HF = selectFeatures(csv = True)
    return HF

In [17]:
Highest_Features = main()

     MI_Values target_group
27    0.284536            0
136   0.118413            1
542   0.056787       stores
719   0.055051    retailers
667   0.033363       brands
26    0.031589      company
945   0.028351        store
122   0.027331        sales
806   0.026988        brand
8     0.025493          gap
485   0.020142       retail
52    0.019644     economic
398   0.019521     companys
368   0.018498       online
36    0.018119   government
249   0.018093       shares
512   0.015814       google
35    0.015765        since
186   0.013918      however
58    0.013111      economy
618   0.012592    employers
319   0.012556     reported
545   0.012548        asked
67    0.012321      markets
291   0.012195          law
539   0.012077           gt
564   0.011662      instead
281   0.011467      finance
860   0.011447      largely
685   0.011418    committee
..         ...          ...
193   0.004121         five
99    0.004113          end
44    0.004095        still
146   0.004092    ex

In [80]:
print(pd.DataFrame(Highest_Features['target_group']))

      target_group
27               0
136              1
719      retailers
542         stores
667         brands
122          sales
8              gap
485         retail
945          store
806          brand
36      government
26         company
398       companys
368         online
176  international
319       reported
21             may
544        results
52        economic
227     management
654      questions
539             gt
121         gender
71       countries
256         chinas
716       starting
606         helped
68          policy
415      customers
62      investment
..             ...
753        receive
726       progress
613    significant
171         former
327    governments
949         summit
87         average
969       families
337           life
843   unemployment
946           drop
394          board
514           idea
130            yet
922        sectors
359        looking
484        started
962           room
642       november
312           hard
37          

In [81]:
featureSet = pd.DataFrame(Highest_Features['target_group'])
    
# Save as csv file in DATACOLLECTION data folder (bc it's needed for encoding script)


# File path for this file
file_name = 'retailFeatureSet.csv'
thispath = Path().absolute()
OUTPUT_DIR = os.path.join(thispath, "Data", file_name)

# if the following line throws an error, use the line after to save in same folder
pd.DataFrame.to_csv(featureSet, path_or_buf=OUTPUT_DIR)