In [1]:
import re
import nltk
import numpy as np
import collections
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/sriram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sriram/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
with open('train_5500.label.txt', 'rb') as f:
    data = f.readlines()

In [3]:
def clean_text(text):
    text = str(text)
    temp = text.split(":")
    label = temp[0][2:]
    text = (" ").join(temp[1].split()[1:])
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return(text[:-2].lower(), label)

In [4]:
line = str(data[0])

temp = line.split(":")

In [5]:
temp[0][2:]

'DESC'

In [6]:
(" ").join(temp[1].split()[1:])

"How did serfdom develop in and then leave Russia ?\\n'"

In [7]:
X_train = []
Y_train = []
for line in data:
    sent, lab = clean_text(str(line))
    X_train.append(sent)
    Y_train.append(lab)

In [8]:
Y_train[0]

'DESC'

In [9]:
from nltk.util import ngrams

In [10]:
def generate_N_grams(words,ngram=1, freq=500): 
    res= ngrams(words,ngram)
    mostFreq = collections.Counter(res)
    ans=mostFreq.most_common(freq)
    return ans

In [11]:
words= []
data = []
length=[]
lexical=[]
syntactic = []
for sent, label in zip(X_train, Y_train):
    data.append(sent)
    length.append(len(sent.split()))
    words.extend([word for word in sent.split(" ") if word not in set(stopwords.words('english'))])

In [12]:
top_ngrams = [w[0][0] for w in generate_N_grams(words,1,500)]

In [13]:
for sent, label in zip(X_train, Y_train):
    wordlist = [word for word in sent.split(" ") if word in top_ngrams]
    lexical.append(wordlist)
    syntactic.append(nltk.pos_tag(wordlist))

In [14]:
# len(lexical[0])

In [15]:
import pandas as pd

df=pd.DataFrame()
df['sentence']=data
df['length']=length
# df['lexical']=lexical
df['lexical']=[len(x) for x in lexical]
# df['syntactic']=syntactic
df['syntactic']=[len(x) for x in syntactic]
df['label'] = Y_train
df.head()

Unnamed: 0,sentence,length,lexical,syntactic,label
0,how did serfdom develop in and then leave russia,9,0,0,DESC
1,what films featured the character popeye doyle,7,2,2,ENTY
2,how can i find a list of celebrities real names,10,4,4,DESC
3,what fowl grabs the spotlight after the chines...,12,2,2,ENTY
4,what is the full form of com,7,2,2,ABBR


In [16]:
def gini_impurity(y):
    '''
    Given a Pandas Series, it calculates the Gini Impurity. 
    y: variable with which calculate Gini Impurity.
    '''
    if isinstance(y, pd.Series):
        p = y.value_counts()/y.shape[0]
        gini = 1-np.sum(p**2)
        return(gini)

    else:
        raise('Object must be a Pandas Series.')

gini_impurity(df.length) 

0.9126174998075649

In [17]:
def entropy(y):
    '''
    Given a Pandas Series, it calculates the entropy. 
    y: variable with which calculate entropy.
    '''
    if isinstance(y, pd.Series):
        a = y.value_counts()/y.shape[0]
        entropy = np.sum(-a*np.log2(a+1e-9))
        return(entropy)

    else:
        raise('Object must be a Pandas Series.')

entropy(df.length)  

3.765096214502992

In [18]:
def variance(y):
    '''
    Function to help calculate the variance avoiding nan.
    y: variable to calculate variance to. It should be a Pandas Series.
    '''
    if(len(y) == 1):
        return 0
    else:
        return y.var()

def information_gain(y, mask, func=entropy):
    '''
    It returns the Information Gain of a variable given a loss function.
    y: target variable.
    mask: split choice.
    func: function to be used to calculate Information Gain in case os classification.
    '''

    a = sum(mask)
    b = mask.shape[0] - a

    if(a == 0 or b ==0): 
        ig = 0

    else:
        if y.dtypes != 'O':
            ig = variance(y) - (a/(a+b)* variance(y[mask])) - (b/(a+b)*variance(y[-mask]))
        else:
            ig = func(y)-a/(a+b)*func(y[mask])-b/(a+b)*func(y[-mask])

    return ig

In [19]:
import itertools

def categorical_options(a):
    '''
    Creates all possible combinations from a Pandas Series.
    a: Pandas Series from where to get all possible combinations. 
    '''
    a = a.unique()

    opciones = []
    for L in range(0, len(a)+1):
        for subset in itertools.combinations(a, L):
            subset = list(subset)
            opciones.append(subset)

    return opciones[1:-1]

def max_information_gain_split(x, y, func=entropy):
    '''
    Given a predictor & target variable, returns the best split, the error and the type of variable based on a selected cost function.
    x: predictor variable as Pandas Series.
    y: target variable as Pandas Series.
    func: function to be used to calculate the best split.
    '''

    split_value = []
    ig = [] 

    numeric_variable = True if x.dtypes != 'O' else False

    # Create options according to variable type
    if numeric_variable:
        options = x.sort_values().unique()[1:]
    else: 
        options = categorical_options(x)

    # Calculate ig for all values
    for val in options:
        mask =   x < val if numeric_variable else x.isin(val)
        val_ig = information_gain(y, mask, func)
        # Append results
        ig.append(val_ig)
        split_value.append(val)

    # Check if there are more than 1 results if not, return False
    if len(ig) == 0:
        return(None,None,None, False)

    else:
        # Get results with highest IG
        best_ig = max(ig)
        best_ig_index = ig.index(best_ig)
        best_split = split_value[best_ig_index]
        return(best_ig,best_split,numeric_variable, True)


    weight_ig, weight_slpit, _, _ = max_information_gain_split(df['length'], df['label'],)  


    print(
    "The best split for Weight is when the variable is less than ",
    weight_slpit,"\nInformation Gain for that split is:", weight_ig
    )

In [20]:
df.drop(['label', 'sentence'], axis= 1).apply(max_information_gain_split, y = df['label'])

Unnamed: 0,length,lexical,syntactic
0,0.054908,0.056198,0.056198
1,6,2,2
2,True,True,True
3,True,True,True


In [21]:
max_information_gain_split(x=df["length"], y=df["label"])

(0.05490794663614973, 6, True, True)

In [22]:
max_information_gain_split(x=df["lexical"], y=df["label"])

(0.05619806339625466, 2, True, True)

In [23]:
def get_best_split(y, data):
    '''
    Given a data, select the best split and return the variable, the value, the variable type and the information gain.
    y: name of the target variable
    data: dataframe where to find the best split.
    '''
    masks = data.drop(y, axis= 1).apply(max_information_gain_split, y = data[y])
    if sum(masks.loc[3,:]) == 0:
        return(None, None, None, None)

    else:
        # Get only masks that can be splitted
        masks = masks.loc[:,masks.loc[3,:]]

        # Get the results for split with highest IG
        split_variable = max(masks)
        #split_valid = masks[split_variable][]
        split_value = masks[split_variable][1] 
        split_ig = masks[split_variable][0]
        split_numeric = masks[split_variable][2]

    return(split_variable, split_value, split_ig, split_numeric)


def make_split(variable, value, data, is_numeric):
    '''
    Given a data and a split conditions, do the split.
    variable: variable with which make the split.
    value: value of the variable to make the split.
    data: data to be splitted.
    is_numeric: boolean considering if the variable to be splitted is numeric or not.
    '''
    if is_numeric:
        data_1 = data[data[variable] < value]
        data_2 = data[(data[variable] < value) == False]

    else:
        data_1 = data[data[variable].isin(value)]
        data_2 = data[(data[variable].isin(value)) == False]

    return(data_1,data_2)

def make_prediction(data, target_factor):
    '''
    Given the target variable, make a prediction.
    data: pandas series for target variable
    target_factor: boolean considering if the variable is a factor or not
    '''

    # Make predictions
    if target_factor:
        pred = data.value_counts().idxmax()
    else:
        pred = data.mean()

    return pred

In [24]:
def train_tree(data,y, target_factor, max_depth = None,min_samples_split = None, min_information_gain = 1e-20, counter=0, max_categories = 20):
  '''
  Trains a Decission Tree
  data: Data to be used to train the Decission Tree
  y: target variable column name
  target_factor: boolean to consider if target variable is factor or numeric.
  max_depth: maximum depth to stop splitting.
  min_samples_split: minimum number of observations to make a split.
  min_information_gain: minimum ig gain to consider a split to be valid.
  max_categories: maximum number of different values accepted for categorical values. High number of values will slow down learning process. R
  '''

  # Check that max_categories is fulfilled
  if counter==0:
    types = data.dtypes
    check_columns = types[types == "object"].index
    for column in check_columns:
      var_length = len(data[column].value_counts()) 
      if var_length > max_categories:
        raise ValueError('The variable ' + column + ' has '+ str(var_length) + ' unique values, which is more than the accepted ones: ' +  str(max_categories))

  # Check for depth conditions
  if max_depth == None:
    depth_cond = True

  else:
    if counter < max_depth:
      depth_cond = True

    else:
      depth_cond = False

  # Check for sample conditions
  if min_samples_split == None:
    sample_cond = True

  else:
    if data.shape[0] > min_samples_split:
      sample_cond = True

    else:
      sample_cond = False

  # Check for ig condition
  if depth_cond & sample_cond:

    var,val,ig,var_type = get_best_split(y, data)

    # If ig condition is fulfilled, make split 
    if ig is not None and ig >= min_information_gain:

      counter += 1

      left,right = make_split(var, val, data,var_type)

      # Instantiate sub-tree
      split_type = "<=" if var_type else "in"
      question =   "{} {}  {}".format(var,split_type,val)
      # question = "\n" + counter*" " + "|->" + var + " " + split_type + " " + str(val) 
      subtree = {question: []}


      # Find answers (recursion)
      yes_answer = train_tree(left,y, target_factor, max_depth,min_samples_split,min_information_gain, counter)

      no_answer = train_tree(right,y, target_factor, max_depth,min_samples_split,min_information_gain, counter)

      if yes_answer == no_answer:
        subtree = yes_answer

      else:
        subtree[question].append(yes_answer)
        subtree[question].append(no_answer)

    # If it doesn't match IG condition, make prediction
    else:
      pred = make_prediction(data[y],target_factor)
      return pred

   # Drop dataset if doesn't match depth or sample conditions
  else:
    pred = make_prediction(data[y],target_factor)
    return pred

  return subtree


max_depth = 5
min_samples_split = 20
min_information_gain  = 1e-5


decisiones = train_tree(data,'obese',True, max_depth,min_samples_split,min_information_gain)


decisiones

AttributeError: 'list' object has no attribute 'dtypes'