In [2]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.tag.crf import CRFTagger
from nltk.corpus import brown
import spacy
from sklearn.feature_extraction.text import CountVectorizer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
%pip install python-crfsuite



# Data Loading

In [4]:
col_names = ['questions', 'a', 'b']
data_df = pd.read_csv("https://raw.githubusercontent.com/VIthulan/travel-text-classification/master/data/5000TravelQuestionsDataset.csv", error_bad_lines=False,header=None, names=col_names, encoding='latin-1')


In [5]:
data_df['questions']

0       What are the special things we (husband and me...
1       What are the companies which organize shark fe...
2       Is it safe for female traveller to go alone to...
3       What are the best places around Cape Town for ...
4       What are the best places to stay for a family ...
                              ...                        
4995    What is the best area to be based for sightsee...
4996    What are the good value traditional bars and r...
4997       What are the hotels near Alicante bus station?
4998       Where to stay in La Gomera to mountain biking?
4999    Is it possible to take a train trip from Santi...
Name: questions, Length: 5000, dtype: object

# Pre Processing

In [6]:
    # Remove all the special characters
data_df['processed_questions'] = data_df['questions'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_questions'] = data_df['processed_questions'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_questions'] = data_df['questions'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_questions'] = data_df['questions'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_questions'] = data_df['questions'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_questions'] = data_df['questions'].str.strip()

In [7]:
data_df['processed_questions']

0       What are the special things we (husband and me...
1       What are the companies which organize shark fe...
2       Is it safe for female traveller to go alone to...
3       What are the best places around Cape Town for ...
4       What are the best places to stay for a family ...
                              ...                        
4995    What is the best area to be based for sightsee...
4996    What are the good value traditional bars and r...
4997       What are the hotels near Alicante bus station?
4998       Where to stay in La Gomera to mountain biking?
4999    Is it possible to take a train trip from Santi...
Name: processed_questions, Length: 5000, dtype: object

## Lemmatizing

In [8]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  lem = [lemmatizer.lemmatize(w, pos="v") for w in nltk.word_tokenize(text)]
  return " ".join(lem)

In [10]:
data_df["question_lemmatized"] = data_df.processed_questions.apply(lemmatize_text)

In [11]:
data_df["question_lemmatized"]

0       What be the special things we ( husband and me...
1       What be the company which organize shark feed ...
2       Is it safe for female traveller to go alone to...
3       What be the best place around Cape Town for sa...
4       What be the best place to stay for a family to...
                              ...                        
4995    What be the best area to be base for sightsee ...
4996    What be the good value traditional bar and res...
4997       What be the hotels near Alicante bus station ?
4998        Where to stay in La Gomera to mountain bike ?
4999    Is it possible to take a train trip from Santi...
Name: question_lemmatized, Length: 5000, dtype: object

## POS Tag

In [12]:
def pos_tagger(text):
    pos_tagged = [ r[1] for r in pos_tag(nltk.word_tokenize(text))] 
    return ' '.join(pos_tagged)

In [13]:
data_df["question_pos_t"] = data_df.processed_questions.apply(pos_tagger)

In [14]:
# brown_tags = brown.tagged_sents(categories='hobbies')
#  crf = CRFTagger()
# crf.train(brown_tags,'model.crf.tagger')

In [15]:
data_df["question_pos_t"]

0       WP VBP DT JJ NNS PRP ( NN CC PRP ) MD VB IN DT...
1             WP VBP DT NNS WDT VBP NN NN NNS IN NN NNS .
2               VBZ PRP JJ IN JJ NN TO VB RB TO NNP NNP .
3                    WP VBP DT JJS NNS IN NNP NNP IN NN .
4       WP VBP DT JJS NNS TO VB IN DT NN TO VB RB IN NN .
                              ...                        
4995           WP VBZ DT JJS NN TO VB VBN IN VBG IN NNP .
4996               WP VBP DT JJ NN JJ NNS CC NNS IN NNP .
4997                         WP VBP DT NNS IN NNP NN NN .
4998                      WRB TO VB IN NNP NNP TO VB NN .
4999            VBZ PRP JJ TO VB DT NN NN IN NNP IN NNP .
Name: question_pos_t, Length: 5000, dtype: object

## Headword extraction

In [16]:
nlp = spacy.load("en_core_web_sm")
def head_word_tokenizer(text):
    head_words = []
    for token in nlp(text):
        if token.dep_ == "nsubj" or token.dep_ == "nsubjpass":
            head_words.append(token.text)
            head_words.append(token.head.text)
    return head_words

In [17]:
# head_words_vectorizer = CountVectorizer(tokenizer = head_word_tokenizer,max_features=100,stop_words=stopwords.words('english'))
# head_words_vector = head_words_vectorizer.fit_transform(data_df.question_lemmatized.values).toarray()

In [18]:
data_df["question_headwords"] = data_df.processed_questions.apply(head_word_tokenizer)

In [19]:
data_df["question_headwords"]

0                   [things, are, we, do]
1       [companies, are, which, organize]
2                 [it, Is, traveller, go]
3                           [places, are]
4             [places, are, family, stay]
                      ...                
4995                           [area, is]
4996                          [bars, are]
4997                        [hotels, are]
4998                                   []
4999                             [it, Is]
Name: question_headwords, Length: 5000, dtype: object

## Headword Synonyms

In [20]:
from nltk.corpus import wordnet

def wordnet_synonyms(keywords):
  synonyms = []
  for keyword in keywords:
    for synset in wordnet.synsets(keyword):
      for lemma in synset.lemmas():
          synonyms.append(lemma.name())

  return synonyms

In [21]:
wordnet_synonyms(["mother", "father"])

['mother',
 'female_parent',
 'mother',
 'mother',
 'mother',
 'mother',
 'mother',
 'fuss',
 'overprotect',
 'beget',
 'get',
 'engender',
 'father',
 'mother',
 'sire',
 'generate',
 'bring_forth',
 'father',
 'male_parent',
 'begetter',
 'forefather',
 'father',
 'sire',
 'Father',
 'Padre',
 'Church_Father',
 'Father_of_the_Church',
 'Father',
 'father',
 'Father',
 'Father-God',
 'Fatherhood',
 'founder',
 'beginner',
 'founding_father',
 'father',
 'don',
 'father',
 'beget',
 'get',
 'engender',
 'father',
 'mother',
 'sire',
 'generate',
 'bring_forth']

In [22]:
data_df["question_hw_syn"] = data_df.question_headwords.apply(wordnet_synonyms)

In [23]:
data_df["question_hw_syn"]

0       [things, thing, thing, thing, thing, thing, ma...
1       [company, company, company, companionship, fel...
2       [information_technology, IT, be, be, be, exist...
3       [topographic_point, place, spot, place, proper...
4       [topographic_point, place, spot, place, proper...
                              ...                        
4995    [area, country, area, area, region, sphere, do...
4996    [parallel_bars, bars, barroom, bar, saloon, gi...
4997    [hotel, are, ar, be, be, be, exist, be, be, eq...
4998                                                   []
4999    [information_technology, IT, be, be, be, exist...
Name: question_hw_syn, Length: 5000, dtype: object

## Bag of Words
This will be added to the training model directly using countVector

# Vectorize

## TF IDF Vectorizer

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorize(text):
  tfidfconverter = TfidfVectorizer(max_features=1500, min_df=1, max_df=0.7, stop_words=stopwords.words('english'))
  X = tfidfconverter.fit_transform(text).toarray()
  return X


array([[0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 1.]])

## Count Vectorizer

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
def count_vectorize(text):
  vectorizer = CountVectorizer(max_features=1500, min_df=1, max_df=0.7, stop_words=stopwords.words('english'))
  X = vectorizer.fit_transform(text).toarray()
  return X


array([[1, 1, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])