In [36]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Loading

In [18]:
col_names = ['questions', 'a', 'b']
data_df = pd.read_csv("https://raw.githubusercontent.com/VIthulan/travel-text-classification/master/data/5000TravelQuestionsDataset.csv", error_bad_lines=False,header=None, names=col_names, encoding='latin-1')


In [31]:
data_df['questions']

0       What are the special things we (husband and me...
1       What are the companies which organize shark fe...
2       Is it safe for female traveller to go alone to...
3       What are the best places around Cape Town for ...
4       What are the best places to stay for a family ...
                              ...                        
4995    What is the best area to be based for sightsee...
4996    What are the good value traditional bars and r...
4997       What are the hotels near Alicante bus station?
4998       Where to stay in La Gomera to mountain biking?
4999    Is it possible to take a train trip from Santi...
Name: questions, Length: 5000, dtype: object

# Pre Processing

In [34]:
    # Remove all the special characters
data_df['processed_questions'] = data_df['questions'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_questions'] = data_df['processed_questions'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_questions'] = data_df['questions'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_questions'] = data_df['questions'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_questions'] = data_df['questions'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_questions'] = data_df['questions'].str.strip()

In [35]:
data_df['processed_questions']

0       What are the special things we (husband and me...
1       What are the companies which organize shark fe...
2       Is it safe for female traveller to go alone to...
3       What are the best places around Cape Town for ...
4       What are the best places to stay for a family ...
                              ...                        
4995    What is the best area to be based for sightsee...
4996    What are the good value traditional bars and r...
4997       What are the hotels near Alicante bus station?
4998       Where to stay in La Gomera to mountain biking?
4999    Is it possible to take a train trip from Santi...
Name: processed_questions, Length: 5000, dtype: object

## Lemmatizing

In [66]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  lem = [lemmatizer.lemmatize(w, pos="v") for w in nltk.word_tokenize(text)]
  return " ".join(lem)

In [70]:
data_df["question_lemmatized"] = data_df.processed_questions.apply(lemmatize_text)

In [71]:
data_df["question_lemmatized"]

0       What be the special things we ( husband and me...
1       What be the company which organize shark feed ...
2       Is it safe for female traveller to go alone to...
3       What be the best place around Cape Town for sa...
4       What be the best place to stay for a family to...
                              ...                        
4995    What be the best area to be base for sightsee ...
4996    What be the good value traditional bar and res...
4997       What be the hotels near Alicante bus station ?
4998        Where to stay in La Gomera to mountain bike ?
4999    Is it possible to take a train trip from Santi...
Name: question_lemmatized, Length: 5000, dtype: object