In [5]:
import numpy as np
import pandas as pd
import re

import nltk
import spacy
nlp = spacy.load('en')
import textacy

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/vithulanv/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
col_names = ['questions', 'a', 'b']
data_df = pd.read_csv("https://raw.githubusercontent.com/VIthulan/travel-text-classification/master/data/5000TravelQuestionsDataset.csv", error_bad_lines=False,header=None, names=col_names, encoding='latin-1')

In [7]:
data_df.head()

Unnamed: 0,questions,a,b
0,What are the special things we (husband and me...,TTD,TTDSIG
1,What are the companies which organize shark fe...,TTD,TTDOTH
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA
3,What are the best places around Cape Town for ...,TTD,TTDSIG
4,What are the best places to stay for a family ...,ACM,ACMOTH


In [14]:
data_df['a'].value_counts()

TGU      1217
TTD      1139
TRS      1011
ACM       720
FOD       521
ENT       214
WTH       172
TGU\n       3
\nENT       2
TTD\n       1
Name: a, dtype: int64

# Text Preprocessing

In [8]:
stop_words = set(stopwords.words('english')) 

def remove_stopwords(text):
  nltk.word_tokens = nltk.word_tokenize(text) 
  filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  return " ".join(filtered_sentence)

In [10]:
# Remove all the special characters
data_df['processed_questions'] = data_df['questions'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_questions'] = data_df['processed_questions'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_questions'] = data_df['questions'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_questions'] = data_df['questions'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_questions'] = data_df['questions'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_questions'] = data_df['questions'].str.strip()
# Stop word removal
data_df['sw_removed_questions'] = data_df.processed_questions.apply(remove_stopwords)

In [18]:
# Remove all the special characters
data_df['processed_a'] = data_df['a'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_a'] = data_df['a'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_a'] = data_df['a'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_a'] = data_df['a'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_a'] = data_df['a'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_a'] = data_df['a'].str.strip()

## Lemmatizing


In [15]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  lem = [lemmatizer.lemmatize(w, pos="v") for w in nltk.word_tokenize(text)]
  return " ".join(lem)

In [16]:
data_df["question_lemmatized"] = data_df.processed_questions.apply(lemmatize_text)

In [17]:
data_df["question_lemmatized_sw"] = data_df.sw_removed_questions.apply(lemmatize_text)

In [19]:
data_df.head()

Unnamed: 0,questions,a,b,processed_questions,sw_removed_questions,question_lemmatized,question_lemmatized_sw,processed_a
0,What are the special things we (husband and me...,TTD,TTDSIG,What are the special things we (husband and me...,What special things ( husband ) 5 day stay Cap...,What be the special things we ( husband and me...,What special things ( husband ) 5 day stay Cap...,TTD
1,What are the companies which organize shark fe...,TTD,TTDOTH,What are the companies which organize shark fe...,What companies organize shark feeding events s...,What be the company which organize shark feed ...,What company organize shark feed events scuba ...,TTD
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA,Is it safe for female traveller to go alone to...,Is safe female traveller go alone Cape Town ?,Is it safe for female traveller to go alone to...,Is safe female traveller go alone Cape Town ?,TGU
3,What are the best places around Cape Town for ...,TTD,TTDSIG,What are the best places around Cape Town for ...,What best places around Cape Town safari ?,What be the best place around Cape Town for sa...,What best place around Cape Town safari ?,TTD
4,What are the best places to stay for a family ...,ACM,ACMOTH,What are the best places to stay for a family ...,What best places stay family stay away nightli...,What be the best place to stay for a family to...,What best place stay family stay away nightlife ?,ACM


In [20]:
data_df['processed_a'].value_counts()

TGU    1220
TTD    1140
TRS    1011
ACM     720
FOD     521
ENT     216
WTH     172
Name: processed_a, dtype: int64

# FastText

In [None]:
from gensim.models.wrappers import FastText
