In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
data_path = "data.csv"
test_data_path = "test.csv"
data_raw = pd.read_csv(data_path)
test_data = pd.read_csv(test_data_path)
print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
print("**Sample data:**")
data_raw.head()

Number of rows in data = 33944
Number of columns in data = 12


**Sample data:**


Unnamed: 0,City,Place,Review,Adventure & Outdoors,Spiritual,Nature & Retreat,Isolated or Hippie,Heritage,Travel & Learn,Social Tourism (Volunteer & Travel),Nightlife & Events,Shopping
0,Agra,Rajeshwar Mandir,Though the city does not have much this kind o...,0,1,1,0,0,0,0,0,0
1,Agra,Rajeshwar Mandir,"When we plan a trip to Agra, we plan to visit ...",0,1,1,0,0,0,0,0,0
2,Agra,Rajeshwar Mandir,Any ancient monument specially Mandir's have t...,0,1,1,0,0,0,0,0,0
3,Agra,Rajeshwar Mandir,"This is a very Old Shiva Temple ,earlier it wa...",0,1,1,0,0,0,0,0,0
4,Agra,Rajeshwar Mandir,A very beautiful Mandir has to loose its grace...,0,1,1,0,0,0,0,0,0


In [6]:
categories = list(data_raw.columns.values)[3:12]
print(categories)

['Adventure & Outdoors', 'Spiritual', 'Nature & Retreat', 'Isolated or Hippie', 'Heritage', 'Travel & Learn', 'Social Tourism (Volunteer & Travel)', 'Nightlife & Events', 'Shopping']


In [7]:
#Data Pre-Processing

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
data = data_raw
if not sys.warnoptions:
    warnings.simplefilter("ignore")
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext
def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned
def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent
test_data['Review'] = test_data['Review'].str.lower()
test_data['Review'] = test_data['Review'].apply(cleanHtml)
test_data['Review'] = test_data['Review'].apply(cleanPunc)
test_data['Review'] = test_data['Review'].apply(keepAlpha)

In [8]:
#Removing stop words
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)
test_data['Review'] = test_data['Review'].apply(removeStopWords)

In [9]:
#Stemming
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence
test_data['Review'] = test_data['Review'].apply(stemming)

In [15]:
# test and train data partitioning...

from sklearn.model_selection import train_test_split

#train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
original_test_data = test_data
test = test_data
print(train.shape)
print(test.shape)

(23760, 12)
(2, 1)


In [18]:
train_text = train['Review']
test_text = test['Review']
#print("trian")
#print(train)
print("test")
print(test_text)

test
0    though citi much kind old hindu structur symbo...
1    rajeshwar mahadev templ year old said shivalin...
Name: Review, dtype: object


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
x_train = vectorizer.transform(train_text)
#print("train_text")
#print(train_text)
#print("x_train")
#print(x_train)
y_train = train.drop(labels = ['Review'], axis=1)
print(y_train)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['Review'], axis=1)

            City                                    Place  \
1083        Agra                                Agra Fort   
17655       Agra                              Mehtab Bagh   
29791  Rishikesh                             Triveni Ghat   
9161        Agra                                Agra Fort   
22878  Mussoorie                                Lal Tibba   
27866  Rishikesh                       The Beatles Ashram   
14611       Agra                              Anguri Bagh   
20271  Mussoorie                             Kempty Falls   
13205       Agra                 Tomb of Itimad-ud-Daulah   
23532    Pushkar                             Pushkar Lake   
7172        Agra                                Agra Fort   
22051  Mussoorie                        Camel's Back Road   
13302       Agra                 Tomb of Itimad-ud-Daulah   
21236  Mussoorie                             Kempty Falls   
8874        Agra                                Agra Fort   
20145  Mussoorie        

In [25]:
#Multiple Binary Classifications - (One Vs Rest Classifier)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from IPython.display import Markdown, display
import pickle
def printmd(string):
    display(Markdown(string))

%time

# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

arrs = []
#our_model = pickle.dumps(LogReg_pipeline)
#LogReg_pipeline = pickle.loads(our_model)
trained_model = open("trained_model.txt", "r")
pipeline_array = list(trained_model.read())
#pipeline_array = trained_model.read()
#print(str(pickle.dumps(LogReg_pipeline)))
print(type(LogReg_pipeline))
for index in range(0,len(categories)):
    printmd('**Processing {} review...**'.format(categories[index]))
    
    # Training logistic regression model on train data
    #print("x_train")
    #print(x_train)
    #LogReg_pipeline.fit(x_train, train[category])
    LogReg_pipeline = pickle.loads(bytes(pipeline_array[index]))
    #print(train[category])
    # calculating test accuracy
    #print("x_test")
    #print(x_test)
    prediction = LogReg_pipeline.predict(x_test)
    arrs.append(prediction)
    print("Prediction: ")
    print(prediction)
    #print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

output_array = []
output_array.append(['Review','Adventure & Outdoors', 'Spiritual', 'Nature & Retreat', 'Isolated or Hippie', 'Heritage', 'Travel & Learn', 'Social Tourism (Volunteer & Travel)', 'Nightlife & Events', 'Shopping'])
test_review = original_test_data["Review"].values
for index in range(0,len(test_review)):
    row = []
    row.append(test_review[index])
    for arr in arrs:
        row.append(arr[index])
    output_array.append(row)

    
with open('output.csv', 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(output_array)    
#print(output_array)    

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
<class 'sklearn.pipeline.Pipeline'>


**Processing Adventure & Outdoors review...**

TypeError: string argument without an encoding