### Importing all the required libraries....

In [93]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import wordnet
# import nltk
# nltk.download('words')
from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import math
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


# 1. Selecting three categories and scrapping through the website to create a dataset....

##### Assigning the 3 URLs required....

In [94]:
url1_auto = 'http://mlg.ucd.ie/modules/yalp/cafes_list.html'
url2_hotels = 'http://mlg.ucd.ie/modules/yalp/hotels_list.html'
url3_fashion = 'http://mlg.ucd.ie/modules/yalp/restaurants_list.html'

### Function to fetch the data from the site.... 

In [95]:
# creating a function to fetch the data from the website and store, clean it and store it in proper columns - 'Text' and 'Rating'.... 

def populate_dataframe(url):
    
    response = get(url)
    html_soup = BeautifulSoup(response.text,'html.parser')
    #to fetch all the links 
    links = html_soup.find_all('h5')
    # storing the link's pre details in a seperate variable....
    link_pre = 'http://mlg.ucd.ie/modules/yalp/'
    
    
    for i in range(len(links)):
        if i == 0: # to define the data frame for the first time
            data_set1 = pd.DataFrame(columns=['Rating'])
            data_set2 = pd.DataFrame(columns=['Text'])
            
        url = link_pre+(links[i].find('a')['href']) # to store the individual link in the url that can be used to call and fetch the related data
        #Fetching the data from the website and parsing the Beautiful_soup function through it
        response = get(url)
        html_soup = BeautifulSoup(response.text,'html.parser')
        # finding just the 'rating' and 'review' from the entire parsed webpage
        rating = html_soup.find_all(class_='rating')
        review = html_soup.find_all(class_='review-text')
        # Loop to iterate through the webpage
        for x in range(len(review)):
            data_set2 = data_set2.append({'Text':review[x].text},ignore_index=True)# to append only the text value in the dataset
            rate = (rating[x].find('img')['alt'])# to get only ratings
            data_set1 = data_set1.append({'Rating':rate},ignore_index=True) # to store only the ratings
    
    data_set1['Text'] = data_set2
    return data_set1 
    
            





### Calling the function....

In [96]:
# Calling function to populate th data frames

data_auto = populate_dataframe(url1_auto)
data_fash = populate_dataframe(url2_hotels)
data_hotel = populate_dataframe(url3_fashion)

### Adding a lable column to assign the overall sentiment for the dataset....

In [97]:
# adding one column to store the lable "positive" and "negative"

def rating_label(rating):
    if (rating == '3-star')or(rating=='2-star')or(rating=='1-star'):
        return "negative"
    else :
        return "positive"

data_auto['class_label'] = data_auto.apply(lambda i:rating_label(i['Rating']),axis=1)
data_fash['class_label'] = data_fash.apply(lambda i:rating_label(i['Rating']),axis=1)
data_hotel['class_label'] = data_hotel.apply(lambda i:rating_label(i['Rating']),axis=1)

# 2. Preprocessing and applying the TFIDF vectoriser to each of the dataset, fitting the dataset to a model and evaluating its performance....

### Functions for preprocessing the data/ cleaning the data....

In [98]:
# Starting of the preprocessing steps for the dataframes

# Lower Case  the text elements to ensure uniformity


def lower_case_data(rating):
    new_text = ""
    for i in rating:
        new_text = new_text + "" + i.lower()
    return new_text


# Removing the stop words as they don't provide any additional value to the dataset
stpwrd = stopwords.words('english')

def remove_stop_words(rating):
    rating = word_tokenize(rating)
    new_text = ""
    for i in rating:
        if i not in stpwrd:
            new_text = new_text + " " + i
        
    return new_text    

# Removing the unnessary symbols from the dataset
symbols = "!\"'#$%&()*+-./:;<=>?@[\]^_`{|}~\n"

def remove_punct(rating):
    new_text = ""
    for i in rating:
        if i not in symbols:
            new_text = new_text + i
        #rating = np.char.replace(rating,i,' ')
    return new_text

######### Can add logic to remove the Apostrophy

# Removing numbers from the dataset
numbers = "0123456789"

def remove_num(rating):
    new_text = ""
    for i in rating:
        if i not in numbers:
            new_text = new_text + i
    return new_text


# Removing Single Character from the dataset

def remove_single_chr(rating):
    rating = word_tokenize(rating)
    new_text = ""
    for i in rating:
        if len(i) > 1:
            new_text = new_text + " " + i
    return new_text 

# Stemming the words to its verb or noun word (Stemming) returns word that can/cannot be in the dictionary
# If using lemmatization, use it first then use stemmers

ps = PorterStemmer()

def stemming_data(rating):
    new_text = ""
    rating = word_tokenize(rating)
    for i in rating:
        new_text = new_text + " " + ps.stem(i)
    return new_text




### Calling a single function to complete all the other preprossesing steps in one go....

In [99]:
# definig one function to carry out all sub-function operations
def preproccesing_data(dataframe):
    dataframe['Text'] = dataframe.apply(lambda i:lower_case_data(i['Text']),axis=1)
    dataframe['Text'] = dataframe.apply(lambda i:remove_punct(i['Text']),axis=1)
    dataframe['Text'] = dataframe.apply(lambda i:remove_num(i['Text']),axis=1)
    dataframe['Text'] = dataframe.apply(lambda i:remove_single_chr(i['Text']),axis=1)
   # dataframe['Text'] = dataframe.apply(lambda i:remove_non_eng(i['Text']),axis=1)
    dataframe['Text'] = dataframe.apply(lambda i:remove_stop_words(i['Text']),axis=1)
    dataframe['Text'] = dataframe.apply(lambda i:stemming_data(i['Text']),axis=1)
    
    

In [100]:
preproccesing_data(data_auto)
preproccesing_data(data_fash)
preproccesing_data(data_hotel)
       

In [101]:
# to validate....
data_auto.head()

Unnamed: 0,Rating,Text,class_label
0,4-star,pro lot item would expect chines bakeri frien...,positive
1,4-star,best eggtart town there realli much say hong ...,positive
2,2-star,ive abc bakeri time sinc read one top place e...,negative
3,1-star,fyi close monday new ownership week new menu ...,negative
4,4-star,insid may look like much make mean egg tart g...,positive


### Applying the SkLearn method....

### Creating an instance of the classifiers....

In [102]:
#Create a Classifiers

log = LogisticRegression(solver='lbfgs')
ran = RandomForestClassifier()

#### Creating a function to vectorise the dataset....

In [103]:
# trying the sklearn method

def vectorizer(dataset):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(dataset['Text'])
    
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    dataset_auto_vectorised = pd.DataFrame(denselist, columns=feature_names)
    return dataset_auto_vectorised

#### Calling the vectorisere function....

In [104]:
data_auto_vec = vectorizer(data_auto)
data_fash_vec = vectorizer(data_fash)
data_hotel_vec = vectorizer(data_hotel)

### Function to filter, predict and evaluate results in one function....

In [105]:
def fitter_predict(dataset,original_dataset):
    X_train, X_test, y_train, y_test = train_test_split(dataset,original_dataset['class_label'],test_size=0.30)

    log.fit(X_train,y_train)

    y4_pred = log.predict(X_test)

    print("Accuracy log:",metrics.accuracy_score(y_test, y4_pred))
#     print("Accuracy ran:",metrics.accuracy_score(y_test, y5_pred))






### Calling function to print the results....

In [106]:
print('Results for Dataset - Cafe')
fitter_predict(data_auto_vec,data_auto)
print('Results for Dataset - Restaurants')
fitter_predict(data_fash_vec,data_fash)
print('Results for Dataset - Hotel')
fitter_predict(data_hotel_vec,data_hotel)

Results for Dataset - Cafe
Accuracy log: 0.8016666666666666
Results for Dataset - Restaurants
Accuracy log: 0.8766666666666667
Results for Dataset - Hotel
Accuracy log: 0.87


#### We have seen the use of the metric accuracy score for this metric, while using the logistic regression for applying the fitting and predicting model, an accuracy of 82% to 89%, is achieved. While compared with other estimators, this is the highest. The Dataset Restaurant returns the highest accuracy among the three. We have used the metrics.accuracy_score method here to calculate the accuracy of the dataset, it computes the subset accuracy,the set of labels predicted for a sample must exactly match the corresponding set of labels, all the correct predictors are given as the ration of the total predictons made...

# 3. Evaluating the performance of each of the three classificaions when called on each of the dataset....

### Creating a function to apply the fit and transform methods on the dataset

#### The overall result will be less as compared to the individual dataset as the vectorized datasets have irregular number of columns (three of them will have different number of columns) and when we try to fit and transform, all the extra data will be lost, leading to reduction in the accuracy score.
#### As mentioned, we have selected data from similar backgrounds - Restaurants, hotels and Cafes, hence we can expect better scores than what we could have got if we choose dis-similar datasets like automobile, Gym and hotel.

In [107]:
def fit_transform_predict_evaluate(dataset_fit,dataset_trans1,dataset_trans2):
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit(dataset_fit['Text'])
    trans_a = vectors.transform(dataset_fit['Text'])
    trans_b = vectors.transform(dataset_trans1['Text'])
    trans_c = vectors.transform(dataset_trans2['Text'])
    # to create a new dataset using the reduced/increased columns
    feature_names_a = vectors.get_feature_names()
    dense_a = trans_a.todense()
    dense_b = trans_b.todense()
    dense_c = trans_c.todense()
    # to generate a list item....
    denselist_a = dense_a.tolist()
    denselist_b = dense_b.tolist()
    denselist_c = dense_c.tolist()
    # to generate the new datasets....
    data_set_train = pd.DataFrame(denselist_a, columns=feature_names_a)
    data_set_test1 = pd.DataFrame(denselist_b,columns=feature_names_a)
    data_set_test2 = pd.DataFrame(denselist_c,columns=feature_names_a)
    
    # fitting using log regression....
    ran = RandomForestClassifier()
    
    #Create a Classifiers
    
    log = LogisticRegression(solver='lbfgs')
    
    log.fit(data_set_train,dataset_fit['class_label'])
    
    # predicting the results
    y2_pred = log.predict(data_set_test1)
    y3_pred = log.predict(data_set_test2)
    
    
    
    
    # Evaluating the results....
    
    print("Accuracy #1 :",metrics.accuracy_score(dataset_trans1['class_label'], y2_pred))
    print("Accuracy #2:",metrics.accuracy_score(dataset_trans2['class_label'], y3_pred))
   

   
    
    

In [108]:
print('When choosing the dataset "Cafe" to train the model and to predict the dataset #1"Restaurants" and #2"Hotel" we receive the below Accuracy Scores  ')
fit_transform_predict_evaluate(data_auto,data_fash,data_hotel)

print('When choosing the dataset "Hotel" to train the model and to predict the dataset #1"Restaurants" and #2"Cafe" we receive the below Accuracy Scores  ')
fit_transform_predict_evaluate(data_hotel,data_fash,data_auto)

print('When choosing the dataset "Restaurants" to train the model and to predict the dataset #1"Cafe" and #2"Hotel" we receive the below Accuracy Scores  ')
fit_transform_predict_evaluate(data_fash,data_auto,data_hotel)

When choosing the dataset "Cafe" to train the model and to predict the dataset #1"Restaurants" and #2"Hotel" we receive the below Accuracy Scores  
Accuracy #1 : 0.8165
Accuracy #2: 0.85
When choosing the dataset "Hotel" to train the model and to predict the dataset #1"Restaurants" and #2"Cafe" we receive the below Accuracy Scores  
Accuracy #1 : 0.817
Accuracy #2: 0.86
When choosing the dataset "Restaurants" to train the model and to predict the dataset #1"Cafe" and #2"Hotel" we receive the below Accuracy Scores  
Accuracy #1 : 0.8325
Accuracy #2: 0.816


## As seen from the above scores, we have found an effective model that could train the dataset, and get results upto as high as 86%, even when there are extra words in the dataset, which are erstwhile not present in the training dataset, this happens because we fit and transform the training dataset into the testing dataset, resulting into the same feature names being considered.