## NLP With Hotel Review Part 2

### Pallavi Chintaluri

In [1]:
# Import base packages. Other specific packages will be imported at the time of modelling

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Utilise helper functions to enhance visualizations

def PlotBoundaries(model, X, Y, dot_size=20, figsize=(10,7), step=0.01):
    '''
    Plot the decision regions of a model together with the data points (X, Y).

    Code modified from:
    https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html

    Parameters
    ----------
    model : object with a ``predict`` method
        ``predict`` is called once on a two-column array holding every grid
        point and must return one value per grid point.
    X : array-like with at least two columns
        Only the first two columns are used (horizontal and vertical axes).
        Anything ``np.asarray`` can turn into a 2-D array is accepted
        (ndarray, DataFrame, nested list).
    Y : array-like
        Passed to ``plt.scatter`` as ``c`` to colour the points.
    dot_size : int, default 20
        Marker size passed to ``plt.scatter`` as ``s``.
    figsize : tuple, default (10, 7)
        Figure size passed to ``plt.figure``.
    step : float, default 0.01
        Spacing of the prediction grid. The number of grid points grows with
        (range / step) squared, so use a larger step for wide feature ranges.

    Raises
    ------
    ValueError
        If ``step`` is not strictly positive.
    '''
    if step <= 0:
        raise ValueError("step must be a positive number")

    # Coerce once so positional slicing works for DataFrames and lists too;
    # X[:, 0] is only valid on NumPy arrays.
    points = np.asarray(X)

    # Pad the plotted area by one unit on every side of the data.
    x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
    y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))

    # Predict every grid point in one call, then fold back into the grid shape.
    Z = np.asarray(model.predict(np.c_[xx.ravel(), yy.ravel()]))
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=figsize)
    plt.contourf(xx, yy, Z, alpha=0.4)

    # Overlay the observations on top of the predicted regions.
    plt.scatter(points[:, 0], points[:, 1], c=Y, s=dot_size, edgecolor='k')
    plt.show()

In [10]:
# Import data
from pathlib import Path

from sklearn.model_selection import train_test_split

# Single place to change where the cleaned CSVs live; adjust for your machine.
DATA_DIR = Path(r'C:\Users\palla\clean_data')

# One seed shared by every split in this cell so a re-run reproduces the
# exact same train / validate / sample partitions.
RANDOM_STATE = 1

clean_test_df = pd.read_csv(DATA_DIR / 'clean_test_dataframe.csv')
clean_train_df = pd.read_csv(DATA_DIR / 'clean_train_dataframe.csv')

# Separate X and y variables for the two datasets
# The training data is called remain data to facilitate train-validate split for later questions
X_remain = clean_train_df.drop(columns = 'rating')
y_remain = clean_train_df['rating']

X_test = clean_test_df.drop(columns = 'rating')
y_test = clean_test_df['rating']

# Create train and validate sets: 70% of the remain data for training,
# 30% held out.
X_train, X_validate, y_train, y_validate = \
    train_test_split(X_remain, y_remain, test_size = 0.3,
                     random_state=RANDOM_STATE)

# Create a small sample set as well by halving the held-out 30%.
# Seeded like the split above; without random_state this partition would
# differ on every run.
X_validate, X_sample, y_validate, y_sample = \
    train_test_split(X_validate, y_validate, test_size = 0.5,
                     random_state=RANDOM_STATE)