# Project 5: Group Project
#### Author: Adam Pardo, Brandon Bergeron, Eric Bayless, Ramesh Babu

### 02 - ML modeling  

Task: Comparing different Maching Learning models on our data

Information: 

In [40]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

## Functions

In [63]:
#---Custom Lemmatizer takes df and adds lemmatized review column for ngram plotting

def custom_lemmatize(df, stop_words=False):
    """
    Takes a df of reviews and lemmatizes them for plotting and EDA
    
    ARGS:
    
        df: DataFrame with reviews
        stop_words (bool): whether or not to remove stop_words (default=False)
    """
    
    #--instantiate Lemmatizer and new dictionary to house altered posts
    lemmatizer = WordNetLemmatizer()
    df['lemmatized'] = None
    
    #--iterates over reviews in df
    for i in range(0,len(df['text'])):
        
        #--converting review to lowercase and list of stopwords
        
        post = df['text'][i]
        post = post.lower()
        stops = stopwords.words('english')

        #--stripping the post of non-alpha-numeric characters
        
        exclusion_list = ['[^a-zA-Z]']
        exclusions = '|'.join(exclusion_list)
        post = re.sub(exclusions, ' ' , post)
        
        #---Lemmatizing words, removing stop_words if specified when called
        if stop_words == True:
            words = [lemmatizer.lemmatize(word) for word in post.split() if word not in stops]
        else:
            words = [lemmatizer.lemmatize(word) for word in post.split()]
            
        #---Adding lemmatized post to dictionary
        df['lemmatized'][i] = ' '.join(words)

In [65]:
#---CountVectorizes and pulls highest occuring ngrams for given ngram range

def plot_ngrams(df, ngrams, amount):
    
    '''
    Takes in a df of posts and plots the highest occuring given ngram range
    
    ARGS:
    
        df: dataframe
        ngrams (tuple): ngram range desired
        amount (int): number of highest occuring ngrams desired
    '''

    #---CountVectorizing words to get importances
    cvect = CountVectorizer(stop_words='english', ngram_range=ngrams, max_features=500)
    counts = cvect.fit_transform(df['lemmatized'])
    
    df_cvect = pd.DataFrame(counts.toarray(), columns = cvect.get_feature_names())
    
    df_cvect.sum().sort_values(ascending=False).head(amount).plot(kind='barh');

In [54]:
#--formatted printing for model scores

def print_scores(model):
    print(f'train score: {model.score(x_train, y_train)}')
    print(f'test score: {model.score(x_test, y_test)}')

## Reading in reviews and combining

In [49]:
# combines all reviews for each restaurant into one observation

#-read in reviews
df_reviews = pd.read_csv('./data/Las_Vegas_400_reviews.csv')

#--combine all reviews
df_revs_combined = df.groupby(['business_id', 'name', 
                               'address', 'city' ,
                               'state', 'postal_code', 
                               'latitude' ,'longitude' , 
                               'stars', 'review_count', 
                               'is_open', 'attributes', 'categories']).agg({'text': ' '.join})

#--reset index and add review length column for total 
df_revs_combined = df_revs_combined.reset_index()
df_revs_combined['review_wc'] = df_revs_combined['text'].str.split().str.len()

In [53]:
df_revs_combined.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,text,review_wc
0,-0RkJ_uIduNLWQrphbADRw,Rooster Boy Cafe,"2620 Regatta Dr, Ste 113",Las Vegas,NV,89128,36.207539,-115.268154,4.0,194,1,"{'WheelchairAccessible': 'True', 'RestaurantsP...","Coffee & Tea, Restaurants, Cafes, Food, Breakf...",Amazing food and service. So grateful for the ...,24200


# Modeling

In [34]:
pipe = make_pipeline(CountVectorizer(max_features=500), 
                     StandardScaler(with_mean=False),
                     RandomForestClassifier()
                    )

In [36]:
x = df_temp_all['text']
y = df_temp_all['is_open']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=251)

In [37]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer(max_features=500)),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('randomforestclassifier', RandomForestClassifier())])

In [39]:
print_scores(pipe)

train score: 1.0
test score: 0.6910994764397905
