In [1]:
import numpy as np
import pandas as pd

In [None]:
# Load the Yelp reviews into the `yelp` DataFrame that every later cell uses.
# NOTE(review): no cell in this notebook defines `yelp`, so on a fresh kernel
# this cell used to fail with NameError. The file name below is an assumption --
# point it at wherever the reviews CSV actually lives.
yelp = pd.read_csv('yelp.csv')
yelp.head()

In [None]:
# Print a concise summary of the review table (columns, dtypes, non-null counts).
yelp.info()

In [None]:
# Descriptive statistics for the columns of the review table.
yelp.describe()

In [None]:
# Character count of each review, stored in a new 'text length' column.
# The vectorised .str accessor replaces the per-row apply(len); it gives the
# same counts for string values and yields NaN for a missing review instead
# of raising TypeError.
yelp['text length'] = yelp['text'].str.len()

## EXPLORATORY DATA ANALYSIS

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'white' style for the figures drawn below.
sns.set_style('white')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Histograms of review length, one panel per star rating:
# FacetGrid lays out one column for each distinct value of 'stars',
# and map() draws plt.hist of 'text length' inside every panel.
g = sns.FacetGrid(data=yelp, col='stars')
g.map(plt.hist, 'text length')

In [None]:
# Box plot of review length for each star rating.
sns.boxplot(data=yelp, x='stars', y='text length', palette='rainbow')

In [None]:
# Number of reviews recorded for each star rating.
sns.countplot(data=yelp, x='stars', palette='rainbow')

In [None]:
# Mean of every numeric column, grouped by star rating.
# numeric_only=True is required: from pandas 2.0 onward GroupBy.mean() no longer
# drops non-numeric columns silently, so the text columns of `yelp` would make
# a bare .mean() raise TypeError.
stars = yelp.groupby('stars').mean(numeric_only=True)
stars

In [None]:
# Pairwise correlation between the columns of the per-rating means table.
stars.corr()

In [None]:
# Heatmap of that correlation matrix, with each coefficient written in its cell.
sns.heatmap(data=stars.corr(), annot=True, cmap='coolwarm')

### NLP Classification Task
To make things a little easier, go ahead and only grab reviews that were either 1 star or 5 stars.

Create a dataframe called yelp_class that contains the columns of yelp dataframe but for only the 1 or 5 star reviews.

In [None]:
# Keep only the reviews rated exactly 1 star or exactly 5 stars.
yelp_class = yelp[yelp['stars'].isin([1, 5])]

#### Create two objects X and y. X will be the 'text' column of yelp_class and y will be the 'stars' column of yelp_class. (Your features and target/labels.)

In [None]:
# Features are the raw review text; labels are the star rating.
X, y = yelp_class['text'], yelp_class['stars']

##### **Import CountVectorizer and create a CountVectorizer object.**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

#### **Use the fit_transform method on the CountVectorizer object and pass in X (the 'text' column). Save this result by overwriting X.**

In [None]:
from sklearn.model_selection import train_test_split

# Convert the review text into a sparse document-term count matrix and store
# it in X. Without this step X is still a Series of raw strings, and the
# MultinomialNB fit further down cannot train on it.
# Vectorising from yelp_class['text'] rather than from X keeps this cell
# re-runnable: running it twice does not try to vectorise a matrix.
X = cv.fit_transform(yelp_class['text'])

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

#### Training a Model
Time to train a model!

**Import MultinomialNB, create an instance of the estimator, and call it nb.**

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyper-parameters.
nb = MultinomialNB()

###### **Now fit nb using the training data.**

In [None]:
# Train the classifier on the training split.
nb.fit(X_train,y_train)

#### Predictions and Evaluations
Time to see how our model did!

Use the predict method off of nb to predict labels from X_test.

In [None]:
# Predicted labels for the held-out test split.
predictions = nb.predict(X_test)

####  Create a confusion matrix and classification report using these predictions and y_test 

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
# Confusion matrix, a blank gap, then the classification report.
# A single print with sep='\n' emits exactly the same text as three separate
# print calls would.
print(
    confusion_matrix(y_test, predictions),
    '\n',
    classification_report(y_test, predictions),
    sep='\n',
)

#### Using Text Processing

**Import TfidfTransformer from sklearn.**

In [None]:
from sklearn.feature_extraction.text import  TfidfTransformer

In [None]:
#Import Pipeline from sklearn
from sklearn.pipeline import Pipeline

#### Now create a pipeline with the following steps: CountVectorizer(), TfidfTransformer(), MultinomialNB()

In [None]:
# Text-classification pipeline: raw strings -> token counts -> TF-IDF -> Naive Bayes.
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer()),        # strings to integer token counts
    ('tfidf', TfidfTransformer()),     # counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),   # Naive Bayes trained on the TF-IDF vectors
])

#### Using the Pipeline
Time to use the pipeline! Remember that this pipeline already contains all of the pre-processing steps, so we need to re-split the original data. (Recall that we overwrote X with the CountVectorized version; what we need now is just the raw text.)

#### Train Test Split
Redo the train test split on the yelp_class object.

In [None]:
# Start again from the raw review text: the pipeline does its own vectorising,
# so it must be fed strings rather than a count matrix.
X, y = yelp_class['text'], yelp_class['stars']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

#### Now fit the pipeline to the training data. Remember you can't use the same training data as last time because that data has already been vectorized. We need to pass in just the text and labels.

In [None]:
# Fit every pipeline step on the training text and labels (may take some time).
pipeline.fit(X_train,y_train)

#### Predictions and Evaluation
**Now use the pipeline to predict from X_test and create a classification report and confusion matrix. You should notice strange results.**

In [None]:
# Predicted labels for the held-out text, produced by the full pipeline.
predictions = pipeline.predict(X_test)

In [None]:
# Confusion matrix followed directly by the classification report.
print(
    confusion_matrix(y_test, predictions),
    classification_report(y_test, predictions),
    sep='\n',
)