In [1]:
import pandas as pd
import numpy as np

# Load the data

In [2]:
reviews_data = pd.read_csv('reviews.csv')
reviews_data = reviews_data[['review', 'polarity']]

# Drops rows that have missing values
reviews_data.dropna(axis=0, inplace=True)

# Converts the target variable data type to int type
reviews_data['polarity'] = reviews_data['polarity'].astype(int)

reviews_data.head(10)

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
5,idk i can't edit my posts? things such as my ...,0
6,major flaws constant updates and always getti...,0
7,video issues since i was forced into this upd...,0
8,this update completely destroyed my facebook...,0
9,"posting issues for the last week, there's bee...",0


# Let's look at some of the comments

In [3]:
reviews_data.iloc[2]['review']

" profile any time my wife or anybody has more than one post and i view them it would take me to there profile so that i can view them all at once. now when i try to view them it tells me that the page that i requested is not available. i've restarted my phone i even cleard the cache and i've uninstalled and reinstalled the app and it is still doing it. please fix it thank you"

# Count the number of spam and non-spam emails

In [4]:
reviews_data['polarity'].value_counts()

0    584
1    307
Name: polarity, dtype: int64

This is an imbalanced dataset. We use the AUC metric for model evaluation.

# Comparison of average lengths for spam and non-spam emails

In [5]:
reviews_data['length'] = reviews_data['review'].apply(len)
reviews_data.head()

Unnamed: 0,review,polarity,length
0,privacy at least put some option appear offli...,0,386
1,"messenger issues ever since the last update, ...",0,457
2,profile any time my wife or anybody has more ...,0,378
3,the new features suck for those of us who don...,0,193
4,forced reload on uploading pic on replying co...,0,466


In [6]:
reviews_data.groupby(by='polarity').mean()

  reviews_data.groupby(by='polarity').mean()


Unnamed: 0_level_0,length
polarity,Unnamed: 1_level_1
0,247.719178
1,201.729642


# Split the data into test and train sets

In [7]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(reviews_data['review'], 
                                                    reviews_data['polarity'], 
                                                    shuffle=True,
                                                    random_state=0)

# Gaussian Naive Bayes with CountVectorizer feature extraction

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score

def count_vectorizer_method():
    # Apply count vectorizer to each email
    vect = CountVectorizer(stop_words={'english'}).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    
    # Calls the Gaussian Naive Bayes 
    model = MultinomialNB(alpha=.2) # We note that we can always use other ML algorithms.
    model.fit(X_train_vectorized.toarray(), y_train)
    
    # Make predictions
    predictions = model.predict(vect.transform(X_test).toarray())
    
    # AUC score
    model_auc_score = roc_auc_score(y_test, predictions)

    return model, model_auc_score, vect

In [9]:
count_vectorizer_method()[1]

0.8122814685314685

# TF-IDF feature extraction

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def TF_IDF_vectorizer_method():
    # TF-IDF vecotrizer. Min_df is a threshold value for ignoring words that have a count of less than the given min_df value.
    vect = TfidfVectorizer(min_df=3, stop_words={'english'}).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    
    # Multinomial Naive Bayes
    model = MultinomialNB(alpha=.1)
    model.fit(X_train_vectorized, y_train)
    
    # Make predictions
    predictions = model.predict(vect.transform(X_test))
    
    # AUC score
    auc_score = roc_auc_score(y_test, predictions)
    
    return model, auc_score

In [11]:
TF_IDF_vectorizer_method()[1]

0.7825174825174825

# Let's test out the better model with some sample reviews.

In [13]:
_, model, vect = count_vectorizer_method()

comments = ['Really good movie', "I didn't enjoy the movie"]
predictions = count_vectorizer_method()[0].predict(vect.transform(comments).toarray())

review_type = ['negative', 'positive']
print([review_type[i] for i in predictions])

['positive', 'negative']
