## Libraries
(All required libraries are listed in `requirements.txt`.)

In [1]:
# MAIN
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from collections import Counter
import gc
import pickle
from scripts.storing import *

# NLP
import spacy
import string
import nltk
from nltk.corpus import stopwords
from scripts.cleanup import CleanUpText

# SKLEARN
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, make_scorer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data loading and Labels to Numbers

In [2]:
# Load the review corpus from the CSV shipped alongside the notebook.
reviews = pd.read_csv('deceptive-opinion.csv')

# Encode both label columns as integers, vectorized instead of row-wise:
#   deceptive -> 0 for 'truthful', 1 for every other value
#   polarity  -> 1 for 'positive', 0 for every other value
reviews['deceptive'] = reviews['deceptive'].ne('truthful').astype('int64')
reviews['polarity'] = reviews['polarity'].eq('positive').astype('int64')

# Preview the first rows to confirm the encoding.
reviews.head(3)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,0,conrad,1,TripAdvisor,We stayed for a one night getaway with family ...
1,0,hyatt,1,TripAdvisor,Triple A rate with upgrade to view room was le...
2,0,hyatt,1,TripAdvisor,This comes a little late as I'm finally catchi...


## IT'S A FRANKENSTEIN! (aka Pipeline)
P.S. The pipeline is also evaluated with a 10-fold cross-validated F1 score.

In [3]:
# End-to-end text classifier; stages run in the order listed.
sfilter = Pipeline(steps=[
    # Project-specific text preprocessing (see scripts/cleanup.py for details).
    ('cleanup_text', CleanUpText()),
    # Raw token counts (bag of words).
    ('bow', CountVectorizer()),
    # Re-weight the counts with TF-IDF.
    ('tfidf', TfidfTransformer(use_idf=True)),
    # Final classifier on top of the TF-IDF features.
    ('clf', LogisticRegression(solver='lbfgs')),
])

In [4]:
%%time
scores = cross_val_score(sfilter, reviews['text'], reviews['deceptive'], scoring=make_scorer(f1_score), cv=10)
print("Average F1 score (by 10 folds): %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Average F1 score (by 10 folds): 0.88 (+/- 0.06)
Wall time: 1min 6s


In [5]:
# Fit the pipeline on the complete dataset so that `sfilter` itself is trained
# before it is saved. The trailing semicolon hides the estimator repr without
# assigning to `_`, which IPython uses for the last cell output; binding it
# manually stops IPython from updating `_`, `__` and `___` afterwards.
sfilter.fit(reviews['text'], reviews['deceptive']);

## Saving the model

In [6]:
# Persist the fitted pipeline with the project's save_model helper
# (presumably provided by the `scripts.storing` star import; verify there).
model_path = 'models/sfilter.pkl'
save_model(sfilter, model_path)

Model was saved in models/sfilter.pkl
Use load_model to load model from file.
