In [11]:
import pandas as pd
import seaborn as sns
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

import pickle

In [2]:
data = pd.read_csv('fake_or_real_cleaned.csv')

In [4]:
class TextTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.str.lower().str.strip()
        translator = str.maketrans('', '', string.punctuation+'’‘—“”–')
        X = X.map(lambda x: x.strip().lower().translate(translator))
        stopwords = nltk.corpus.stopwords.words('english')
        wordnet_lemmatizer = WordNetLemmatizer()
        X = X.map(word_tokenize).apply(lambda x: [word for word in x if word not in stopwords])
        X = X.apply(lambda x: [wordnet_lemmatizer.lemmatize(w, pos="v") for w in x])
        X = X.apply(lambda x: ' '.join(x)).to_numpy()
        return X

In [7]:
pipeline = Pipeline([
    ("text", TextTransformer()),
    ("tfidf", TfidfVectorizer()),
    ("clf", PassiveAggressiveClassifier())
])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data['text'],data['label'], random_state=0)

In [9]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('text', TextTransformer()), ('tfidf', TfidfVectorizer()),
                ('clf', PassiveAggressiveClassifier())])

In [10]:
pipeline.score(X_test,y_test)

0.922003804692454

In [13]:
pickle.dump(pipeline, open('pipeline', 'wb'))