In [2]:
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegressionCV




In [6]:
import pandas as pd

In [7]:
df = pd.read_csv("/Users/sanju/Desktop/Own Projects/data/corona_fake.csv")


In [8]:
df.loc[df['label'] == 'Fake', ['label']] = 'FAKE'
df.loc[df['label'] == 'fake', ['label']] = 'FAKE'
df.loc[df['source'] == 'facebook', ['source']] = 'Facebook'
df.text.fillna(df.title, inplace=True)

df.loc[5]['label'] = 'FAKE'
df.loc[15]['label'] = 'TRUE'
df.loc[43]['label'] = 'FAKE'
df.loc[131]['label'] = 'TRUE'
df.loc[242]['label'] = 'FAKE'

df = df.sample(frac=1).reset_index(drop=True)
df.title.fillna('missing', inplace=True)
df.source.fillna('missing', inplace=True)

df['title_text'] = df['title'] + ' ' + df['text']

In [9]:
df['label'].value_counts()

TRUE    586
FAKE    578
Name: label, dtype: int64

In [10]:
df.head()

Unnamed: 0,title,text,source,label,title_text
0,DEPOP TAG-TEAM: Anthony Fauci joins Bill Gates...,In lockstep with Mr. Microsoft (Bill Gates) hi...,https://www.naturalnews.com/,FAKE,DEPOP TAG-TEAM: Anthony Fauci joins Bill Gates...
1,dr fauci caught sponsoring wuhan lab millions,"But just last year, the National Institute for...",https://www.citadelpoliticss.com/,FAKE,dr fauci caught sponsoring wuhan lab millions ...
2,Are there any specific medicines to prevent or...,"To date, there is no specific medicine recomme...",https://www.who.int/,TRUE,Are there any specific medicines to prevent or...
3,Should I go to the doctor or dentist for nonur...,"During this period of social distancing, it is...",https://www.health.harvard.edu/,TRUE,Should I go to the doctor or dentist for nonur...
4,Do we know if the virus can enter through the ...,"There is no evidence yet that SARS-CoV-2, the ...",https://www.globalhealthnow.org/,TRUE,Do we know if the virus can enter through the ...


In [11]:
df['title_text'][50]


"SARS-CoV-2 — A Biological Warfare Weapon? SARS-CoV-2 — A Biological Warfare Weapon. “Novel coronavirus” means it is a new virus not previously known to previously infect humans. The currently held conventional view is that SARS-CoV-2 was transmitted through animals (zoonotic transmission), specifically bats. Boyle dismissed this notion in our initial interview, and still refutes the idea. While a widely-cited paper,2 published in the Nature journal on February 3, 2020, claims to establish that SARS-CoV-2 is a coronavirus of bat origin that then jumped species, the work of one of the authors of that paper, Shi Zhengli, actually involved the weaponization of the SARS virus. (Another Nature paper3 published that same day reiterates the idea that the COVID-19 pandemic is zoonotically transmitted.) However, according to Boyle, other scientific literature establishes that this is indeed an engineered synthetic virus that was not transmitted from animals to humans without human intervention.

In [12]:
def preprocessor(text):
    
    text = re.sub('<[^>]*>', '', text)
    text = re.sub(r'[^\w\s]','', text)
    text = text.lower()

    return text

In [13]:
df['title_text'] = df['title_text'].apply(preprocessor)

In [14]:
df['title_text'][50]

'sarscov2  a biological warfare weapon sarscov2  a biological warfare weapon novel coronavirus means it is a new virus not previously known to previously infect humans the currently held conventional view is that sarscov2 was transmitted through animals zoonotic transmission specifically bats boyle dismissed this notion in our initial interview and still refutes the idea while a widelycited paper2 published in the nature journal on february 3 2020 claims to establish that sarscov2 is a coronavirus of bat origin that then jumped species the work of one of the authors of that paper shi zhengli actually involved the weaponization of the sars virus another nature paper3 published that same day reiterates the idea that the covid19 pandemic is zoonotically transmitted however according to boyle other scientific literature establishes that this is indeed an engineered synthetic virus that was not transmitted from animals to humans without human intervention for starters a lancet paper4 publis

In [15]:
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [16]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None,
                        tokenizer=tokenizer_porter,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)
X = tfidf.fit_transform(df['title_text'])
y = df.label.values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5, shuffle=False)

clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, y_train)

fake_news_model = open('fake_news_model.sav', 'wb')
pickle.dump(clf, fake_news_model)
fake_news_model.close()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.0s finished


In [18]:
filename = 'fake_news_model.sav'
saved_clf = pickle.load(open(filename, 'rb'))

saved_clf.score(X_test, y_test)

0.9106529209621993

In [19]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = clf.predict(X_test)
print("---Test Set Results---")
print("Accuracy with logreg: {}".format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

---Test Set Results---
Accuracy with logreg: 0.9106529209621993
              precision    recall  f1-score   support

        FAKE       0.91      0.91      0.91       289
        TRUE       0.91      0.91      0.91       293

    accuracy                           0.91       582
   macro avg       0.91      0.91      0.91       582
weighted avg       0.91      0.91      0.91       582

