In [27]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled news articles; the path is relative to the notebook's
# working directory.
news_csv_path = "dataset/news.csv"
df = pd.read_csv(news_csv_path)

In [5]:
# Show the column labels of the loaded frame. The 'Unnamed: 0' column is
# presumably a row index that was written into the CSV — TODO confirm.
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [6]:
# Build one text field per article from headline + body.
# Missing values are replaced with "" before the cast: astype(str) on its own
# turns NaN into the literal token "nan", which TF-IDF would then treat as a
# real word and which would also hide the gap from any later null check on
# this column.
df["content"] = (
    df["title"].fillna("").astype(str)
    + " "
    + df["text"].fillna("").astype(str)
)

In [7]:
# Preview the first rows, including the "content" column derived above.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,content
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,The Battle of New York: Why This Primary Matte...


In [8]:
# Count missing values in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
missing_per_column

Unnamed: 0    0
title         0
text          0
label         0
content       0
dtype: int64

In [9]:
# Count rows that exactly repeat an earlier row across all columns.
duplicate_row_count = df.duplicated().sum()
duplicate_row_count

np.int64(0)

In [11]:
# Hold out 20% of the articles for evaluation. The fixed random_state makes
# the shuffle, and therefore the split, repeatable across runs.
features = df['content']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Sanity-check the split sizes: one line for the features, one for the
# labels, each printed as (test shape, train shape).
for held_out, fitted in ((X_test, X_train), (y_test, y_train)):
    print(held_out.shape, fitted.shape)

(1267,) (5068,)
(1267,) (5068,)


In [14]:
# TF-IDF features: drop English stop words and ignore any term that appears
# in more than 70% of the training documents.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is just projected onto them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [15]:
# Logistic-regression classifier on the TF-IDF features. max_iter is raised
# to 1000, presumably to give the solver room to converge — TODO confirm.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_tfidf, y=y_train)

In [20]:
# Score the classifier on the held-out split.
y_pred = model.predict(X_test_tfidf)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# labels, sample_weight and normalize were all being passed as None, which is
# what confusion_matrix already uses by default, so they are omitted.
# NOTE: the misspelled variable name is kept because the next cell prints it.
confustion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [21]:
# Report accuracy first, then the confusion matrix on the following lines.
print(acc, confustion_matrix, sep="\n")

0.9139700078926598
[[586  42]
 [ 67 572]]


In [28]:
# Persist the fitted classifier and its vectorizer so they can be reloaded
# together for inference. The output directory is created first: open(...,
# "wb") does not create missing parent directories and would raise
# FileNotFoundError on a fresh checkout.
models_dir = Path("models")
models_dir.mkdir(parents=True, exist_ok=True)

for filename, artifact in (("model.pkl", model), ("vectorizer.pkl", vectorizer)):
    with open(models_dir / filename, "wb") as f:
        pickle.dump(artifact, f)