In [27]:
import re
import string

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [28]:
# Load the two source corpora (fake and genuine articles) from the working directory.
fake, true = (pd.read_csv(csv_path) for csv_path in ('Fake.csv', 'True.csv'))

In [29]:
# Attach the target label to each corpus: 0 = fake article, 1 = true article.
fake = fake.assign(**{'class': 0})
true = true.assign(**{'class': 1})

In [30]:
# Stack both labelled corpora row-wise into one frame. Each part keeps its own
# row labels here, so labels repeat until the index is reset further down.
data = pd.concat((fake, true), axis="index")

In [31]:
# Peek at a random mix of rows from both classes; the seed keeps the displayed
# sample identical across Restart & Run All.
data.sample(10, random_state=42)

Unnamed: 0,title,text,subject,date,class
14023,Backlash among German MPs against parliamentar...,BERLIN (Reuters) - German lawmakers have prote...,worldnews,"November 23, 2017",1
16129,"Kenya's repeat presidential poll was free, fai...",NAIROBI (Reuters) - Kenya s repeat presidentia...,worldnews,"October 30, 2017",1
9306,Thousands of voters in limbo after Kansas dema...,"WICHITA, Kansas (Reuters) - After moving to Ka...",politicsNews,"June 1, 2016",1
23215,"SOPA False Flag? Alleged ‘Hack’ on Netflix, Tw...","Shawn Helton 21st Century WireYesterday, a wav...",Middle-east,"October 22, 2016",0
20219,HILARIOUS! #BlackLivesMatter Protest Hillary A...,Pandering Hillary s getting a little karma Bla...,left-news,"Jul 26, 2016",0
14672,Hillary Clinton’s Anti-Israel E-mails Raise Qu...,Hillary Clinton s new e-mail release exposes h...,politics,"Jan 10, 2016",0
10619,Clinton wins big in South Carolina on way to '...,"COLUMBIA, S.C. (Reuters) - U.S. Democratic pre...",politicsNews,"February 27, 2016",1
20133,Japan refueling U.S. missile defense ships kee...,TOKYO (Reuters) - Japan s navy is supplying fu...,worldnews,"September 14, 2017",1
15314,(VIDEO) DEMOCRATS FRUSTRATED BY CLINTON JOKING...,The liberal pundits are pretty frustrated by t...,politics,"Aug 17, 2015",0
18993,"UAE law targets sexual harassment, forced labor","DUBAI (Reuters) - The United Arab Emirates, cr...",worldnews,"September 26, 2017",1


In [32]:
# Fold the headline into the body so the model sees both as a single document.
# NOTE: not idempotent - running this cell twice appends the title a second time.
data = data.assign(text=data['text'] + " " + data['title'])

In [33]:
# First five rows after merging the title into the text column.
data.head(n=5)


Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [34]:
# Drop the "title" and "date" columns; they are not used as features below.
data = data.drop(columns=["title", "date"])

In [35]:
# Confirm that only text, subject and class remain.
data.head(n=5)

Unnamed: 0,text,subject,class
0,Donald Trump just couldn t wish all Americans ...,News,0
1,House Intelligence Committee Chairman Devin Nu...,News,0
2,"On Friday, it was revealed that former Milwauk...",News,0
3,"On Christmas day, Donald Trump announced that ...",News,0
4,Pope Francis used his annual Christmas Day mes...,News,0


In [36]:
# Remove the "subject" column so that only the text and the label are kept.
data = data.drop(columns=["subject"])

In [37]:
# The frame is now reduced to the two modelling columns: text and class.
data.head(n=5)

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [38]:
# Replace the repeated per-file row labels with a fresh 0..n-1 index; the old
# labels are moved into a new "index" column.
data = data.reset_index()

In [39]:
# Discard the "index" column holding the old row labels.
data = data.drop(columns=['index'])

In [40]:
# Spot-check random rows after re-indexing; seeded so the displayed sample is
# reproducible across Restart & Run All.
data.sample(5, random_state=42)

Unnamed: 0,text,class
23425,"21st Century Wire says For years, the United N...",0
39003,RIYADH (Reuters) - A campaign of mass arrests ...,1
4513,A member of Donald Trump s presidential campai...,0
44474,KIEV (Reuters) - Ukrainian prosecutors have en...,1
34716,(Reuters) - Republican presidential candidate ...,1


In [41]:
def clean(text):
    """Normalize a raw article string before TF-IDF vectorization.

    Steps, in order: lowercase; drop ``[bracketed]`` spans; drop URLs; drop
    HTML-like ``<tags>``; delete ASCII punctuation; turn newlines into spaces;
    drop every token that contains a digit; replace each remaining non-word
    character with a space.

    Args:
        text: The document to clean. Non-string values (e.g. ``None`` or a
            NaN coming from a missing CSV field) are treated as empty text.

    Returns:
        The cleaned, lowercased string. Whitespace is not collapsed, so runs
        of spaces can remain where characters were removed.
    """
    if not isinstance(text, str):
        # Missing values arrive as None/float NaN and have no .lower().
        return ""
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words: substitute a space rather than deleting it,
    # otherwise "end\nstart" would be fused into the bogus token "endstart".
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\W', ' ', text)
    return text

In [42]:
# Run every document through clean() and store the normalized text back in place.
data["text"] = data["text"].map(clean)

In [43]:
# Features are the cleaned documents; the target is the 0/1 class label.
x, y = data["text"], data["class"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [44]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse that same
# fitted vectorizer to transform the held-out texts, so nothing from the test
# split influences the learned vocabulary or weights.
vectirizer = TfidfVectorizer()
xv_train = vectirizer.fit_transform(xtrain)
xv_test = vectirizer.transform(xtest)

## Logistic Regression


In [45]:
print("Training Logistic Regression...\n")

# Train model: default-configured logistic regression on the TF-IDF training matrix.
lr_model = LogisticRegression()
lr_model.fit(xv_train,ytrain)

# Prediction: hard labels for accuracy / the classification report, and the
# predicted probability of class 1 (the "true" label) for ROC-AUC.
prediction = lr_model.predict(xv_test)
pred_probability = lr_model.predict_proba(xv_test)[:, 1]

# Evaluation
print("="*60)
print("LOGISTIC REGRESSION RESULTS")
print("="*60)
print(f"Accuracy: {accuracy_score(ytest, prediction):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(ytest, pred_probability):.4f}")
print("\nClassification Report:")
print(classification_report(ytest,prediction))

Training Logistic Regression...

LOGISTIC REGRESSION RESULTS
Accuracy: 0.9877
ROC-AUC Score: 0.9980

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5895
           1       0.99      0.99      0.99      5330

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



## Naive Bayes

In [46]:
print("Training Naive Bayes...\n")

# Train model: multinomial Naive Bayes on the TF-IDF training matrix.
nb_model = MultinomialNB()
nb_model.fit(xv_train,ytrain)

# Prediction: hard labels plus the predicted probability of class 1
# (the "true" label), which ROC-AUC needs.
prediction = nb_model.predict(xv_test)
pred_probability = nb_model.predict_proba(xv_test)[:, 1]

# Evaluation
print("="*60)
print("NAIVE BAYES RESULTS")
print("="*60)
print(f"Accuracy: {accuracy_score(ytest, prediction):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(ytest, pred_probability):.4f}")
print("\nClassification Report:")
print(classification_report(ytest, prediction))


Training Naive Bayes...

NAIVE BAYES RESULTS
Accuracy: 0.9351
ROC-AUC Score: 0.9825

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      5895
           1       0.94      0.92      0.93      5330

    accuracy                           0.94     11225
   macro avg       0.94      0.93      0.93     11225
weighted avg       0.94      0.94      0.94     11225



## Random Forest

In [47]:
print("Training Random Forest... (This may take a few minutes)\n")

# Train model
# n_estimators: Number of trees in the forest
# max_depth: Maximum depth of each tree (prevents overfitting)
# min_samples_split: Minimum samples required to split a node
# random_state: Fixed seed so the fitted forest is reproducible
# n_jobs: -1 trains the trees on all available CPU cores
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

rf_model.fit(xv_train,ytrain)

# Prediction: hard labels plus the predicted probability of class 1
# (the "true" label), which ROC-AUC needs.
prediction = rf_model.predict(xv_test)
pred_probability = rf_model.predict_proba(xv_test)[:, 1]

# Evaluation
print("="*60)
print("RANDOM FOREST RESULTS")
print("="*60)
print(f"Accuracy: {accuracy_score(ytest, prediction):.4f}")
print(f"ROC-AUC Score: {roc_auc_score(ytest, pred_probability):.4f}")
print("\nClassification Report:")
print(classification_report(ytest, prediction))


Training Random Forest... (This may take a few minutes)

RANDOM FOREST RESULTS
Accuracy: 0.9843
ROC-AUC Score: 0.9984

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5895
           1       0.99      0.98      0.98      5330

    accuracy                           0.98     11225
   macro avg       0.98      0.98      0.98     11225
weighted avg       0.98      0.98      0.98     11225



In [None]:
# Persist the three fitted classifiers together with the fitted TF-IDF
# vectorizer: the saved models expect input transformed by this vectorizer.
# joblib files are pickle-based, so only load them from a trusted source.
joblib.dump(lr_model, "lr_model.pkl")
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(nb_model, "nb_model.pkl")
joblib.dump(vectirizer,'vectorizer.pkl')


['vectorizer.pkl']