In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd

import numpy as np
from sklearn import preprocessing

import re, string
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pickle as pkl #Wrap object to use later
import tqdm as tqdm # make your loops show a smart progress meter
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

from sklearn import metrics


In [2]:
df_train=pd.read_csv("Constraint_English_Train.csv")
df_test=pd.read_csv("Constraint_English_Test.csv")

Our purpose is to pickle every object needed for the gui <br>
- using only one Model for the test for the GUI : LR

In [3]:
df_train.head(3)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake


In [4]:
df_train=df_train.drop(['id'],axis=1)

In [5]:
df_train.head(3)

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,real
1,States reported 1121 deaths a small rise from ...,real
2,Politically Correct Woman (Almost) Uses Pandem...,fake


In [6]:
df_train["label"].value_counts()

real    3360
fake    3060
Name: label, dtype: int64

#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [7]:
def word_drop(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [8]:
df_train["tweet"] = df_train["tweet"].apply(word_drop)

In [9]:
df_train.head(3)

Unnamed: 0,tweet,label
0,the cdc currently reports deaths in general ...,real
1,states reported deaths a small rise from last...,real
2,politically correct woman almost uses pandem...,fake


- Mapping as fake , reel with {0,1}

In [10]:
#use map to code label : true as 1 / fake as 0
df_train['label'] = df_train['label'].map({'real': 1,'fake': 0})

In [11]:
df_train.head(3)

Unnamed: 0,tweet,label
0,the cdc currently reports deaths in general ...,1
1,states reported deaths a small rise from last...,1
2,politically correct woman almost uses pandem...,0


### starting first with countVectorizer

In [12]:
df_train["tweet"].head(3)

0    the cdc currently reports  deaths  in general ...
1    states reported  deaths a small rise from last...
2    politically correct woman  almost  uses pandem...
Name: tweet, dtype: object

In [13]:
corpus = df_train["tweet"]

In [14]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=1000,
                             ngram_range=(1, 2),
                             min_df=1)

In [15]:
X = vectorizer.fit_transform(corpus).toarray()

In [16]:
print(X.shape)

(6420, 1000)


In [17]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [18]:
# Names of Vectorized Features
vectorizer.get_feature_names()[:25]

['abia',
 'about',
 'about the',
 'access',
 'access to',
 'according',
 'according to',
 'across',
 'across the',
 'act',
 'active',
 'active cases',
 'additional',
 'affected',
 'after',
 'again',
 'against',
 'against covid',
 'age',
 'air',
 'akwa',
 'akwa ibom',
 'alert',
 'alexismadrigal',
 'all']

In [19]:
print("corpus size: %s" % len(vectorizer.get_feature_names()))

corpus size: 1000


In [20]:
# Building a Map of Categories =Making Categories Numerical since ML understands numbers better
categories = df_train["label"].unique()
category_dict = {value:index for index, value in enumerate(categories)}
results = df_train["label"].map(category_dict)
category_dict

{1: 0, 0: 1}

In [21]:
results

0       0
1       0
2       1
3       0
4       0
       ..
6415    1
6416    1
6417    1
6418    1
6419    0
Name: label, Length: 6420, dtype: int64

In [22]:
# Split Dataset into Test and Training Data
x_train,x_test, y_train,y_test = train_test_split(X, results, test_size=0.2)

In [23]:
# Using LR Multinomial Classifier
LR = LogisticRegression(max_iter=1000)
LR.fit(x_train, y_train)

LogisticRegression(max_iter=1000)

In [24]:
print("Accuracy of our model score: ",LR.score(x_test, y_test))

Accuracy of our model score:  0.9042056074766355


In [25]:
LR.predict(x_test)

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [26]:
category_dict 
#1: reel \ 0: fake

{1: 0, 0: 1}

In [27]:
text = ["Russian Hackers hijack US Election "]

In [28]:
# Vectorize and Transform text
vec_text = vectorizer.transform(text).toarray()

In [29]:
#Predict
LR.predict(vec_text)

array([1], dtype=int64)

In [30]:
# A function to do it
def classclass(a):
    test_name1 = [a]
    transform_vect =vectorizer.transform(text).toarray()
    if LR.predict(transform_vect) == 0:
        print("FAKE")
    else:
        print("REEL")

In [31]:
classclass("shit ")

REEL


In [32]:
clf = MultinomialNB()
clf.fit(x_train, y_train)

MultinomialNB()

In [33]:
# A function to do it
def classclass(a):
    test_name1 = [a]
    transform_vect =vectorizer.transform(text).toarray()
    if clf.predict(transform_vect) == 0:
        print("FAKE")
    else:
        print("REEL")

In [34]:
classclass("A post claims compulsory vacination violates the principles of bioethics, that coronavirus doesnÃ¢â‚¬â„¢t exist, that the PCR test returns many false positives, and that influenza vaccine is related to COVID-19. ")

REEL


try with vectorizer

In [35]:
vectorizer1 = TfidfVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=1000,
                             ngram_range=(1, 2),
                             min_df=1)

In [36]:
X1 = vectorizer1.fit_transform(corpus).toarray()

In [37]:
print(X1.shape)

(6420, 1000)


In [38]:
vectorizer1.get_feature_names()[:25]

['abia',
 'about',
 'about the',
 'access',
 'access to',
 'according',
 'according to',
 'across',
 'across the',
 'act',
 'active',
 'active cases',
 'additional',
 'affected',
 'after',
 'again',
 'against',
 'against covid',
 'age',
 'air',
 'akwa',
 'akwa ibom',
 'alert',
 'alexismadrigal',
 'all']

In [39]:
# Split Dataset into Test and Training Data
x_train,x_test, y_train,y_test = train_test_split(X1, results, test_size=0.2)

In [40]:
LR = LogisticRegression(max_iter=1000)
LR.fit(x_train, y_train)

LogisticRegression(max_iter=1000)

In [41]:
LR.predict(x_test)

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [42]:
# A function to do it
def classclass1(a):
    test_name1 = [a]
    transform_vect =vectorizer1.transform(text).toarray()
    if LR.predict(transform_vect) == 0:
        print("FAKE")
    else:
        print("REEL")

In [43]:
classclass1("A post claims compulsory vacination violates the principles of bioethics, that coronavirus doesnÃ¢â‚¬â„¢t exist, that the PCR test returns many false positives, and that influenza vaccine is related to COVID-19. ")

REEL


In [44]:
pickle.dump(vectorizer, open("vector.pkl", "wb"))

In [45]:
def findlabel(newtext):
    vec_newtest=vectorizer1.transform([newtext])
    y_pred1=LR.predict(vec_newtest)
    return y_pred1[0]

In [46]:
findlabel((df_test['tweet'][1]))
#findlabel(("A post claims compulsory vacination violates the principles of bioethics, that coronavirus doesnÃ¢â‚¬â„¢t exist, that the PCR test returns many false positives, and that influenza vaccine is related to COVID-19. ")) 
#0 => fake
#1 => true

1

In [47]:
findlabel((" covid isn't bad ")) 

1

In [48]:
df_test.head(3)

Unnamed: 0,id,tweet,label
0,1,Our daily update is published. States reported...,real
1,2,Alfalfa is the only cure for COVID-19.,fake
2,3,President Trump Asked What He Would Do If He W...,fake


In [49]:
pickle.dump(vectorizer1, open("vector1.pkl", "wb"))

In [None]:
# Save the Modle to file in the current working directory

Pkl_Filename = "Pickle_LR_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(LR_Model, file)

In [50]:
pickle.dump(LR, open("Pickle_LR_Model.pkl", "wb"))