In [1]:
# Based on example at
# https://github.com/biplav-s/course-nl/blob/master/l9-ml-review/Classification%20-%20Fake%20news.ipynb

In [2]:
# Do import for dataframe
import pandas as pd

In [3]:
# We will have vector representation before we can do classification
# Do imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [4]:
# Count occurrences of 1-grams and 2-grams in each document
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
# Turn those raw counts into TF-IDF weights (IDF smoothing switched off)
transformer = TfidfTransformer(smooth_idf=False)

# We will now do this properly on training and validation data

In [5]:
# Load the training split and preview its first five rows
df_train = pd.read_csv(filepath_or_buffer='../data/Constraint_English_Train.csv')
df_train.head(n=5)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [6]:
def cleanText(df):
    """Return a new DataFrame with surrounding whitespace stripped from every string cell.

    Cells that are not ``str`` instances (numbers, NaN, ...) are passed through
    unchanged. No tokenization is performed here; only ``str.strip`` is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string cells should be stripped. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Element-wise cleaned result with the same shape as ``df``.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; prefer the new name when this pandas provides it and
    # fall back to applymap on older installations.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_strip_if_str)
    return df.applymap(_strip_if_str)

In [7]:
# Strip stray whitespace from the training frame, then preview it again
df_train = df_train.pipe(cleanText)
df_train.head(n=5)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [8]:
# Learn the n-gram vocabulary from the training tweets and count occurrences
train_tweets = df_train['tweet'].values
train_counts = count_vectorizer.fit_transform(train_tweets)

# Fit the TF-IDF transformer on those counts and convert them to TF-IDF features
train_tfidf = transformer.fit_transform(train_counts)

In [9]:
# Target labels for the training tweets
y_train = df_train.loc[:, 'label'].values

In [10]:
# Show how many training labels were extracted
y_train.shape

(6420,)

In [11]:
# Import for prediction
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [12]:
# Fit a logistic-regression classifier (C=1e5) on the TF-IDF training features
logreg = LogisticRegression(C=100000.0)
logreg.fit(train_tfidf, y_train)

LogisticRegression(C=100000.0)

In [13]:
# Score the fitted model on the same data it was trained on and report it
train_accuracy = logreg.score(train_tfidf, y_train)
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logreg classifier on training set: 1.00


In [14]:
# Load the validation split and preview its first five rows
df_val = pd.read_csv(filepath_or_buffer='../data/Constraint_English_val.csv')
df_val.head(n=5)

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real


In [15]:
# Strip stray whitespace from the validation frame, then preview it again
df_val = df_val.pipe(cleanText)
df_val.head(n=5)

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real


In [16]:
# Target labels for the validation tweets
y_val = df_val.loc[:, 'label'].values

In [17]:
## For validation data we only transform: the vectorizer and the TF-IDF
## transformer were already fitted on the training data and are not re-fitted.
## - See: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# Count n-grams in the validation tweets with the already-fitted vectorizer
val_tweets = df_val['tweet'].values
val_counts = count_vectorizer.transform(val_tweets)

# Convert those counts to TF-IDF features with the already-fitted transformer
val_tfidf = transformer.transform(val_counts)

In [18]:
# Accuracy on the data the model was fitted on
train_accuracy = logreg.score(train_tfidf, y_train)
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(train_accuracy))
# Accuracy on the validation data
val_accuracy = logreg.score(val_tfidf, y_val)
print('Accuracy of Logreg classifier on validation set: {:.2f}'.format(val_accuracy))
# Confusion matrix of true vs. predicted labels on the validation data
val_predictions = logreg.predict(val_tfidf)
CM = confusion_matrix(y_val, val_predictions)
print(CM)

Accuracy of Logreg classifier on training set: 1.00
Accuracy of Logreg classifier on validation set: 0.93
[[ 901  119]
 [  25 1095]]


In [19]:
## Now try Random Forest
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Fit a small random forest (5 trees) on the TF-IDF features of the training tweets.
# random_state is pinned so that the forest, and the scores reported from it,
# come out the same on every Restart & Run All.
RandomFC = RandomForestClassifier(n_estimators=5, random_state=42)
RandomFC.fit(train_tfidf, y_train)

RandomForestClassifier(n_estimators=5)

In [21]:
# Report random-forest accuracy on the training data
print('Accuracy of randomforest classifier on training set: {:.2f}'.format(RandomFC.score(train_tfidf, y_train)))
# The held-out score is computed on the validation split (val_tfidf / y_val),
# so the message names it "validation set" rather than "test set"
print('Accuracy of randomforest classifier on validation set: {:.2f}'.format(RandomFC.score(val_tfidf, y_val)))
# Confusion matrix of true vs. predicted labels on the validation data
CM = confusion_matrix(y_val, RandomFC.predict(val_tfidf))
print(CM)

Accuracy of randomforest classifier on training set: 0.99
Accuracy of randomforest classifier on test set: 0.87
[[879 141]
 [135 985]]


# Predicting for any text

In [29]:
def classifyText(text, model=None, vectorizer=None, tfidf=None):
    """Predict the label of a single piece of text.

    Parameters
    ----------
    text : str
        Raw text to classify.
    model : optional
        Fitted classifier exposing ``predict``. Defaults to the notebook-level
        ``logreg``; pass e.g. ``RandomFC`` to use another fitted classifier.
    vectorizer : optional
        Fitted count vectorizer exposing ``transform``. Defaults to the
        notebook-level ``count_vectorizer``.
    tfidf : optional
        Fitted TF-IDF transformer exposing ``transform``. Defaults to the
        notebook-level ``transformer``.

    Returns
    -------
    The result of ``model.predict`` for a one-document batch, i.e. a
    single-element sequence of predicted labels.
    """
    # Fall back to the notebook's fitted objects only for arguments that were
    # not supplied, so existing calls classifyText(text) behave as before.
    if model is None:
        model = logreg
    if vectorizer is None:
        vectorizer = count_vectorizer
    if tfidf is None:
        tfidf = transformer

    # The vectorizer works on a collection of documents, so wrap the single text
    counts = vectorizer.transform([text])
    features = tfidf.transform(counts)
    # Now predict
    return model.predict(features)

In [34]:
# Classify one hand-written example tweet and show the predicted label.
# Alternative input to try: "This is a Trump tweet about covid"
testTweet = "WHO explains the results of the test"
result = classifyText(text=testTweet)
print(result)

['real']
