In [1]:
import numpy as np
import pandas as pd
import itertools
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Both files are tab-separated; only the test set is re-indexed by tweet id
train = pd.read_csv('mediaeval-2015-trainingset.txt', sep='\t')
test = pd.read_csv('mediaeval-2015-testset.txt', sep='\t').set_index('tweetId', drop=True)



In [3]:
# Counting number of rows and columns in the data
print('Shape of Training Data: ', train.shape)

# Getting a feel for the data in each column and their names
print('\n \n TRAIN \n', train.head())
print('\n \n TEST \n', test.head())

# Looking for missing tweet text in either set
print('\n \nNumber of Null values in Train Set: ', train['tweetText'].isna().sum())
print('Number of Null values in Test Set: ', test['tweetText'].isna().sum())

# Drop training rows whose tweet text is missing. Only `subset` is passed:
# newer pandas versions raise TypeError when `how` and `thresh` are both
# supplied, and the defaults (axis=0, how='any') already give this behaviour.
# The result is assigned back instead of mutating the frame in place.
train = train.dropna(subset=['tweetText'])
# Test rows are kept; every missing value is replaced by a single space
test = test.fillna(' ')

Shape of Training Data:  (14277, 7)

 
 TRAIN 
               tweetId                                          tweetText  \
0  263046056240115712  ¿Se acuerdan de la película: “El día después d...   
1  262995061304852481  @milenagimon: Miren a Sandy en NY!  Tremenda i...   
2  262979898002534400  Buena la foto del Huracán Sandy, me recuerda a...   
3  262996108400271360     Scary shit #hurricane #NY http://t.co/e4JLBUfH   
4  263018881839411200  My fave place in the world #nyc #hurricane #sa...   

      userId      imageId(s)        username                       timestamp  \
0   21226711  sandyA_fake_46         iAnnieM  Mon Oct 29 22:34:01 +0000 2012   
1  192378571  sandyA_fake_09  CarlosVerareal  Mon Oct 29 19:11:23 +0000 2012   
2  132303095  sandyA_fake_09     LucasPalape  Mon Oct 29 18:11:08 +0000 2012   
3  241995902  sandyA_fake_29     Haaaaarryyy  Mon Oct 29 19:15:33 +0000 2012   
4  250315890  sandyA_fake_15  princess__natt  Mon Oct 29 20:46:02 +0000 2012   

  label  
0  f

In [4]:
# Character length of every tweet; str() guards against non-string cells.
# Built directly as a list instead of appending inside a throw-away comprehension.
length = [len(str(text)) for text in train['tweetText']]
train['length'] = length
print('Minimum Length: ', train['length'].min(), '\nMaximum Length: ', train['length'].max(), '\nAverage Length: ', round(train['length'].mean()))

Minimum Length:  26 
Maximum Length:  7125 
Average Length:  92


In [5]:
# `length` is a character count. Tweets longer than the threshold below are
# treated as outliers (presumably because tweets of this period were limited
# to 140 characters -- confirm against the dataset description).
MAX_TWEET_LENGTH = 140
too_long = train['length'] > MAX_TWEET_LENGTH

# How many tweets exceed the threshold
print(f'Number of tweets with more than {MAX_TWEET_LENGTH} characters: ', too_long.sum())
# Skimming through the lengths of those over-long tweets just to be sure
print(train.loc[too_long, 'length'])

                                                   

Number of articles with more than 250 words:  135
486       142
571      1658
773       142
1801      142
1803     4786
         ... 
14254     146
14255     145
14258     144
14266     144
14268     149
Name: length, Length: 135, dtype: int64


In [6]:
# Discard every tweet longer than 140 characters, then re-check the length statistics
over_limit_index = train.loc[train['length'] > 140].index
train = train.drop(index=over_limit_index)
kept_lengths = train['length']
print('Minimum Length: ', min(kept_lengths), '\nMaximum Length: ', max(kept_lengths), '\nAverage Length: ', round(sum(kept_lengths)/len(kept_lengths)))

Minimum Length:  26 
Maximum Length:  140 
Average Length:  88


In [7]:
# Target column for supervised learning
train_labels = train['label']
# Hold out 10% of the labelled tweets for validation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    train['tweetText'],
    train_labels,
    test_size=0.1,
    random_state=0,
)

In [8]:
# TF-IDF vectorizer: English stop words removed, terms present in more than 70% of documents ignored
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
# The vocabulary is learned from the training split only ...
tfidf_train = tfidf.fit_transform(x_train)
# ... and then reused unchanged for the validation split and the official test set
tfidf_test, tfidf_test_final = (
    tfidf.transform(texts) for texts in (x_test, test['tweetText'])
)

In [24]:
# Setting up Passive Aggressive Classifier.
# random_state is fixed because the classifier shuffles the training data
# between epochs by default; without a seed the accuracy changes on every run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=0)
# Fitting on the training set
pac.fit(tfidf_train, y_train)
# Predicting on the held-out validation split
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score * 100, 2)}%')

Accuracy: 84.41%


In [10]:
# Work on an independent copy so the steps below leave `train` untouched
data = train.copy()

In [11]:
# Remove the identifier / metadata columns in a single call and rebind the
# result, rather than mutating `data` in place four separate times.
data = data.drop(columns=["tweetId", "userId", "imageId(s)", "timestamp"])

data.head()

Unnamed: 0,tweetText,username,label,length
0,¿Se acuerdan de la película: “El día después d...,iAnnieM,fake,134
1,@milenagimon: Miren a Sandy en NY! Tremenda i...,CarlosVerareal,fake,133
2,"Buena la foto del Huracán Sandy, me recuerda a...",LucasPalape,fake,116
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,Haaaaarryyy,fake,46
4,My fave place in the world #nyc #hurricane #sa...,princess__natt,fake,89


In [12]:
# (rows, columns) remaining after the identifier columns were dropped
data.shape

(14142, 4)

In [15]:
from langdetect import DetectorFactory, detect

# langdetect is non-deterministic by default; its documentation recommends
# seeding the factory before the first detection to get repeatable results.
DetectorFactory.seed = 0


def det(x):
    """Return the language code langdetect assigns to `x`.

    Detection is best-effort: if `detect` raises for any reason the tweet is
    tagged 'Other'. Only `Exception` is caught, so KeyboardInterrupt and
    SystemExit still propagate and a long run can be interrupted.
    """
    try:
        lang = detect(x)
    except Exception:
        lang = 'Other'
    return lang

# Tag every tweet with its detected language
data['Lang'] = data['tweetText'].apply(det)

In [16]:
# Index labels of every tweet not detected as English ...
indexNames = data.loc[data['Lang'] != "en"].index
# ... which are removed from `data` in place
data.drop(index=indexNames, inplace=True)

In [31]:
# Target column of the English-only data
train_labels = data['label']
# Hold out 10% of the English tweets for validation (fixed seed for a repeatable split)
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    data['tweetText'],
    train_labels,
    test_size=0.1,
    random_state=0,
)

In [32]:
# Fresh TF-IDF vectorizer for the English-only data (same settings as before)
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
# The vocabulary is learned from the English training split only ...
tfidf_train1 = tfidf.fit_transform(x_train1)
# ... and then reused unchanged for the validation split and the official test set
tfidf_test1, tfidf_test_final1 = (
    tfidf.transform(texts) for texts in (x_test1, test['tweetText'])
)

In [33]:
# Setting up Passive Aggressive Classifier.
# random_state is fixed because the classifier shuffles the training data
# between epochs by default; without a seed the accuracy changes on every run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=0)

# Fitting on the English-only training split
pac.fit(tfidf_train1, y_train1)

# Predicting on the English-only validation split
y_pred1 = pac.predict(tfidf_test1)
score = accuracy_score(y_test1, y_pred1)
print(f'Accuracy: {round(score * 100, 2)}%')

Accuracy: 84.32%


# new analysis

In [50]:
# Translate numeric label codes into their text labels.
# NOTE(review): the displayed data already holds string labels, so this is
# probably a no-op here -- confirm whether numeric codes can actually occur.
for numeric_code, text_label in ((0, 'fake'), (1, 'real')):
    data.loc[data['label'] == numeric_code, 'label'] = text_label

In [49]:
# Fold the "humor" class into "fake". The result is assigned back to the column:
# calling replace(..., inplace=True) on the `data['label']` selection is a
# chained in-place update, which does not modify `data` under pandas Copy-on-Write.
data['label'] = data['label'].replace("humor", "fake")



In [53]:
# Pull the target column out of the frame and preview its first rows
labels = data['label']
labels.head(5)

3    fake
4    fake
5    fake
6    fake
7    fake
Name: label, dtype: object

In [66]:
# 80/20 train/validation split; tweet text is passed as a plain array of str
x_train, x_test, y_train, y_test = train_test_split(
    data['tweetText'].values.astype('str'),
    labels,
    test_size=0.2,
    random_state=7,
)


In [67]:
# TF-IDF vectorizer: terms present in more than 70% of documents are ignored, English stop words removed
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

In [68]:
# Learn the vocabulary on the training split and vectorize it ...
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
# ... then vectorize the validation split with that same vocabulary
tfidf_test = tfidf_vectorizer.transform(x_test)

In [69]:
# Initialize the PassiveAggressiveClassifier and fit it on the training set.
# random_state is fixed because the classifier shuffles the training data
# between epochs by default; without a seed the accuracy changes on every run.
pa_classifier = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pa_classifier.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [70]:
# Classify the validation split and report accuracy as a percentage
y_pred = pa_classifier.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy: {}%'.format(round(score * 100, 2)))


Accuracy: 91.14%


In [71]:
# Confusion matrix with rows/columns ordered as ['fake', 'real']
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['fake', 'real'])

array([[1285,   67],
       [ 125,  691]])

In [73]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

# Bag-of-words counts -> TF-IDF weighting -> random forest.
# random_state is fixed because the forest is built from random bootstrap
# samples and feature subsets; without a seed the accuracy differs per run.
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', RandomForestClassifier(n_estimators=50,
                                                  criterion="entropy",
                                                  random_state=42))])
# Fitting the model on the raw training text
model = pipe.fit(x_train, y_train)
# Accuracy on the validation split
prediction = model.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))

accuracy: 89.21%


In [75]:
from sklearn.linear_model import LogisticRegression

# Bag-of-words counts -> TF-IDF weighting -> logistic regression (default settings)
pipe = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegression()),
])
# Train on the raw training text
model = pipe.fit(x_train, y_train)
# Report accuracy on the validation split
prediction = model.predict(x_test)
print(f"accuracy: {round(accuracy_score(y_test, prediction) * 100, 2)}%")

accuracy: 90.91%


In [76]:
from sklearn.tree import DecisionTreeClassifier

# Bag-of-words counts -> TF-IDF weighting -> entropy-based decision tree capped at depth 20
tree_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=20,
    splitter='best',
    random_state=42,
)
pipe = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', tree_model),
])
# Train on the raw training text
model = pipe.fit(x_train, y_train)
# Report accuracy on the validation split
prediction = model.predict(x_test)
print(f"accuracy: {round(accuracy_score(y_test, prediction) * 100, 2)}%")

accuracy: 77.26%
