# 1. Prerequisites

In [1]:
#Needed libraries
import numpy as np
import pandas as pd
import sklearn as sk

print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

Numpy version: 1.26.0
Pandas version: 2.1.1
Scikit version: 1.2.2


In [2]:
#Loading datasets
from pathlib import Path

def loadTestData():
    testPath = Path("Datasets/mediaeval-2015-testset.txt")
    if not testPath.is_file():
        print("Testing data was not found")
        return
    return pd.read_csv(testPath, sep='\t')

def loadTrainData():
    trainPath = Path("Datasets/mediaeval-2015-trainingset.txt")
    if not trainPath.is_file():
        print("Training data was not found")
        return
    return pd.read_csv(trainPath, sep='\t')


trainData = loadTrainData()
testData = loadTestData()

# 2. Data Analysis

In [3]:
trainData.info()
trainData

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     14277 non-null  int64 
 1   tweetText   14277 non-null  object
 2   userId      14277 non-null  int64 
 3   imageId(s)  14277 non-null  object
 4   username    14277 non-null  object
 5   timestamp   14277 non-null  object
 6   label       14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake
...,...,...,...,...,...,...,...
14272,443231991593304064,@BobombDom *slaps TweetDeck with the PigFish h...,2179310905,pigFish_01,Da_Vault_Hunter,Tue Mar 11 03: 48: 36 +0000 2014,fake
14273,443086239127076865,New Species of Fish found in Brazil or just Re...,254843101,pigFish_01,DjSituation_RC,Mon Mar 10 18: 09: 26 +0000 2014,fake
14274,442978105238753280,What do we call this? #pigFISH http: \/\/t.co\...,2367553228,pigFish_01,Vivo1Vuyo,Mon Mar 10 10: 59: 45 +0000 2014,fake
14275,442753479782989824,Pigfish ? E dopo il pescecane c'è il pesce mai...,603120231,pigFish_01,CosimoTarta,Sun Mar 09 20: 07: 10 +0000 2014,fake


For some reason, some tweets have image links that are in the format http: \/\/t.co\/(link). Some text is not in english (should it be translated?). Some timestamps are not in the right format and have spaces between them. There are many emojis. Some typos of words. Many special characters. 

In [19]:
trainData["label"].value_counts()

label
fake     6742
real     4921
humor    2614
Name: count, dtype: int64

There are three labels, fake, real and humor. Humor should be classified as fake so it will be changed. You could keep the humor label in a new column or do something with it for extra data, but for now it will just be converted to fake.

In [5]:
print(trainData["tweetId"].value_counts())
trainData[trainData["tweetId"] == 264736470089216000]
#trainData[trainData["tweetId"] == 263351427320131584]

tweetId
264736470089216000    2
263351427320131584    2
263046056240115712    1
263019084772421632    1
263427804975206401    1
                     ..
263376004754583552    1
263029335877906432    1
262998664031649792    1
263029070210686977    1
442700377860104192    1
Name: count, Length: 14275, dtype: int64


Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
2439,264736470089216000,Lower Manhattan's power is gone. Pretty eerie ...,2675041,sandyB_fake_11,cDima,Sat Nov 03 14:31:07 +0000 2012,fake
11947,264736470089216000,Lower Manhattan's power is gone. Pretty eerie ...,2675041,sandyB_real_54,cDima,Sat Nov 03 14:31:07 +0000 2012,real


There are two tweets with the same tweetId? Not sure how this is possible?

# 3. Preprocessing

All the categorical values need to be converted to some numerical value. Both fake and humour will be given the same categorical value of 1, with real being assigned 0. The hot encoder is currently unused but may be tested later.

In [5]:
from sklearn.preprocessing import OneHotEncoder

def convertCategoriesToNumerical(dataset):
    encoded_cat, categories = dataset["label"].factorize() # retrieve the attribute encoded as numbers
    encoded_cat_arr = OneHotEncoder().fit_transform(encoded_cat.reshape(-1,1)).toarray() # transform sparse matrix to NumPy array
    enc_train_data = dataset.iloc[:,0:9].copy()
    for i in range(0, len(categories)):
        enc_train_data[categories[i]] = encoded_cat_arr[:,i]
    enc_train_data = enc_train_data.drop('label', axis=1)
    return enc_train_data

def convertLabelToNumerical(dataset):
    dataset['label'] = dataset['label'].map({'fake': 1, 'humor': 1, 'real': 0})
    return dataset

#trainData = convertCategoriesToNumerical(trainData)
trainData = convertLabelToNumerical(trainData)
testData = convertLabelToNumerical(testData)

Next we need to check for any invalid data. For example if the length of a tweet is too long (having more than 280 characters). If the date is invalid. If the links to twitter posts are invalid. Duplicate tweet IDs.

# 4. Training

In [6]:
from sklearn.svm import SVR

train_set2 = trainData
test_set2 = testData

svm_poly_reg = SVR(
  kernel="poly",
  degree=2,
  C=100,
  epsilon=0.1)

svm_poly_reg.fit(train_set2.iloc[:, [idx for idx in range(len(train_set2.columns)) if idx != 6]], train_set2['label'])
prediction = frst_regressor2.predict(test_set2.iloc[:, [idx for idx in range(len(test_set2.columns)) if idx != 6])
print('RMSE = ', np.sqrt(mean_squared_error(test_set2['label'], prediction)))

SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (598755252.py, line 13)

In [None]:
train_set2 = trainData
test_set2 = testData

from sklearn.ensemble import RandomForestRegressor

frst_regressor2 = RandomForestRegressor()
frst_regressor2.fit(train_set2.iloc[:, [idx for idx in range(len(train_set2.columns)) if idx != 6]], train_set2['label'])
prediction4 = frst_regressor2.predict(test_set2.iloc[:, [idx for idx in range(len(test_set2.columns)) if idx != 6]])



# 4. Evaluation