We start by doing preproccesing!

This works for both the the large csv and the smaller one.

In [4]:

import re
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk import *
from nltk.tokenize import word_tokenize

porter_stemmer = SnowballStemmer("english")
nltk.download('punkt')

df = pd.read_csv('Data/bigdata/big.csv')

def stem_sentence(sentence):
    token_words = word_tokenize(sentence)
    stemmed_sentence = [porter_stemmer.stem(word) for word in token_words]
    return " ".join(stemmed_sentence)

if type(df['content']) == str:
    df['content'] = df['content'].apply(stem_sentence)
else:
    print("womp")

df['content'].replace(regex={'\n{2,}': "" }, inplace=True)
df['content'].replace(regex={'\t{2,}': "" }, inplace=True)
df['content'].replace(regex={'\r{2,}': "" }, inplace=True)
print("whitespace is gone")
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
print("lowercase")
df['content'].replace(regex={'https://[.a-zA-Z0-9/-]+|www[.a-zA-Z0-9/-]+|http://[.a-zA-Z0-9/-]+': '<URL>'}, inplace=True)
print("URLS gone")
df['content'].replace(regex={'[a-z]+@[a-z]+.[a-z]+': '<EMAIL>'}, inplace=True)
print("EMAILS GONE")
df['content'].replace(regex={'([0-9]{2})[/]?[-]?([0-9]{2})[/]?[-]?([0-9]{4})|([0-9]{4})[/]?[-]?([0-9]{2})[/]?[-]?([0-9]{2})': '<DATE>'}, inplace=True)
print("DATE GONE")
df['content'].replace(regex={'[0-9]+': '<NUM>'}, inplace=True)
print("NUM GONE")
df.to_csv('Data/otherdata/SOMETHING.csv', index=True)
print("DONE")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  df = pd.read_csv('Data/bigdata/big.csv')


womp


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['content'].replace(regex={'\n{2,}': "" }, inplace=True)


whitespace is gone


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


lowercase


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['content'].replace(regex={'https://[.a-zA-Z0-9/-]+|www[.a-zA-Z0-9/-]+|http://[.a-zA-Z0-9/-]+': '<URL>'}, inplace=True)


URLS gone
EMAILS GONE
DATE GONE
NUM GONE
DONE


We also quickly want to group the labels of the dataset, so that there are only 2 labels one for realnews and one for fakenews.

In [5]:

import pandas as pd

# Load data from CSV file with specified data types
df = pd.read_csv('Data/otherdata/SOMETHING.csv')

# Define a function to map types to categories
def map_type(Thetype):
    if Thetype in ['fake', 'bias', 'hate', 'conspiracy', 'junksci', 'satire', 'state']:
        return 'FakeNews'
    elif Thetype in ['reliable', 'political']:
        return 'RealNews'
    else:
        return None

df['type'] = df['type'].apply(map_type)

something = df.drop(df[df['type'].isnull()].index)
LastSomething = something.drop(something[something['content'].isnull()].index)

LastSomething.to_csv('Data/otherdata/BIGCLEANED.csv', index=False)


  df = pd.read_csv('Data/otherdata/SOMETHING.csv')


Now that we have a preproccesed dataset we can now start building our base model.

In [6]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.metrics import accuracy_score
import pandas as pd
import re
from sklearn.metrics import f1_score

df = pd.read_csv('Data/otherdata/BIGCLEANED.csv')

def count_words(text):
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    return len(numbers)

def calculate_length(obj):
    if isinstance(obj, str):
        return len(obj)
    else:
        return None

X = df['content'].apply(calculate_length)
Y = df["type"].dropna()  

X_train, X_combined_test, y_train, y_combined_test = train_test_split(X, Y, test_size=0.2, random_state=32)


X_test1, X_test2, y_test1, y_test2 = train_test_split(X_combined_test, y_combined_test, test_size=0.5, random_state=32 )
basemodel = LogisticRegression()

# Train the model on the training data
basemodel.fit(X_train.values.reshape(-1, 1), y_train)

# Predict the labels for the testing data
y_pred = basemodel.predict(X_test1.values.reshape(-1, 1))

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test1, y_pred)
f1score = f1_score(y_test1, y_pred, average='macro')
print(f'Accuracy (F1score){f1score}')
print("Accuracy:", accuracy)

dump(basemodel, 'basemodel.joblib')


Accuracy (F1score)0.34847852579072813
Accuracy: 0.5289296230672157


['basemodel.joblib']

We want to add meta-data to the model, so we will be traning a new model with the title meta-data, the resoning is shown in the report.

But to do this we also need to remove all the articles with missing title values.

In [7]:

import pandas as pd

# Load data from CSV file with specified data types
df = pd.read_csv('Data/otherdata/BIGCLEANED.csv')

# Define a function to map types to categories
def map_type(Thetype):
    if Thetype in ['fake', 'bias', 'hate', 'conspiracy', 'junksci', 'satire', 'state']:
        return 'FakeNews'
    elif Thetype in ['reliable', 'political']:
        return 'RealNews'
    elif Thetype in ['RealNews', 'FakeNews']:
        return Thetype
    else:
        return None

df['type'] = df['type'].apply(map_type)

something = df.drop(df[df['type'].isnull()].index)
LastSomething = something.drop(something[something['content'].isnull()].index)
LastLastSomething = LastSomething.drop(LastSomething[LastSomething['title'].isnull()].index) 
LastLastSomething.to_csv('Data/otherdata/Trainedwithtittle.csv', index=False)

"import pandas as pd\n\n# Load data from CSV file with specified data types\ndf = pd.read_csv('Data/otherdata/BIGCLEANED.csv')\n\n# Define a function to map types to categories\ndef map_type(Thetype):\n    if Thetype in ['fake', 'bias', 'hate', 'conspiracy', 'junksci', 'satire', 'state']:\n        return 'FakeNews'\n    elif Thetype in ['reliable', 'political']:\n        return 'RealNews'\n    elif Thetype in ['RealNews', 'FakeNews']:\n        return Thetype\n    else:\n        return None\n\ndf['type'] = df['type'].apply(map_type)\n\nsomething = df.drop(df[df['type'].isnull()].index)\nLastSomething = something.drop(something[something['content'].isnull()].index)\nLastLastSomething = LastSomething.drop(LastSomething[LastSomething['title'].isnull()].index) \nLastLastSomething.to_csv('Data/otherdata/Trainedwithtittle.csv', index=False)"

In [8]:


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.metrics import accuracy_score
import pandas as pd
import re
from sklearn.metrics import f1_score

df = pd.read_csv('Data/otherdata/Trainedwithtittle.csv')

def count_words(text):
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    return len(numbers)

def calculate_length(obj):
    if isinstance(obj, str):
        return len(obj)
    else:
        return None

X = df['content'].apply(count_words) + df['title'].apply(count_words)
Y = df["type"].dropna()  

X_train, X_combined_test, y_train, y_combined_test = train_test_split(X, Y, test_size=0.2, random_state=32)


X_test1, X_test2, y_test1, y_test2 = train_test_split(X_combined_test, y_combined_test, test_size=0.5, random_state=32 )
basemodel = LogisticRegression()

# Train the model on the training data
basemodel.fit(X_train.values.reshape(-1, 1), y_train)

# Predict the labels for the testing data
y_pred = basemodel.predict(X_test1.values.reshape(-1, 1))

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test1, y_pred)
f1score = f1_score(y_test1, y_pred, average='macro')
print(f'Accuracy (F1score){f1score}')
print("Accuracy:", accuracy)

dump(basemodel, 'basemodelwithtitle.joblib')


FileNotFoundError: [Errno 2] No such file or directory: 'Data/otherdata/Trainedwithtittle.csv'

Now that we have trained that we will also look at the results.

Firstly we look at the results of the simple model without meta-data.

In [None]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import load, dump
from sklearn.metrics import accuracy_score
import pandas as pd
import collections, numpy
from sklearn.metrics import f1_score
import re
def count_words(text):
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    return len(numbers)

df = pd.read_csv('Data/otherdata/BIGCLEANED.csv')
X = df["content"].dropna().apply(count_words)
y = df["type"].dropna()
X = X[:len(y)]

# Split the data into one training set and one combined test set
X_train, X_combined_test, y_train, y_combined_test = train_test_split(X, y, test_size=0.2, random_state=32)

# Split the combined test set into two separate test sets
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_combined_test, y_combined_test, test_size=0.5, random_state=32 )

loaded_model = load('basemodel.joblib')
loaded_model.fit(X_train.values.reshape(-1, 1), y_train)
# Make predictions using the loaded model
predictions = loaded_model.predict(X_test2.values.reshape(-1, 1))

print("BASE MODEL WITHOUT TITLE")
# Print the predictions
f1score = f1_score(y_test2, predictions, average='macro')
test_accuracy = accuracy_score(y_test2, predictions)
print(f'Accuracy (F1score){f1score}')
print(f'Accuracy (Not f1score){test_accuracy}')




BASE MODEL WITHOUT TITLE
Accuracy (F1score)0.3453329962775426
Accuracy (Not f1score)0.5273234840469605


Now we look at the simple model using meta-data.

In [None]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import load, dump
from sklearn.metrics import accuracy_score
import pandas as pd
import collections, numpy
from sklearn.metrics import f1_score
import re
def count_words(text):
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    return len(numbers)

df = pd.read_csv('Data/otherdata/Trainedwithtittle.csv')
X = df["content"].dropna().apply(count_words) + df["title"].dropna().apply(count_words)
y = df["type"].dropna()  
X = X[:len(y)]

# Split the data into one training set and one combined test set
X_train, X_combined_test, y_train, y_combined_test = train_test_split(X, y, test_size=0.2, random_state=32)

# Split the combined test set into two separate test sets
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_combined_test, y_combined_test, test_size=0.5, random_state=32 )

loaded_model = load('basemodel.joblib')
loaded_model.fit(X_train.values.reshape(-1, 1), y_train)
# Make predictions using the loaded model
predictions = loaded_model.predict(X_test2.values.reshape(-1, 1))

print("BASE MODEL WITH TITLE")
# Print the predictions
f1score = f1_score(y_test2, predictions, average='macro')
test_accuracy = accuracy_score(y_test2, predictions)
print(f'Accuracy (F1score){f1score}')
print(f'Accuracy (Not f1score){test_accuracy}')




BASE MODEL WITH TITLE
Accuracy (F1score)0.5144203342063822
Accuracy (Not f1score)0.5782792483954945


Now we will try add data we scapred from BBC and see how this updates the model.

First we want to clean the BBC dataset.

In [None]:
import re
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk import *
from nltk.tokenize import word_tokenize


porter_stemmer = SnowballStemmer("english")
nltk.download('punkt')

df = pd.read_csv('Data/otherdata/BBC.csv')

def stem_sentence(sentence):
    token_words = word_tokenize(sentence)
    stemmed_sentence = [porter_stemmer.stem(word) for word in token_words]
    return " ".join(stemmed_sentence)

df['text'].replace(regex={'\n{2,}': "" }, inplace=True)
df['text'].replace(regex={'\t{2,}': "" }, inplace=True)
df['text'].replace(regex={'\r{2,}': "" }, inplace=True)
print("whitespace is gone")
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
print("lowercase")
df['text'].replace(regex={'https://[.a-zA-Z0-9/-]+|www[.a-zA-Z0-9/-]+|http://[.a-zA-Z0-9/-]+': '<URL>'}, inplace=True)
print("URLS gone")
df['text'].replace(regex={'[a-z]+@[a-z]+.[a-z]+': '<EMAIL>'}, inplace=True)
print("EMAILS GONE")
df['text'].replace(regex={'([0-9]{2})[/]?[-]?([0-9]{2})[/]?[-]?([0-9]{4})|([0-9]{4})[/]?[-]?([0-9]{2})[/]?[-]?([0-9]{2})': '<DATE>'}, inplace=True)
print("DATE GONE")
df['text'].replace(regex={'[0-9]+': '<NUM>'}, inplace=True)
print("NUM GONE")
df.to_csv('Data/otherdata/CleanBBC.csv', index=True)
print("DONE")



[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].replace(regex={'\n{2,}': "" }, inplace=True)


whitespace is gone
lowercase


  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].replace(regex={'https://[.a-zA-Z0-9/-]+|www[.a-zA-Z0-9/-]+|http://[.a-zA-Z0-9/-]+': '<URL>'}, inplace=True)


URLS gone
EMAILS GONE
DATE GONE
NUM GONE
DONE


First we have to combine the BBC data with out dataset with title. There after we want to clean the data!

In [None]:
import re
import pandas as pd

df = pd.read_csv('Data/otherdata/Trainedwithtittle.csv')
gf = pd.read_csv('Data/otherdata/CleanBBC.csv')

gf = gf.rename(columns={'text': 'Content', 'headline': 'title'})
gf['type'] = 'RealNews'

selected_columns = gf[['Content', 'title', 'type']]

combined_df = pd.concat([df, selected_columns], ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('Data/otherdata/Combined.csv', index=False)

Now we want to preprocess the data!

In [None]:
import pandas as pd

# Load data from CSV file with specified data types
df = pd.read_csv('Data/otherdata/Combined.csv')

# Define a function to map types to categories
def map_type(Thetype):
    if Thetype in ['fake', 'bias', 'hate', 'conspiracy', 'junksci', 'satire', 'state']:
        return 'FakeNews'
    elif Thetype in ['reliable', 'political']:
        return 'RealNews'
    elif Thetype in ['FakeNews','RealNews']:
        return Thetype
    else:
        return None

df['type'] = df['type'].apply(map_type)

something = df.drop(df[df['type'].isnull()].index)
LastSomething = something.drop(something[something['content'].isnull()].index)
LastLastSomething = LastSomething.drop(LastSomething[LastSomething['title'].isnull()].index) #Use when traning with tittle! 
LastSomething.to_csv('Data/otherdata/CleanedCombined.csv', index=False)

  df = pd.read_csv('Data/otherdata/Combined.csv')


Now we can train the model!

In [None]:


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.metrics import accuracy_score
import pandas as pd
import re
from sklearn.metrics import f1_score

df = pd.read_csv('Data/otherdata/CleanedCombined.csv')

def count_words(text):
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    return len(numbers)

def calculate_length(obj):
    if isinstance(obj, str):
        return len(obj)
    else:
        return None

X = df['content'].apply(count_words) + df['title'].apply(count_words)

Y = df["type"].dropna()  

X_train, X_combined_test, y_train, y_combined_test = train_test_split(X, Y, test_size=0.2, random_state=32)


X_test1, X_test2, y_test1, y_test2 = train_test_split(X_combined_test, y_combined_test, test_size=0.5, random_state=32 )
basemodel = LogisticRegression()

# Train the model on the training data
basemodel.fit(X_train.values.reshape(-1, 1), y_train)

# Predict the labels for the testing data
y_pred = basemodel.predict(X_test1.values.reshape(-1, 1))

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test1, y_pred)
f1score = classification_report(y_test1, y_pred, average='macro')
print(f'Accuracy (F1score){f1score}')
print("Accuracy:", accuracy)
dump(basemodel, 'basemodelBBC.joblib')


FileNotFoundError: [Errno 2] No such file or directory: 'Data/otherdata/CleanedCombined.csv'

Now we will make a advanced model, we will use sk-learn.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.metrics import accuracy_score
import pandas as pd

df = pd.read_csv('Data/otherdata/BIGCLEANED.csv')
X = df["content"].dropna()
Y = df["type"].dropna()  
Y = Y[:len(X)]
print(len(X))
print(len(Y))

X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.2, random_state=32)

X_test, X_REALTEST, y_test, y_REALTEST = train_test_split(X_temp, y_temp, test_size=0.5, random_state=32)


vectorizer = TfidfVectorizer()

X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

model = LogisticRegression(solver="liblinear", multi_class="ovr")
model.fit(X_train_vectors, y_train)


dump(model, 'model.joblib')
dump(vectorizer, 'vectorizer.joblib')


784489
784489


['vectorizer.joblib']

Now we will also be trying to make a model using meta-data, once again it will be using title.

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from sklearn.metrics import accuracy_score
import pandas as pd

df = pd.read_csv('Data/otherdata/Trainedwithtittle.csv')
X = df["content"].dropna() + df["title"].dropna()
Y = df["type"].dropna()  
Y = Y[:len(X)]
print(len(X))
print(len(Y))

X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.2, random_state=32)

X_test, X_REALTEST, y_test, y_REALTEST = train_test_split(X_temp, y_temp, test_size=0.5, random_state=32)



vectorizer = TfidfVectorizer()

X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

model = LogisticRegression(solver="liblinear", multi_class="ovr")
model.fit(X_train_vectors, y_train)


dump(model, 'Advancedmodelwithtitle.joblib')
dump(vectorizer, 'Advancedvectorizerwithtitle.joblib')


775933
775933


['Advancedvectorizerwithtitle.joblib']

Now lets see the results of the models!

First the model without the title meta-data.

In [None]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import load, dump
from sklearn.metrics import accuracy_score
import pandas as pd
import collections, numpy
from sklearn.metrics import f1_score

df = pd.read_csv('Data/otherdata/BIGCLEANED.csv')

X = df["content"].dropna()
Y = df["type"].dropna()  

X = X[:len(Y)]

X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.2, random_state=32)

X_test, X_REALTEST, y_test, y_REALTEST = train_test_split(X_temp, y_temp, test_size=0.5, random_state=32)

vectorizer = load('vectorizer.joblib')
loaded_model = load('model.joblib')

# Vectorize the input data using the loaded vectorizer
New_test_vectors = vectorizer.transform(X_REALTEST)

# Make predictions using the loaded model
predictions = loaded_model.predict(New_test_vectors)

print("ADVANCED MODEL WITHOUT TITLE")
# Print the predictions
f1score = f1_score(y_REALTEST, predictions, average='macro')
test_accuracy = accuracy_score(y_REALTEST, predictions)
print(f'F1score: {f1score}')
print(f'Accuracy: {test_accuracy}')




ADVANCED MODEL WITHOUT TITLE
F1score: 0.8907844667053688
Accuracy: 0.8911139721347627


Now with the meta-data title.

In [None]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import load, dump
from sklearn.metrics import accuracy_score
import pandas as pd
import collections, numpy
from sklearn.metrics import f1_score

df = pd.read_csv('Data/otherdata/Trainedwithtittle.csv')
X = df["content"].dropna() + df["title"].dropna()
Y = df["type"].dropna()  
Y = Y[:len(X)]
print(len(X))
print(len(Y))

X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.2, random_state=32)

X_test, X_REALTEST, y_test, y_REALTEST = train_test_split(X_temp, y_temp, test_size=0.5, random_state=32)

vectorizer = load('Advancedvectorizerwithtitle.joblib')
loaded_model = load('Advancedmodelwithtitle.joblib')

# Vectorize the input data using the loaded vectorizer
New_test_vectors = vectorizer.transform(X_REALTEST)

# Make predictions using the loaded model
predictions = loaded_model.predict(New_test_vectors)

# Print the predictions
print("ADVANCED MODEL WITH TITLE")
f1score = f1_score(y_REALTEST, predictions, average='macro')
test_accuracy = accuracy_score(y_REALTEST, predictions)
print(f'F1score: {f1score}')
print(f'Accuracy: {test_accuracy}')




775933
775933
ADVANCED MODEL WITH TITLE
F1score: 0.8964756875459738
Accuracy: 0.8969250199757713


Lastly we will test our models on the Liar dataset!

To do this we firstly need to prepair the dataset by labeling the types and then we are ready.

In [None]:
import pandas as pd

df = pd.read_csv('Data/liardata/test.tsv', header=None, delimiter='\t')

# Define a function to map types to categories
def map_type(Thetype):
    if str(Thetype) in ['false', 'pants-fire', 'barely-true']:
        return 'FakeNews'
    if str(Thetype) in ['mostly-true', 'true']:
        return 'RealNews'
    else:
        return None  # Return None for types that don't match the conditions

df[1] = df[1].apply(map_type)

newting = df.drop(df[df[1].isnull()].index)

# Save the modified DataFrame to a new TSV file
newting.to_csv('Data/liardata/FinalTest.tsv', sep='\t', index=False, header=False)


Now we are ready to test our models on the Liar dataset!

We start with testing the advanced model.

In [None]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import load, dump
from sklearn.metrics import accuracy_score
import pandas as pd
import collections, numpy
from sklearn.metrics import f1_score
df = pd.read_csv('Data/liardata/FinalTest.tsv', sep='\t' , header=None)
#Load data from tsv test file
content = df.iloc[:, 2]
Realawnser = df.iloc[:, 1]
#Load models
vectorizer = load('vectorizer.joblib')
loaded_model = load('model.joblib')
#Transform content
New_test_vectors = vectorizer.transform(content)
predictions = loaded_model.predict(New_test_vectors)
f1score = f1_score(Realawnser, predictions, average='macro')
test_accuracy = accuracy_score(Realawnser, predictions)
print(f'Accuracy (F1score){f1score}')
print(f'Accuracy (Not f1score){test_accuracy}')




Accuracy (F1score)0.5279398801786861
Accuracy (Not f1score)0.5279441117764471


Now we will test the simple model.

In [None]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from joblib import load, dump
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
import re
import collections, numpy
from sklearn.metrics import f1_score

df = pd.read_csv('Data/liardata/FinalTest.tsv', sep='\t' , header=None)

def count_words(text):
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    return len(numbers)

#Load data from tsv test file
content = df.iloc[:, 2].apply(count_words)
Realawnser = df.iloc[:, 1]

loaded_model = load('basemodel.joblib')

content_reshaped = np.array(content).reshape(-1, 1)
predictions = loaded_model.predict(content_reshaped)
f1score = f1_score(Realawnser, predictions, average='macro')
test_accuracy = accuracy_score(Realawnser, predictions)
print(f'Accuracy (F1score){f1score}')
print(f'Accuracy (Not f1score){test_accuracy}')




Accuracy (F1score)0.35562700964630223
Accuracy (Not f1score)0.5518962075848304
