### Logistic Regression Model

In [1]:
#Importing modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report
import pandas as pd

In [None]:
#Loading the cleaned train_set to memory as df
train_df=pd.read_csv('full_train_cleaned.csv') 

In [None]:
#Loading the cleaned val_set to memory as df
val_df=pd.read_csv('full_val_cleaned.csv') 

In [4]:
#Checking distribution of types
train_df['broad_category'].value_counts()

broad_category
Fake News        363240
Reliable News    359703
Name: count, dtype: int64

Now we under-sample Fake News   

In [5]:
min_count = train_df['broad_category'].value_counts().min()

train_df = train_df.groupby('broad_category').apply(lambda x: x.sample(n=min_count, random_state=0)).reset_index(drop=True)

  train_df = train_df.groupby('broad_category').apply(lambda x: x.sample(n=min_count, random_state=0)).reset_index(drop=True)


In [7]:
#Checking distribution of types
train_df['broad_category'].value_counts()

broad_category
Fake News        359703
Reliable News    359703
Name: count, dtype: int64

In [8]:
#Defining the x-values and y-values to train the logistic regression.
x_train,y_train=train_df['content'],train_df['broad_category']

In [9]:
#Defining the x-values and y-values for the validation set.
x_val,y_val=val_df['content'],val_df['broad_category']

In [10]:
#Initializing CountVectorizer from the sklearn module with the 10.000 most common words
#This creates a matrix where the rows represent the article text and columns represent words. The value is how many times a word appear in the article.
vectorizer = CountVectorizer(max_features=10000)

In [11]:
#Fitting and transforming on the training set data
x_train = x_train.astype(str).fillna("")
x_train_vectorized = vectorizer.fit_transform(x_train)

In [12]:
#Transforming the validation data
x_val = x_val.astype(str).fillna("")
x_val_vectorized = vectorizer.transform(x_val)

In [13]:
#Logistic Regression model with choosen parameters
logreg=LogisticRegression(max_iter=10000,C=0.01)

#Fitting the Logistic Regression model on the training set
logreg.fit(x_train_vectorized,y_train)

In [14]:
#Using the trained model to predict categories for the validation data.
y_val_pred=logreg.predict(x_val_vectorized)
print("Validation set performance")
print(classification_report(y_val,y_val_pred))

Validation set performance
               precision    recall  f1-score   support

    Fake News       0.84      0.88      0.86     45501
Reliable News       0.87      0.83      0.85     44867

     accuracy                           0.86     90368
    macro avg       0.86      0.86      0.86     90368
 weighted avg       0.86      0.86      0.86     90368



## Now we proceed to the test set

In [None]:
#loading the cleaned test_set to memory as df
test_df=pd.read_csv('full_test_cleaned.csv') 

In [16]:
#Defining the x-values and y-values
x_test,y_test=test_df['content'],test_df['broad_category']

In [17]:
#Transforming the test data
x_test = x_test.astype(str).fillna("")
x_test_vectorized = vectorizer.transform(x_test)

In [18]:
#Using the trained model to predict categories for the test data.
y_test_pred=logreg.predict(x_test_vectorized)
print("Test set performance")
print(classification_report(y_test,y_test_pred))

Test set performance
               precision    recall  f1-score   support

    Fake News       0.84      0.88      0.86     45665
Reliable News       0.87      0.83      0.85     44703

     accuracy                           0.85     90368
    macro avg       0.86      0.85      0.85     90368
 weighted avg       0.86      0.85      0.85     90368



# BBC data 


In [None]:
#loading the cleaned val_set to memory as df
BBC_df=pd.read_csv('bbc_cleaned.csv') 

In [20]:
#Defining the x-values and y-values
BBC_x=BBC_df['content'].astype(str).fillna("")
BBC_y=BBC_df['type']

In [21]:
#Defining the x-values and y-values
x_train=train_df['content'].astype(str).fillna("")
y_train=train_df['broad_category']

#Combining x and y values for BBC and training set
x_combined = pd.concat([x_train, BBC_x])
y_combined = pd.concat([y_train, BBC_y])

In [22]:
vectorizer = CountVectorizer(max_features=10000)

#Fitting and transforming the x value 
x_train_combined_vec = vectorizer.fit_transform(x_combined)
x_val_vec = vectorizer.transform(x_val)

In [23]:
#Logistic Regression model with choosen parameters
logregBBC=LogisticRegression(max_iter=10000,C=0.01)

#Fitting the Logistic Regression model on the training set and BBC set
logregBBC.fit(x_train_combined_vec, y_combined)

#Using the trained model to predict categories for the validation data.
y_val_BBC_pred = logregBBC.predict(x_val_vec)
print(classification_report(y_val, y_val_BBC_pred))

               precision    recall  f1-score   support

    Fake News       0.84      0.88      0.86     45501
Reliable News       0.87      0.83      0.85     44867

     accuracy                           0.86     90368
    macro avg       0.86      0.86      0.86     90368
 weighted avg       0.86      0.86      0.86     90368



In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#Confusion matrix
cm = confusion_matrix(y_val, y_val_BBC_pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[39959  5542]
 [ 7536 37331]]


# LIARDATASET

In [None]:
#loading the cleaned liar set to memory as df
liar_df=pd.read_csv('liar_cleaned.csv')

In [26]:
#Defining the x-values and y-values
x_liar, y_liar = liar_df['Statement'], liar_df['broad_category']

In [27]:
#Checking the distibution of types
liar_df['broad_category'].value_counts()

broad_category
Fake News        6602
Reliable News    3638
Name: count, dtype: int64

In [28]:
#Transforming the x value
x_liar_vectorized = vectorizer.transform(x_liar)

In [29]:
#Using the previous trained model to predict categories for the liar data.
y_liar_pred=logreg.predict(x_liar_vectorized)
print("Test set performance")
print(classification_report(y_liar,y_liar_pred))

Test set performance
               precision    recall  f1-score   support

    Fake News       0.65      0.82      0.72      6602
Reliable News       0.37      0.19      0.25      3638

     accuracy                           0.60     10240
    macro avg       0.51      0.51      0.49     10240
 weighted avg       0.55      0.60      0.56     10240



In [31]:
#Confusion matrix
cm = confusion_matrix(y_liar, y_liar_pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[5429 1173]
 [2946  692]]
