# Imports

In [None]:
# %pip install -q sentence_transformers  # uncomment on a fresh environment; %pip installs into the running kernel

In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

from random import randint

In [None]:
# Load the pretrained sentence-embedding model that produces the title embeddings
# (the option selected_nlp_processing == 1 in the config).
# NOTE(review): this import lives outside the main import cell; consider moving it there.
from sentence_transformers import SentenceTransformer
m1 = SentenceTransformer('all-MiniLM-L6-v2') # embedding model

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Fetch the NLTK resources that the text-cleaning function relies on.
nltk.download('stopwords') # word list behind stopwords.words('english')
nltk.download('wordnet') # data for WordNetLemmatizer
nltk.download('punkt') # tokenizer model for word_tokenize

nltk.download('omw-1.4') # presumably needed by WordNet on newer NLTK releases — TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Human-readable names for each label column; the integer label is used as the list index.
map_2 = ['Fake', 'True']
map_3 = ['True', 'Fake with True Text', 'Fake with False Text']
map_6 = ['True', 'Satire', 'Misleading Content', 'Manipulated Content', 'False Connection', 'Imposter Content']

# Five-entry variant without 'False Connection', for 6-way labels that have been squeezed into 0-4.
map_6_true = ['True', 'Satire', 'Misleading Content', 'Manipulated Content', 'Imposter Content']

# **Config**

In [None]:
max_nlp_features = 1000 # vocabulary size cap for count_vectorize
selected_nlp_processing = 0 # 0: count_vectorize, 1: LM-L6-v2 embeddings

label_type = '2_way_label' # 2_way_label, 3_way_label, or 6_way_label

# Model under test. The scaler stays in front so that any of the classifiers listed below
# can be swapped in without touching the rest of the notebook; the step is named 'clf'
# because it is not tied to one particular estimator.
# random_state is fixed so that re-running the notebook reproduces the same forest.
selected_model = Pipeline([
    ('ss', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=250, max_depth=5, class_weight='balanced', random_state=42)),
])

# Logistic Regression | Support Vector Classifier | Decision Tree | Random Forest Classifier | GradientBoostingClassifier | RidgeClassifier

# Data Loading (#1)

In [None]:
# Download images.npy and labels.csv into the working directory, then load both.
!wget -q --show-progress "https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20%2B%20X/Group/PolySci%20%26%20Law/Fakeddit/images.npy"
!wget -q --show-progress "https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20%2B%20X/Group/PolySci%20%26%20Law/Fakeddit/labels.csv"
# NOTE(review): allow_pickle=True lets the file run arbitrary code while it is unpickled;
# keep it only if this download source is trusted.
image_data = np.load('images.npy',allow_pickle=True)
labels = pd.read_csv('labels.csv')



# Data Cleaning (#1)

In [None]:
# Remove columns that are not used as features or labels.
# NOTE(review): re-running this cell without reloading labels raises KeyError, because the columns are already gone.
labels = labels.drop(columns=['author', 'domain', 'subreddit', 'Unnamed: 0', 'title'])

In [None]:
# Count missing values per column.
labels.isnull().sum()

clean_title      56
created_utc       0
num_comments    199
score             0
upvote_ratio    199
2_way_label       0
3_way_label       0
6_way_label       0
dtype: int64

In [None]:
# Discard every row that has at least one missing value.
labels = labels.dropna()

In [None]:
# Renumber the rows 0..n-1 and throw away the old index.
labels = labels.reset_index(drop=True)

In [None]:
# Relative class frequencies of each labelling scheme in the preset data.
print('Preset Data Value Counts:\n')
for label_col in ('2_way_label', '3_way_label', '6_way_label'):
  print(labels[label_col].value_counts(normalize=True))

Preset Data Value Counts:

1    0.586957
0    0.413043
Name: 2_way_label, dtype: float64
0    0.586957
2    0.375776
1    0.037267
Name: 3_way_label, dtype: float64
0    0.586957
2    0.257764
1    0.077640
5    0.068323
3    0.009317
Name: 6_way_label, dtype: float64


# Data Loading (#2)

In [None]:
# Load the tab-separated training table.
# NOTE(review): the path looks like a mounted Google Drive folder — presumably the drive must be mounted first; confirm.
all_train = pd.read_csv('/content/drive/MyDrive/Data Science/Fakeddit/all_train.tsv', sep='\t')

# Data Cleaning (#2)

In [None]:
# List the available columns before deciding which ones to drop.
all_train.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'author', 'clean_title', 'created_utc', 'domain', 'hasImage', 'id',
       'image_url', 'linked_submission_id', 'num_comments', 'score',
       'subreddit', 'title', 'upvote_ratio', '2_way_label', '3_way_label',
       '6_way_label'],
      dtype='object')

In [None]:
# Drop the leftover index columns, identifiers, image fields and source fields, then
# keep only rows without missing values.
# NOTE(review): the drop is not re-runnable on its own — a second run raises KeyError.
all_train = all_train.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'id', 'linked_submission_id', 'hasImage', 'image_url', 'author', 'subreddit', 'domain', 'title'])
full_data = all_train.dropna()

In [None]:
# Renumber the rows 0..n-1 and throw away the old index.
full_data = full_data.reset_index(drop=True)

In [None]:
# (rows, columns) of the cleaned table.
full_data.shape

(634932, 8)

In [None]:
# Preview the first rows of the cleaned table.
full_data.head()

Unnamed: 0,clean_title,created_utc,num_comments,score,upvote_ratio,2_way_label,3_way_label,6_way_label
0,my walgreens offbrand mucinex was engraved wit...,1551641000.0,2.0,12,0.84,1,0,0
1,this concerned sink with a tiny hat,1534727000.0,2.0,119,0.99,0,2,2
2,hackers leak emails from uae ambassador to us,1496511000.0,1.0,44,0.92,1,0,0
3,this flower in my neighborhood,1557764000.0,0.0,17,0.92,1,0,0
4,puppy taking in the view,1471341000.0,26.0,250,0.95,1,0,0


In [None]:
# Relative class frequencies of each labelling scheme in the full cleaned table.
print('Full Data Value Counts:\n')
for label_col in ('2_way_label', '3_way_label', '6_way_label'):
  print(full_data[label_col].value_counts(normalize=True))

Full Data Value Counts:

1    0.63042
0    0.36958
Name: 2_way_label, dtype: float64
0    0.630420
2    0.343881
1    0.025699
Name: 3_way_label, dtype: float64
0    0.630420
2    0.223591
1    0.066637
5    0.041849
3    0.037503
Name: 6_way_label, dtype: float64


In [None]:
# Draw a fixed-size random subset to keep feature extraction and training fast.
# The seed makes the draw repeatable, so every number further down can be reproduced.
sample = full_data.sample(n=2000, random_state=42)

In [None]:
# Relative class frequencies of each labelling scheme in the random sample.
print('Sample Label Value Counts:\n')
for label_col in ('2_way_label', '3_way_label', '6_way_label'):
  print(sample[label_col].value_counts(normalize=True))

Sample Label Value Counts:

1    0.664
0    0.336
Name: 2_way_label, dtype: float64
0    0.664
2    0.318
1    0.018
Name: 3_way_label, dtype: float64
0    0.6640
2    0.2160
1    0.0595
5    0.0345
3    0.0260
Name: 6_way_label, dtype: float64


In [None]:
# Independent copy of the sample, so later edits do not modify `sample` itself.
main_data = sample.copy()

# Features

🔴 **Select Dataset Type**

In [None]:
# Choose which table the rest of the notebook works on.
dataset = main_data.copy() # either main_data (2000) or labels (322)

In [None]:
# Renumber the rows 0..n-1; the feature frames built later use the same default index,
# so the column-wise concat lines up row by row.
dataset = dataset.reset_index(drop=True)

In [None]:
# (rows, columns) of the selected dataset.
dataset.shape

(2000, 8)

In [None]:
# Preview the first rows of the selected dataset.
dataset.head()

Unnamed: 0,clean_title,created_utc,num_comments,score,upvote_ratio,2_way_label,3_way_label,6_way_label
0,found a bottle with two caps,1557625000.0,2.0,27,0.89,1,0,0
1,foci foreplay,1528052000.0,3.0,12,1.0,0,2,1
2,a typical christian showing his love to a fell...,1569181000.0,1.0,17,0.75,0,2,2
3,a man saves a dog after he finds it starving o...,1564480000.0,1.0,5,1.0,1,0,0
4,yearold jaylee monteith calls and saves babysi...,1400091000.0,0.0,2,1.0,1,0,0


In [None]:
# Relabel class 5 as 4. The value counts printed earlier show no label 4 in this data,
# so afterwards the 6-way labels are the contiguous range 0-4 and can index map_6_true.
dataset['6_way_label'] = dataset['6_way_label'].replace([5],4)

In [None]:
# Relative class frequencies of each labelling scheme in the dataset that will be modelled.
print('Final Dataset Value Counts:\n')
for label_col in ('2_way_label', '3_way_label', '6_way_label'):
  print(dataset[label_col].value_counts(normalize=True))

Final Dataset Value Counts:

1    0.664
0    0.336
Name: 2_way_label, dtype: float64
0    0.664
2    0.318
1    0.018
Name: 3_way_label, dtype: float64
0    0.6640
2    0.2160
1    0.0595
4    0.0345
3    0.0260
Name: 6_way_label, dtype: float64


In [None]:
# The title column is the only text feature; it feeds both the count vectorizer and the embedding model.
title_texts = dataset['clean_title']

0                         found a bottle with two caps
1                                        foci foreplay
2    a typical christian showing his love to a fell...
3    a man saves a dog after he finds it starving o...
4    yearold jaylee monteith calls and saves babysi...
Name: clean_title, dtype: object

In [None]:
# Sanity check: print a few randomly chosen titles next to their decoded labels.
n_rows = len(dataset)
for i in range(5):
  # randint includes both endpoints, so the upper bound has to be the last valid row
  # (n_rows - 1); it is derived from the data because the dataset size is selectable.
  index_val = randint(0, n_rows - 1)
  print('Title:', title_texts[index_val])
  print('2 way Label: ', map_2[dataset['2_way_label'][index_val]])
  print('3 way Label: ', map_3[dataset['3_way_label'][index_val]])
  print('6 way Label: ', map_6_true[dataset['6_way_label'][index_val]])
  print('-'*50)

Title: this guy midsneeze
2 way Label:  True
3 way Label:  True
6 way Label:  True
--------------------------------------------------
Title: i have rested my head on the same spot on the wall every day long enough to wear down the paint
2 way Label:  True
3 way Label:  True
6 way Label:  True
--------------------------------------------------
Title: volkswagen has its logo on headlights
2 way Label:  True
3 way Label:  True
6 way Label:  True
--------------------------------------------------
Title: swiss city authorities bans silent disco events due to noise concerns
2 way Label:  True
3 way Label:  True
6 way Label:  True
--------------------------------------------------
Title: mexican antinazi ww poster s
2 way Label:  Fake
3 way Label:  Fake with True Text
6 way Label:  Imposter Cotent
--------------------------------------------------


In [None]:
# Filled on first use, so the stopword file is read and the lemmatizer is built only once.
_LANG_RESOURCES = {}

def _get_lang_resources():
  '''Return the cached stopword set, punctuation set and lemmatizer, building them on first use.'''
  if not _LANG_RESOURCES:
    _LANG_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
    _LANG_RESOURCES['punctuation'] = frozenset(string.punctuation)
    _LANG_RESOURCES['lemmatizer'] = WordNetLemmatizer()
  return _LANG_RESOURCES

def process_lang_data(text):
  '''Tokenize the text, remove stopwords and punctuation, and lemmatize what is left.

  Args:
    text: one document as a string.

  Returns:
    List of lemmatized tokens in their original order.
  '''
  resources = _get_lang_resources()
  our_stopwords = resources['stopwords']
  punctuation = resources['punctuation']
  lemmatizer = resources['lemmatizer']

  cleaned_text = []
  for token in word_tokenize(text):
    # A token counts as punctuation when every character in it is punctuation. Testing
    # `token in string.punctuation` is a substring match, which lets multi-character
    # tokens such as '...' or "''" through.
    if all(char in punctuation for char in token):
      continue
    if token in our_stopwords:
      continue
    cleaned_text.append(lemmatizer.lemmatize(token))

  return cleaned_text

In [None]:
# Bag-of-words features: learn the vocabulary (capped at max_nlp_features terms) and count
# term occurrences per title in one call, then convert the sparse result to a dense array.
# NOTE(review): the vocabulary is learned from every row, i.e. before the train/test split.
bow = CountVectorizer(max_features=max_nlp_features, analyzer=process_lang_data)

bow_transformed = bow.fit_transform(title_texts).toarray()

In [None]:
# Tabulate the fitted vocabulary: each term next to the integer the vectorizer maps it to.
vocab = bow.vocabulary_
vocab_words = list(vocab)
vocab_indices = list(vocab.values())
vocab_data = pd.DataFrame({'word': vocab_words, 'value': vocab_indices})

In [None]:
# One integer-named column per vocabulary term. The width is read from the matrix itself:
# the vectorizer yields fewer than max_nlp_features columns when the vocabulary is small,
# and a different count whenever the cap in the config is changed.
vect_df = pd.DataFrame(bow_transformed, columns=list(range(bow_transformed.shape[1])))
vect_df.shape

----

In [None]:
# Sentence embeddings for every title. The titles are handed over as a plain list of strings,
# and the column count is taken from the result instead of assuming a fixed embedding width.
embeddings = m1.encode(title_texts.tolist())
embeddings_df = pd.DataFrame(embeddings, columns=list(range(embeddings.shape[1])))
embeddings_df.shape

(2000, 384)

*Final Processed Data*

In [None]:
# Pick the text representation chosen in the config (index 0: count vectors, 1: embeddings)
# and put it side by side with every remaining non-text column of the dataset.
vect_type = [vect_df, embeddings_df]
text_features = vect_type[selected_nlp_processing]
non_text_columns = dataset.drop(columns=['clean_title'])
combined_dataset = pd.concat([text_features, non_text_columns], axis=1)

In [None]:
# Preview the combined feature and label table.
combined_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,997,998,999,created_utc,num_comments,score,upvote_ratio,2_way_label,3_way_label,6_way_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1557625000.0,2.0,27,0.89,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1528052000.0,3.0,12,1.0,0,2,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1569181000.0,1.0,17,0.75,0,2,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1564480000.0,1.0,5,1.0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1400091000.0,0.0,2,1.0,1,0,0


# Splitting Data

In [None]:
# Features are everything except the three label columns; the target is the label column
# named by label_type.
X = combined_dataset.drop(columns=['2_way_label','3_way_label','6_way_label'])
# The text-feature columns carry integer names while the remaining columns carry string
# names. Recent scikit-learn releases refuse frames whose column names mix types, so all
# names are converted to strings before the data reaches the model.
X.columns = X.columns.astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, combined_dataset[label_type], test_size=0.25, random_state=42)

print('Label: ', label_type)
print('Dimensions of training and testing data: \n')

for split_name, split_part in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
  print(f'{split_name}: {split_part.shape}')

Label:  2_way_label
Dimensions of training and testing data: 

X_train: (1500, 1004)
y_train: (1500,)
X_test: (500, 1004)
y_test: (500,)


# Training Models

In [None]:
# Show the configured pipeline, then fit it on the training split.
print(selected_model)
selected_model.fit(X_train, y_train)

Pipeline(steps=[('ss', StandardScaler()),
                ('lr',
                 RandomForestClassifier(class_weight='balanced', max_depth=5,
                                        n_estimators=250))])




Pipeline(steps=[('ss', StandardScaler()),
                ('lr',
                 RandomForestClassifier(class_weight='balanced', max_depth=5,
                                        n_estimators=250))])

# Evaluating Models

In [None]:
# Score the fitted model on the held-out test split.
y_pred = selected_model.predict(X_test)

# precision_score / recall_score default to average='binary', which raises for the 3-way
# and 6-way labels. Keep the binary definition for the 2-way label and use the unweighted
# mean over classes (macro) for the multi-class labels.
averaging = 'binary' if label_type == '2_way_label' else 'macro'

print('Model:', selected_model)
print('Label Type:', label_type)
print('Acc:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average=averaging))
print('Recall:', recall_score(y_test, y_pred, average=averaging), '\n')

# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, y_pred))

Model: Pipeline(steps=[('ss', StandardScaler()),
                ('lr',
                 RandomForestClassifier(class_weight='balanced', max_depth=5,
                                        n_estimators=250))])
Label Type: 2_way_label
Acc: 0.66
Precision: 0.7766323024054983
Recall: 0.6827794561933535 

[[104  65]
 [105 226]]




In [None]:
# Score the fitted model on its own training split, to compare against the test scores
# and spot over-fitting.
y_pred_train = selected_model.predict(X_train)

# precision_score / recall_score default to average='binary', which raises for the 3-way
# and 6-way labels. Keep the binary definition for the 2-way label and use the unweighted
# mean over classes (macro) for the multi-class labels.
averaging = 'binary' if label_type == '2_way_label' else 'macro'

print('Model:', selected_model)
print('Label Type:', label_type)
print('Acc:', accuracy_score(y_train, y_pred_train))
print('Precision:', precision_score(y_train, y_pred_train, average=averaging))
print('Recall:', recall_score(y_train, y_pred_train, average=averaging), '\n')

# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_train, y_pred_train))

Model: Pipeline(steps=[('ss', StandardScaler()),
                ('lr',
                 RandomForestClassifier(class_weight='balanced', max_depth=5,
                                        n_estimators=250))])
Label Type: 2_way_label
Acc: 0.754
Precision: 0.852017937219731
Recall: 0.7622868605817452 

[[371 132]
 [237 760]]




# Results

In [None]:
# Hand-recorded scores per model and label type from earlier runs.
# All with 1000 max_features (count_vectorize) and 2k dataset
# NOTE(review): presumably these are accuracies — confirm which split and metric they refer to.
# NOTE(review): lr and svc show the same value for every label type; worth checking
# whether those models predict a single class.

lr_results = {'2_way': 0.668, '3_way': 0.668, '6_way': 0.668}

svc_results = {'2_way': 0.668, '3_way': 0.668, '6_way': 0.668}

dt_results = {'2_way': 0.724, '3_way': 0.69, '6_way': 0.638}

rf_results = {'2_way': 0.78, '3_way': 0.76, '6_way': 0.726}

gb_results = {'2_way': 0.784, '3_way': 0.772, '6_way': 0.754}

rc_results = {'2_way': 0.63, '3_way': 0.618, '6_way': 0.604}