In [44]:

import pandas as pd
import numpy as np
import warnings
import os

import nltk


from nltk.stem import WordNetLemmatizer # to lemmatize the words
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet # to get the wordnet pos tags
from nltk.corpus import stopwords # to remove the stopwords
from sklearn.feature_extraction.text import CountVectorizer # to create a bag of words

# Machine Learning
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.metrics import silhouette_score

<h1 style="color:blue;">Data Extraction</h1>

In [45]:
current_path = os.getcwd()
data_path = os.path.join(current_path, 'dataset')
data_csv = os.path.join(data_path, 'training_data.csv')

In [46]:
data = pd.read_csv(data_csv, sep='\t')

In [47]:
df = data.copy()

<h1 style="color:blue;">Data Cleaning</h1>

In [48]:
# snake_columns
def snake_columns(df):
	if any(column.isupper() for column in df.columns):
		df.columns = [column.strip().lower().replace(' ', '_').replace('-', '_') for column in df.columns]
	return df

In [49]:
df.rename(columns={df.columns[0]: 'label', df.columns[1]: 'headline'}, inplace=True)
print(df.head())

   label                                           headline
0      0  drunk bragging trump staffer started russian c...
1      0  sheriff david clarke becomes an internet joke ...
2      0  trump is so obsessed he even has obama‚s name ...
3      0  pope francis just called out donald trump duri...
4      0  racist alabama cops brutalize black boy while ...


In [50]:
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.shape


(32205, 2)

In [51]:
max_len = df['headline'].str.len().max()
min_len = df["headline"].str.len().min()

print(f"Max length: {max_len} characters")
print(f"Min length: {min_len} characters")

Max length: 279 characters
Min length: 2 characters


<h1 style="color:blue;">Data Preprocessing</h1>

In [54]:
# remove punctuation and do tokenization
def clean_text(text):
    text = text.lower()
    text = word_tokenize(text)
    text = [word for word in text if word.isalpha()]
    return text

In [55]:
# remove stopwords
nltk.download('stopwords')

df['headline'] = df['headline'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words('english'))]))
df.head()



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/selinwork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,headline
0,0,drunk bragging trump staffer started russian c...
1,0,sheriff david clarke becomes internet joke thr...
2,0,trump obsessed even obama‚s name coded website...
3,0,pope francis called donald trump christmas speech
4,0,racist alabama cops brutalize black boy handcu...


In [56]:
'''Display a few original headlines and their processed versions'''


original_headlines = data.head(10)
processed_headlines = df.head(10)

comparison = pd.DataFrame({
    'Original': original_headlines.iloc[:, 0],
    'Processed': processed_headlines['headline']
})

print(comparison)

   Original                                          Processed
0         0  drunk bragging trump staffer started russian c...
1         0  sheriff david clarke becomes internet joke thr...
2         0  trump obsessed even obama‚s name coded website...
3         0  pope francis called donald trump christmas speech
4         0  racist alabama cops brutalize black boy handcu...
5         0                                  fresh golf course
6         0  trump said insanely racist stuff inside oval o...
7         0        former cia director slams trump un bullying
8         0  brand-new pro-trump ad features much a** kissi...
9         0                        papa john‚s founder retires


<h1 style="color:blue;">Sentiment Analysis with VADER</h1>

1. I checked the sentiment analysis of different groups like fake headline data and real headline data.

2. I combined two of them to understand the sentiment behaviours of the sentences

3. My conclusion from this part is fake headlines has less negative scores than real ones usually they subtly provoke the emotional responses. But rela headlines are giving the factual events without masking them.

4. There was no positivity as expected the nature of the news which they should be more neutral but not positive.



In [57]:
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer
vd = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/selinwork/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [68]:
df0 = df[df['label'] == 0]
df1 = df[df['label'] == 1]

In [69]:
df0['sentiment'] = df0['headline'].apply(lambda x: vd.polarity_scores(x)['compound'])
df1['sentiment'] = df1['headline'].apply(lambda x: vd.polarity_scores(x)['compound'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['sentiment'] = df0['headline'].apply(lambda x: vd.polarity_scores(x)['compound'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['sentiment'] = df1['headline'].apply(lambda x: vd.polarity_scores(x)['compound'])


In [86]:
# positive sentiment, negative sentiment, neutral sentiment
map = {1: 'positive', 0: 'neutral', -1: 'negative'}
df0['sentiment'] = df0['sentiment'].apply(lambda x: 1 if x >= 0.05 else (0 if x >= -0.05 else -1))
df1['sentiment'] = df1['sentiment'].apply(lambda x: 1 if x >= 0.05 else (0 if x >= -0.05 else -1))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['sentiment'] = df0['sentiment'].apply(lambda x: 1 if x >= 0.05 else (0 if x >= -0.05 else -1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['sentiment'] = df1['sentiment'].apply(lambda x: 1 if x >= 0.05 else (0 if x >= -0.05 else -1))


In [85]:
polarity = pd.DataFrame({'label': [0, 1], 'sentiment': [df0['sentiment'].mean(), df1['sentiment'].mean()]})
polarity

Unnamed: 0,label,sentiment
0,0,-0.235147
1,1,-0.07583


In [77]:
df_combined = pd.concat([df0, df1], axis=0)
df_combined.sample(10)

Unnamed: 0,label,headline,sentiment
20049,1,'what happened': clinton memoir looks 2016 ele...,0.0
11030,0,yea baby! trump reverses obama pro-abortion ‚p...,0.0
6534,0,white vote can‚t won‚t save trump ‚ republican...,0.0
29749,1,eu gives june deadline deal refugees,0.0
4927,0,gop senator says trump doesn‚t deserve win,0.5859
30976,1,chile's leftists stop short endorsing presiden...,-0.296
6156,0,unfortunate bug lands trump‚s hair rally,-0.4588
3501,0,prominent holocaust attorney files massive law...,0.1027
31452,1,argentina's macri expects missing sub found co...,-0.296
704,0,trump whines obamacare takes credit obama‚s ec...,-0.3818


<h1 style="color:blue;">Displaying some negative words / some playing </h1>

In [79]:
from collections import Counter

# Tokenize the text
nltk.download('punkt')
nltk.download('punkt_tab')

df0['tokens'] = df0['headline'].apply(clean_text)

# Define a function to filter negative words
def get_negative_words(tokens):
    negative_words = [word for word in tokens if vd.polarity_scores(word)['compound'] < 0]
    return negative_words

# Apply the function to get negative words
df0['negative_words'] = df0['tokens'].apply(get_negative_words)

# Flatten the list of negative words and count their frequency
all_negative_words = [word for sublist in df0['negative_words'] for word in sublist]
negative_word_freq = Counter(all_negative_words)

# Display the most common negative words
print(negative_word_freq.most_common(20))

[nltk_data] Downloading package punkt to /Users/selinwork/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/selinwork/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['tokens'] = df0['headline'].apply(clean_text)


[('racist', 285), ('gun', 200), ('attack', 190), ('war', 157), ('fake', 148), ('illegal', 140), ('shocking', 129), ('destroys', 122), ('lies', 121), ('stop', 121), ('ban', 116), ('attacks', 115), ('protesters', 108), ('hate', 107), ('threatens', 95), ('death', 92), ('terrorist', 89), ('arrested', 86), ('disgusting', 85), ('brutal', 84)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df0['negative_words'] = df0['tokens'].apply(get_negative_words)
