In [1]:
import pandas as pd

In [2]:
# Load the raw news dataset from CSV into a DataFrame
news_df = pd.read_csv("../data/News.csv")

In [3]:
news_df.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,SentimentTitle,SentimentHeadline,Facebook,GooglePlus,LinkedIn
0,99248.0,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,4/2/2002 0:00,0.0,-0.0533,-1,-1,-1
1,10423.0,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,9/20/2008 0:00,0.208333,-0.156386,-1,-1,-1
2,18828.0,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,1/28/2012 0:00,-0.42521,0.139754,-1,-1,-1
3,27788.0,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,3/1/2015 0:06,0.0,0.026064,-1,-1,-1
4,27789.0,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,3/1/2015 0:11,0.0,0.141084,-1,-1,-1


In [4]:
news_df.shape

(93239, 11)

### About the data

The given dataset contains a large number of news article headlines, each mapped to its
sentiment score and its social feedback on multiple platforms. The collected data comprises 
93239 news items on four different topics: Economy, Microsoft, Obama and Palestine (UCI 
Machine Learning Repository, n.d.).

The attributes present in the dataset are:
- **IDLink (numeric):** Unique identifier of news items
- **Title (string):** Title of the news item according to the official media sources
- **Headline (string):** Headline of the news item according to the official media sources
- **Source (string):** Original news outlet that published the news item
- **Topic (string):** Query topic used to obtain the items in the official media sources
- **PublishDate (timestamp):** Date and time of the news items' publication
- **SentimentTitle (numeric):** Sentiment score of the text in the news items' title
- **SentimentHeadline (numeric):** Sentiment score of the text in the news items' headline
- **Facebook (numeric):** Final value of the news items' popularity according to the social media 
source Facebook
- **GooglePlus (numeric):** Final value of the news items' popularity according to the social media 
source Google+
- **LinkedIn (numeric):** Final value of the news items' popularity according to the social media 
source LinkedIn

For this project, only the Title and SentimentTitle attributes will be used. News related to Microsoft will be removed, as it is tech-centric and largely irrelevant in the context of Nepal. 

In [5]:
# Remove rows with neutral sentiment (SentimentTitle exactly 0); only non-zero scores are kept
news_df = news_df[news_df['SentimentTitle'] != 0]

In [6]:
# Data with positive sentiment
news_df[news_df['SentimentTitle'] > 0].shape

(36589, 11)

In [7]:
# Data with negative sentiment
news_df[news_df['SentimentTitle'] < 0].shape

(37938, 11)

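After removing neutral titles, negative titles (37,938) only slightly outnumber positive ones (36,589). Even when neutral news is counted as negative, there are only about 1.5 times as many non-positive titles (93,239 − 36,589 = 56,650) as positive ones.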

### Data Preprocessing

In [8]:
# Drop news whose Topic is "microsoft"; rows with any other topic are kept
news_df = news_df[news_df['Topic'] != "microsoft"]

In [9]:
# Remove the irrelevant columns: keep only the title text and its sentiment score
news_df = news_df[['Title', 'SentimentTitle']]

In [10]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56960 entries, 1 to 93237
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Title           56960 non-null  object 
 1   SentimentTitle  56960 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.3+ MB


In [11]:
# Sentiment scores above 0.05 are commonly treated as positive, but this notebook
# labels every score strictly above 0 as positive (the default threshold below).
# Since we are only interested in filtering good (positive) news, everything else
# is labelled negative. Pass threshold=0.05 to apply the stricter cut-off.
def is_positive(sentiment_score, threshold=0.0):
    """Convert a numeric sentiment score into a binary positive/negative label.

    Args:
        sentiment_score: Numeric sentiment score of a news title.
        threshold: Scores strictly greater than this value are labelled
            positive. Defaults to 0.0.

    Returns:
        1 if ``sentiment_score > threshold``, otherwise 0. A score equal to the
        threshold, or NaN (whose comparisons are always false), yields 0.
    """
    return 1 if sentiment_score > threshold else 0

In [12]:
# Add the binary label column. assign() returns a new DataFrame, which avoids the
# SettingWithCopyWarning that column assignment on a filtered/sliced frame can raise.
news_df = news_df.assign(
    Is_SentimentTitle_Positive=news_df['SentimentTitle'].apply(is_positive)
)

In [13]:
# Keep only the title text and its binary label (the raw SentimentTitle score column is no longer selected)
news_df = news_df[['Title','Is_SentimentTitle_Positive']]

In [14]:
news_df.head()

Unnamed: 0,Title,Is_SentimentTitle_Positive
1,A Look at the Health of the Chinese Economy,1
2,Nouriel Roubini: Global Economy Not Back to 2008,0
7,"Obama, stars pay a musical tribute to Ray Charles",1
8,Fire claims more than 100-year-old barn in Han...,0
13,Big data and the Internet of Things to add £32...,1


### Text Preprocessing

In [15]:
# Removing Punctuations and converting all word to lowercase
import string

def remove_punctuation(text):
    """Return ``text`` with ASCII punctuation stripped and letters lowercased.

    The argument is first coerced with ``str()``, so non-string values are
    processed through their string representation. Only the characters in
    ``string.punctuation`` are removed; other symbols are left in place.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    return str(text).translate(strip_table).lower()

In [16]:
# Strip punctuation and lowercase every title. assign() builds a new DataFrame, which
# avoids the SettingWithCopyWarning that column assignment on a sliced frame can raise.
news_df = news_df.assign(Title=news_df['Title'].apply(remove_punctuation))

In [17]:
news_df.head()

Unnamed: 0,Title,Is_SentimentTitle_Positive
1,a look at the health of the chinese economy,1
2,nouriel roubini global economy not back to 2008,0
7,obama stars pay a musical tribute to ray charles,1
8,fire claims more than 100yearold barn in hanco...,0
13,big data and the internet of things to add £32...,1


In [18]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline; it is used below to lemmatize titles and supplies the stop-word list
nlp = spacy.load("en_core_web_sm")

In [19]:
import re 

def remove_nonwords(str_):
    """Replace tokens that begin with a non-letter character by a single space.

    A match consists of one character that is neither an ASCII letter nor a
    space, followed by at least one word character, followed by any run of
    non-letter characters (digits, spaces, symbols). For example "2008" and
    "100yearold " are each replaced by one space.

    Note: a lone non-letter character (such as the "5" in "top 5 things") is
    not matched, because the pattern requires at least one word character
    after the first one.

    Args:
        str_: The text to clean.

    Returns:
        The text with every match replaced by a single space.
    """
    # Raw string literal: "\w" inside a normal string is an invalid escape
    # sequence and triggers a DeprecationWarning/SyntaxWarning on modern Python.
    return re.sub(r"[^A-Za-z ]\w+[^A-Za-z]*", ' ', str_)

# Lemmatization and removal of stop words and non-words
def text_preprocessing(text):
    """Clean a title: drop non-word tokens, lemmatize, and remove stop words.

    Args:
        text: The title string to process.

    Returns:
        The lowercase lemmas that are not stop words, joined by single spaces.
        The result is an empty string when nothing survives the filtering.
    """
    doc = nlp(remove_nonwords(text))
    stop_words = nlp.Defaults.stop_words
    # Normalise each lemma before filtering so the stop-word test is
    # case-insensitive. Stripping also turns whitespace-only tokens (produced
    # from the blanks that remove_nonwords inserts) into empty strings, which
    # are discarded below instead of leaking stray spaces into the output.
    lemmas = (token.lemma_.strip().lower() for token in doc)
    return ' '.join(lemma for lemma in lemmas if lemma and lemma not in stop_words)


In [20]:
# Preprocess the title text (non-word removal, lemmatization, stop-word removal).
# assign() builds a new DataFrame, which avoids the SettingWithCopyWarning that
# column assignment on a sliced frame can raise.
news_df = news_df.assign(Title=news_df['Title'].apply(text_preprocessing))

In [21]:
news_df.head()

Unnamed: 0,Title,Is_SentimentTitle_Positive
1,look health chinese economy,1
2,nouriel roubini global economy,0
7,obama star pay musical tribute ray charle,1
8,fire claim barn hancock county,0
13,big datum internet thing add uk economy re...,1


In [22]:
# Remove rows whose title is null or blank.
# Titles were coerced with str() during punctuation removal, so missing text is no
# longer null at this point; a title can, however, be left empty once stop words and
# non-words are removed. Empty strings are read back as NaN from a CSV file, so they
# are dropped here as well.
news_df = news_df[news_df['Title'].notnull() & news_df['Title'].str.strip().ne('')]

In [23]:
# Drop every row that still contains a NaN in any column
news_df = news_df.dropna()

In [24]:
# Drop ALL rows whose title occurs more than once (keep=False removes every copy,
# not just the repeats). The result is rebound rather than mutated with
# inplace=True, so re-running the cell is safe and no sliced frame is modified in place.
news_df = news_df.drop_duplicates(subset="Title", keep=False)

In [25]:
# Save the cleaned data next to the raw input. The directory is spelled "data"
# (lowercase) to match the path the raw CSV is read from; "../Data" is a different
# directory on case-sensitive filesystems and would make this write fail there.
news_df.to_csv("../data/Clean_data.csv", index=False)