## Nurullainy binti Mat Rashid

### Milestone 4: Data Interpretation & Communication of Insights of data  

### News Data Analysis - Data Preprocessing

In [1]:
import pandas as pd
import re
from textblob import TextBlob

In [2]:
# Load Dataset

newsdata = pd.read_csv('news_raw.csv')

In [3]:
newsdata.head(10)

Unnamed: 0,DateTime,news_headline
0,12:32 AM06/03/2020 12:32:41 AM UTC-0400,"BlackRock focuses on China credit, oil and tra..."
1,06/02/2020,UPDATE 1-U.S. shale producers begin restoring ...
2,06/02/2020,UPDATE 1-U.S. sanctions four shipping firms fo...
3,06/02/2020,Solar power to remain 'important and consisten...
4,06/02/2020,Record gold price against Aussie dollar impact...
5,06/02/2020,Contact Gold Receives Permits for Drilling at ...
6,06/02/2020,Oil prices rise ahead of OPEC+ meeting on exte...
7,06/01/2020,U.S. appeals court: Volkswagen may face 'enorm...
8,06/01/2020,Is Kinross Gold (KGC) a Great Value Stock Righ...
9,06/01/2020,Zacks.com featured highlights include: B&G Foo...


In [4]:
# Find dimension of the raw dataset

newsdata.shape

(16844, 2)

The dataset contains about 17 thousands rows and 2 columns

### Working with missing values & duplicate data

In [5]:
# Find number of missing values or NaN 

newsdata.isnull().sum()

DateTime         802
news_headline    802
dtype: int64

In [6]:
# To verify, we only drop if the row have no values at all

newsdata.dropna(how='all').shape

(16042, 2)

In [7]:
# Drop duplicates

newsdata.drop_duplicates(inplace=True)

In [8]:
# Drop missing values

newsdata = newsdata.dropna(how='all')
newsdata.isnull().sum()

DateTime         0
news_headline    0
dtype: int64

### Change data type

In [10]:
# slicing row for long date format

newsdata = newsdata.loc[newsdata.DateTime.str.len() <11] 

In [11]:
# Change Date format to string

newsdata['DateTime'] = pd.to_datetime(newsdata.DateTime)
newsdata['DateTime'] = newsdata['DateTime'].dt.strftime('%d-%m-%Y')
newsdata.rename(columns={'DateTime':'Date'}, inplace=True)

In [12]:
# Change news headline from integer to nominal/str

newsdata['news_headline'] = newsdata['news_headline'].astype(str)

In [13]:
print(newsdata.info())
newsdata.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14684 entries, 1 to 16843
Data columns (total 2 columns):
Date             14684 non-null object
news_headline    14684 non-null object
dtypes: object(2)
memory usage: 344.2+ KB
None


Unnamed: 0,Date,news_headline
1,02-06-2020,UPDATE 1-U.S. shale producers begin restoring ...
2,02-06-2020,UPDATE 1-U.S. sanctions four shipping firms fo...
3,02-06-2020,Solar power to remain 'important and consisten...
4,02-06-2020,Record gold price against Aussie dollar impact...
5,02-06-2020,Contact Gold Receives Permits for Drilling at ...
6,02-06-2020,Oil prices rise ahead of OPEC+ meeting on exte...
7,01-06-2020,U.S. appeals court: Volkswagen may face 'enorm...
8,01-06-2020,Is Kinross Gold (KGC) a Great Value Stock Righ...
9,01-06-2020,Zacks.com featured highlights include: B&G Foo...
10,01-06-2020,Oil steady as OPEC+ considers extension to out...


In [14]:
# Export data as csv file

newsdata.to_csv('/Volumes/Lainy 1T WD/WQD7005 Data Mining /Assignment /M3 - Accessing Hive /Preprocessing/news_preprocessed.csv', 
               index=False)

## Finding News Polarity

In [15]:
newsdata = pd.read_csv('news_preprocessed.csv')

In [16]:
# Filter down to only news about crude oil

for i in range(0,len(newsdata)):
    if re.search(r'oil', newsdata['news_headline'][i],re.I) is not None:
        continue
    else:
        newsdata.drop(i, inplace=True)

In [17]:
newsdata.shape

(7710, 2)

From 16,844 of news, 7,710 are news that relate to oil 

In [18]:
# Resetting index

newsdata.sort_values(by=['Date'],inplace=True)
newsdata = newsdata.reset_index(drop=True)

In [19]:
# Derive sentiments in terms of polarity and subjectivity using textblob

newsdata['polarity'] = ''
newsdata['subjectivity'] = ''
newsdata['polarity_description'] = ''

for i in range(0,len(newsdata)):
    blob = TextBlob(newsdata['news_headline'][i])
    Sentiment = blob.sentiment
    newsdata['polarity'][i] = Sentiment.polarity
    newsdata['subjectivity'][i] = Sentiment.subjectivity
    
    if Sentiment.polarity > 0:
        newsdata['polarity_description'][i] = 'Positive'
    elif Sentiment.polarity < 0:
        newsdata['polarity_description'][i] = 'Negative'
    else:
        newsdata['polarity_description'][i] = 'Neutral'
        
newsdata['polarity'] = pd.to_numeric(newsdata['polarity'])
newsdata['Date'] = pd.to_datetime(newsdata['Date'])

In [20]:
newsdata.head(10)

Unnamed: 0,Date,news_headline,polarity,subjectivity,polarity_description
0,2020-01-01,Johan Sverdrup: Norway’s big bet on a rosy fut...,0.0,0.1125,Neutral
1,2020-01-02,India exempts very low sulphur fuel oil from i...,0.0,0.39,Neutral
2,2020-01-02,Oil majors request more Mozambique troops afte...,0.5,0.5,Positive
3,2020-01-02,India raises import tax on crude palm oil to 44%,-0.7,1.0,Negative
4,2020-01-03,Russia's Putin says current oil prices acceptable,0.0,0.4,Neutral
5,2020-01-03,Why Is Imperial Oil (IMO) Down 10.2% Since Las...,-0.077778,0.177778,Negative
6,2020-01-03,"Putin, ahead of OPEC meeting, says current oil...",0.0,0.4,Neutral
7,2020-01-03,OPEC could deepen oil supply cuts with or with...,0.0,0.0,Neutral
8,2020-01-03,Speculators raise their bets against US oil an...,0.0,0.0,Neutral
9,2020-01-03,Oil bounces from multi-year lows as hopes of O...,0.0,0.0,Neutral


In [21]:
newsdata.to_csv('news_interpretation.csv',index=False)

In [22]:
polarity_mean = newsdata.groupby('Date', as_index=False)['polarity'].mean()
polarity_mean.rename(columns={'polarity':'polarity_mean'},inplace=True)
polarity_mean.to_csv('polarity_mean.csv',index=False)