In [5]:
import os

import pandas as pd


In [6]:
# Load the raw news data from 'news_raw.csv' (path is relative to the working directory)
newsdata = pd.read_csv('news_raw.csv')

In [7]:
# Preview the first 10 rows of the raw data
newsdata.head(10)

Unnamed: 0,DateTime,news_headline
0,12:32 AM06/03/2020 12:32:41 AM UTC-0400,"BlackRock focuses on China credit, oil and tra..."
1,06/02/2020,UPDATE 1-U.S. shale producers begin restoring ...
2,06/02/2020,UPDATE 1-U.S. sanctions four shipping firms fo...
3,06/02/2020,Solar power to remain 'important and consisten...
4,06/02/2020,Record gold price against Aussie dollar impact...
5,06/02/2020,Contact Gold Receives Permits for Drilling at ...
6,06/02/2020,Oil prices rise ahead of OPEC+ meeting on exte...
7,06/01/2020,U.S. appeals court: Volkswagen may face 'enorm...
8,06/01/2020,Is Kinross Gold (KGC) a Great Value Stock Righ...
9,06/01/2020,Zacks.com featured highlights include: B&G Foo...


In [8]:
# Dimensions of the raw dataset, shown as a (rows, columns) tuple

newsdata.shape

(16844, 2)

### Working with missing values

In [9]:
# Count the missing (NaN) entries in each column

newsdata.isna().sum()

DateTime         802
news_headline    802
dtype: int64

In [10]:
# Show the first few rows whose DateTime value is missing
newsdata.loc[newsdata['DateTime'].isna()].head()

Unnamed: 0,DateTime,news_headline
202,,
210,,
228,,
230,,
235,,


In [11]:
# Sanity check: shape we would get if only the rows where every column is
# missing were removed (nothing is assigned here)

row_is_empty = newsdata.isna().all(axis=1)
newsdata.loc[~row_is_empty].shape

(16042, 2)

In [12]:
# Remove rows that exactly repeat an earlier row

newsdata = newsdata.drop_duplicates()

In [13]:
# Discard the rows in which every column is NaN, then display the
# per-column count of missing values that remain

newsdata = newsdata.dropna(axis=0, how='all')
newsdata.isna().sum()

DateTime         0
news_headline    0
dtype: int64

In [14]:
# DataFrame.info() prints its own report and returns None, so passing it to
# print() produced "Dataframe dimension: None". Call info() directly and
# report the dimensions from .shape instead.
newsdata.info()
print("\nDataframe dimension:", newsdata.shape)
newsdata.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14686 entries, 0 to 16843
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   DateTime       14686 non-null  object
 1   news_headline  14686 non-null  object
dtypes: object(2)
memory usage: 344.2+ KB

Dataframe dimension: None


Unnamed: 0,DateTime,news_headline
0,12:32 AM06/03/2020 12:32:41 AM UTC-0400,"BlackRock focuses on China credit, oil and tra..."
1,06/02/2020,UPDATE 1-U.S. shale producers begin restoring ...
2,06/02/2020,UPDATE 1-U.S. sanctions four shipping firms fo...
3,06/02/2020,Solar power to remain 'important and consisten...
4,06/02/2020,Record gold price against Aussie dollar impact...


In [15]:
# Keep only the rows whose DateTime is a plain date such as '06/02/2020'
# (at most 10 characters). Longer values, e.g.
# '12:32 AM06/03/2020 12:32:41 AM UTC-0400', are dropped rather than parsed.
MAX_DATE_LEN = 10

# .copy() makes the filtered frame independent of the original, so assigning
# columns to it in later cells does not raise SettingWithCopyWarning.
newsdata = newsdata.loc[newsdata.DateTime.str.len() <= MAX_DATE_LEN].copy()
newsdata.head()


Unnamed: 0,DateTime,news_headline
1,06/02/2020,UPDATE 1-U.S. shale producers begin restoring ...
2,06/02/2020,UPDATE 1-U.S. sanctions four shipping firms fo...
3,06/02/2020,Solar power to remain 'important and consisten...
4,06/02/2020,Record gold price against Aussie dollar impact...
5,06/02/2020,Contact Gold Receives Permits for Drilling at ...


In [19]:
# Parse the 'MM/DD/YYYY' strings, re-format them as 'DD-MM-YYYY' strings and
# rename the column from 'DateTime' to 'Date'.
#
# The explicit format stops pandas from guessing the month/day order value by
# value (a string such as '13/02/2020' could otherwise be read day-first) and
# raises on any value that does not match instead of mis-parsing it silently.
parsed_dates = pd.to_datetime(newsdata['DateTime'], format='%m/%d/%Y')

# assign/rename build a new frame, which avoids writing into a filtered slice.
newsdata = (
    newsdata
    .assign(DateTime=parsed_dates.dt.strftime('%d-%m-%Y'))
    .rename(columns={'DateTime': 'Date'})
)

In [20]:
# Make sure the news_headline column holds Python strings

newsdata = newsdata.astype({'news_headline': str})

In [21]:
# info() writes its summary to stdout and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
newsdata.info()
newsdata.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14684 entries, 1 to 16843
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Date           14684 non-null  object
 1   news_headline  14684 non-null  object
dtypes: object(2)
memory usage: 344.2+ KB
None


Unnamed: 0,Date,news_headline
1,02-06-2020,UPDATE 1-U.S. shale producers begin restoring ...
2,02-06-2020,UPDATE 1-U.S. sanctions four shipping firms fo...
3,02-06-2020,Solar power to remain 'important and consisten...
4,02-06-2020,Record gold price against Aussie dollar impact...
5,02-06-2020,Contact Gold Receives Permits for Drilling at ...


In [22]:
# Export the cleaned data as a CSV file (the index is not written).
#
# The destination folder defaults to the original external-drive location, but
# can be redirected on another machine by setting the NEWS_OUTPUT_DIR
# environment variable instead of editing this cell.
DEFAULT_OUTPUT_DIR = '/Volumes/Lainy 1T WD/WQD7005 Data Mining /Assignment /M3 - Accessing Hive /Preprocessing'
output_dir = os.environ.get('NEWS_OUTPUT_DIR', DEFAULT_OUTPUT_DIR)

newsdata.to_csv(os.path.join(output_dir, 'news_preprocessed.csv'),
                index=False)