# Import NLTK, TensorFlow, Keras Library

In [44]:
import re
import csv
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Define Data Cleaning Function

In [2]:
def load_csv(filename):
    features = []
    labels = []
    count = 1
    blank_data_number_list = []
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        for line in reader:
            # print(line[1:3])
            if line[1] == 'title':
                count += 1
                continue
            elif line[1] == ' ' or line[2] == ' ':
                blank_data_number_list.append(count)
                count += 1
                continue
            features.append(line[1:3])
            labels.append(line[3])
            count += 1
    print("Loaded csv file, and there are " + str(len(blank_data_number_list)) + " blank text have been removed "
                                                                                 "from dataset\n" + str(len(features)) +
          " data points in total")
    return features, labels

In [4]:
def remove_url(features):
    count = 0
    for i in features:
        if 'http://' in i[1] or 'https://' in i[1]:
            count += 1
            i[1] = re.sub(r'http\S+', ' ', i[1])
    print("There are "+str(count)+" url have been removed from text")

In [6]:
def remove_newline(features):
    for i in features:
        i[1] = i[1].replace('\n', ' ').replace('\r', ' ').replace('\n\n', ' ')
    print("Newline symbols have been removed from text")

In [8]:
def remove_number(features):
    for i in features:
        i[1] = re.sub(r'\d+', ' ', i[1])
    print("Numbers have been removed from text")

In [10]:
def remove_punctuation(features):
    for i in features:
        i[1] = re.sub('[^a-zA-Z]', ' ', i[1])
    print("Punctuations have been removed from text")

In [12]:
def convert_into_lowercase(features):
    for i in features:
        i[1] = i[1].lower()
    print("All text have been converted into lowercase")

In [14]:
def tokenization(features):
    for i in features:
        i[1] = word_tokenize(i[1])
    print("Preformed tokenization")

In [16]:
def remove_stopwords(features):
    stop_words = set(stopwords.words('english'))
    for i in features:
        i[1] = [words for words in i[1] if not words in stop_words]
    print("Stopwords have been removed")

In [18]:
def normalization(features):
    stemmer = PorterStemmer()
    lemma = WordNetLemmatizer()
    for i in features:
        i[1] = [stemmer.stem(word) for word in i[1]]
    for i in features:
        i[1] = [lemma.lemmatize(word=word, pos='v') for word in i[1]]
    print("Text has been normalized")

In [20]:
def remove_short_words(features):
    for i in features:
        i[1] = [word for word in i[1] if len(word) > 2]
    print("Short words have been removed")

# Clean Data

In [3]:
features_data, labels_data = load_csv("./news.csv")

Loaded csv file, and there are 36 blank text have been removed from dataset
6299 data points in total


In [5]:
remove_url(features_data)

There are 295 url have been removed from text


In [7]:
remove_newline(features_data)

Newline symbols have been removed from text


In [9]:
remove_number(features_data)

Numbers have been removed from text


In [11]:
remove_punctuation(features_data)

Punctuations have been removed from text


In [13]:
convert_into_lowercase(features_data)

All text have been converted into lowercase


In [15]:
tokenization(features_data)

Preformed tokenization


In [17]:
remove_stopwords(features_data)

Stopwords have been removed


In [19]:
normalization(features_data)

Text has been normalized


In [21]:
remove_short_words(features_data)

Short words have been removed


# Drop News Titile

In [None]:
def extract_text(features_data):
    extracted = []
    for i in features_data:
        extracted.append(i[1])
    return extracted

In [None]:
features_data = extract_text(features_data)

# Unique Numbers Vectorize

In [None]:
def vectorize(extracted):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(extracted)
    extracted = tokenizer.texts_to_sequences(extracted)
    return extracted

In [None]:
features_data = vectorize(features_data)

# Padding

In [None]:
import matplotlib.pyplot as plt
plt.hist([len(x) for x in extracted], bins=500)
plt.show()

In [None]:
nos = np.array([len(x) for x in extracted])
len(nos[nos  < 1000])

In [None]:
maxlen = 1000
features_data = pad_sequences(features_data, maxlen=maxlen)

# Word2Vec

In [34]:
features_data[0][1].shape

(1, 1000)

In [40]:
type(labels_data)

list

In [54]:
labels_data = np.array(labels_data)
labels_data = LabelEncoder().fit_transform(labels_data)

In [35]:
def split_data(features, labels, test_proportion):
    index = int(len(features) * (1 - test_proportion))
    train_x, train_y = np.array(features[:index],dtype=object), np.array(labels[:index],dtype=object)
    test_x, test_y = np.array(features[index:],dtype=object), np.array(labels[index:],dtype=object)
    return (train_x, train_y), (test_x, test_y)

In [55]:
(x_train, y_train), (x_test, y_test) = split_data(features_data, labels_data, 0.2)

In [56]:
print(x_train[0][1].shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
type(x_train)

(1, 1000)
(5039,)
(1260, 2)
(1260,)


numpy.ndarray

In [57]:
y_train

array([0, 0, 1, ..., 0, 0, 1], dtype=object)