# Import NLTK, TensorFlow/Keras, and scikit-learn Libraries

In [1]:
import re
import csv
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Define Data Cleaning Function

In [2]:
def load_csv(filename):
    """Load article features and labels from a CSV file.

    For every data row, columns 1 and 2 are kept as the feature pair and
    column 3 as the label; column 0 is ignored.

    Rows are skipped when:
      * column 1 equals ``'title'`` (treated as the header row),
      * the row has fewer than four columns (for example a blank line), or
      * column 1 or column 2 is empty or contains only whitespace.

    The last two cases are counted as "blank" rows in the printed summary.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a list of two-element
        lists (columns 1 and 2 of each kept row) and ``labels`` is the list
        of column-3 strings in the same order.
    """
    features = []
    labels = []
    blank_rows = 0
    # newline='' is what the csv module asks for so that line breaks embedded
    # inside quoted fields are handed to the parser untouched.
    with open(filename, 'r', newline='') as f:
        for line in csv.reader(f):
            # Header row: skipped without being counted as blank.
            if len(line) > 1 and line[1] == 'title':
                continue
            # Blank lines come back as [] and truncated rows are too short to
            # index; drop them instead of raising IndexError.
            if len(line) < 4:
                blank_rows += 1
                continue
            # Treat empty and whitespace-only fields as blank, not just ' '.
            if not line[1].strip() or not line[2].strip():
                blank_rows += 1
                continue
            features.append(line[1:3])
            labels.append(line[3])
    print("Loaded csv file, and there are " + str(blank_rows) + " blank text have been removed from dataset\n"
          + str(len(features)) + " data points in total")
    return features, labels

In [3]:
def remove_url(features):
    """Replace every http:// or https:// link in each row's text with a space.

    The rows are modified in place: index 1 of every row is rewritten. Only
    tokens that actually start with ``http://`` or ``https://`` are removed,
    so ordinary words that merely begin with "http" are left alone. The
    printed total is the number of links replaced across all rows.

    Args:
        features: List of rows whose element at index 1 is the text string.
    """
    url_pattern = re.compile(r'https?://\S+')
    count = 0
    for i in features:
        # subn returns the new string and how many links it replaced.
        i[1], replaced = url_pattern.subn(' ', i[1])
        count += replaced
    print("There are "+str(count)+" url have been removed from text")

In [4]:
def remove_newline(features):
    """Replace each line-feed and carriage-return character with a space.

    The rows are modified in place: index 1 of every row is rewritten. Every
    ``\\n`` and every ``\\r`` becomes one space, so a ``\\r\\n`` pair or a
    double newline yields two spaces.

    Args:
        features: List of rows whose element at index 1 is the text string.
    """
    # One translation pass covers both characters; a later replace of '\n\n'
    # could never match because no '\n' is left by then.
    line_breaks = str.maketrans({'\n': ' ', '\r': ' '})
    for i in features:
        i[1] = i[1].translate(line_breaks)
    print("Newline symbols have been removed from text")

In [5]:
def remove_number(features):
    """Swap each run of digits in the text (index 1 of every row) for one space.

    Rows are updated in place; nothing is returned.
    """
    digit_run = re.compile(r'\d+')
    for row in features:
        row[1] = digit_run.sub(' ', row[1])
    print("Numbers have been removed from text")

In [6]:
def remove_punctuation(features):
    """Replace every character that is not an ASCII letter with a space.

    Works on index 1 of each row and updates the rows in place.
    """
    non_letter = re.compile('[^a-zA-Z]')
    for row in features:
        cleaned = non_letter.sub(' ', row[1])
        row[1] = cleaned
    print("Punctuations have been removed from text")

In [7]:
def convert_into_lowercase(features):
    """Lower-case the text stored at index 1 of every row, in place."""
    for position in range(len(features)):
        row = features[position]
        row[1] = str.lower(row[1])
    print("All text have been converted into lowercase")

In [8]:
def tokenization(features):
    """Tokenize the text at index 1 of every row with ``word_tokenize``.

    Each row is updated in place: index 1 is overwritten with whatever
    ``word_tokenize`` returns for the original string.
    """
    for row in features:
        tokens = word_tokenize(row[1])
        row[1] = tokens
    print("Preformed tokenization")

In [9]:
def remove_stopwords(features):
    """Drop English stopwords from the token list at index 1 of every row.

    The stopword set comes from ``stopwords.words('english')``. Rows are
    updated in place and token order is preserved.
    """
    english_stopwords = set(stopwords.words('english'))
    for row in features:
        kept = []
        for token in row[1]:
            if token not in english_stopwords:
                kept.append(token)
        row[1] = kept
    print("Stopwords have been removed")

In [10]:
def normalization(features):
    """Stem, then verb-lemmatize, the tokens at index 1 of every row.

    Two passes are made over ``features``: the first replaces every token
    with its ``PorterStemmer`` stem, the second runs ``WordNetLemmatizer``
    with ``pos='v'`` over the stemmed tokens. Rows are updated in place.
    """
    stemmer = PorterStemmer()
    lemma = WordNetLemmatizer()

    def as_verb(token):
        return lemma.lemmatize(word=token, pos='v')

    # NOTE(review): the lemmatizer only ever sees stems, which may no longer
    # be dictionary words -- confirm this ordering is intentional.
    for row in features:
        row[1] = list(map(stemmer.stem, row[1]))
    for row in features:
        row[1] = list(map(as_verb, row[1]))
    print("Text has been normalized")

In [11]:
def remove_short_words(features, min_length=3):
    """Drop tokens shorter than ``min_length`` from every row's token list.

    Rows are updated in place: index 1 of each row is replaced by a new list
    holding only the tokens whose length is at least ``min_length``.

    Args:
        features: List of rows whose element at index 1 is a list of tokens.
        min_length: Smallest token length to keep. The default of 3 keeps
            the previous behaviour of discarding one- and two-character
            tokens.
    """
    for i in features:
        i[1] = [word for word in i[1] if len(word) >= min_length]
    print("Short words have been removed")

In [12]:
def recover_to_string(features):
    """Join each row's token list (index 1) back into one space-separated string.

    Rows are updated in place; nothing is returned.
    """
    separator = ' '
    for row in features:
        row[1] = separator.join(row[1])
    print('The text have been recovered from words to string')

# Clean Data

In [13]:
# Read ./news.csv: features_data gets columns 1-2 of every kept row and
# labels_data gets column 3; load_csv drops the header and blank rows.
features_data, labels_data = load_csv("./news.csv")

Loaded csv file, and there are 36 blank text have been removed from dataset
6299 data points in total


In [14]:
# Strip links from the text column (index 1 of each row); rows are changed in place.
remove_url(features_data)

There are 295 url have been removed from text


In [15]:
# Turn \n and \r characters in the text column into spaces, in place.
remove_newline(features_data)

Newline symbols have been removed from text


In [16]:
# Replace runs of digits in the text column with a space, in place.
remove_number(features_data)

Numbers have been removed from text


In [17]:
# Replace every character outside a-z / A-Z with a space, in place. The
# pattern also matches any digits or line breaks that are still present.
remove_punctuation(features_data)

Punctuations have been removed from text


In [18]:
# Lower-case the text column, in place.
convert_into_lowercase(features_data)

All text have been converted into lowercase


In [19]:
# Run word_tokenize over the text column. From here until
# recover_to_string, index 1 of each row holds the tokenizer output
# instead of the original string.
tokenization(features_data)

Preformed tokenization


In [20]:
# Filter NLTK's English stopwords out of each token list, in place.
remove_stopwords(features_data)

Stopwords have been removed


In [21]:
# Porter-stem every token, then lemmatize the stems as verbs, in place.
normalization(features_data)

Text has been normalized


In [22]:
# Discard tokens of one or two characters, in place.
remove_short_words(features_data)

Short words have been removed


In [23]:
# Join each token list back into a single space-separated string, in place.
recover_to_string(features_data)

The text have been recovered from words to string


# Drop News Title

In [24]:
def extract_text(features_data):
    """Return a new list holding element 1 of every row in ``features_data``.

    The input rows are not modified.
    """
    return [row[1] for row in features_data]

In [25]:
# Keep only the text of each row; the title at index 0 is discarded.
# NOTE(review): this rebinds features_data from rows to plain strings, so
# re-running the cell would index into those strings -- run it once only.
features_data = extract_text(features_data)

# Encode Labels & Split Data

In [26]:
def encode_labels(labels):
    """Encode labels with a freshly fitted ``LabelEncoder``.

    ``labels`` is converted to a NumPy array first; the result of
    ``fit_transform`` on that array is returned. The fitted encoder itself
    is not kept.
    """
    label_array = np.array(labels)
    encoder = LabelEncoder()
    return encoder.fit_transform(label_array)

In [27]:
# Replace the label strings read from the CSV with LabelEncoder's encoding.
labels_data = encode_labels(labels_data)

In [28]:
def split_data(features, labels, test_proportion):
    """Split features and labels into leading train and trailing test parts.

    The split is positional: the first ``len(features) * (1 - test_proportion)``
    items (rounded down) form the training set and the rest form the test
    set. No shuffling is performed.

    Args:
        features: Sliceable sequence of samples.
        labels: Sliceable sequence of labels, aligned with ``features``.
        test_proportion: Fraction of the data to place in the test set;
            must lie between 0 and 1 inclusive.

    Returns:
        ``((train_x, train_y), (test_x, test_y))`` where every element is a
        NumPy array created with ``dtype=object``.

    Raises:
        ValueError: If ``test_proportion`` is outside the range [0, 1].
    """
    if not 0 <= test_proportion <= 1:
        raise ValueError("test_proportion must be between 0 and 1, got " + repr(test_proportion))
    # Round away float noise before truncating: 100 * (1 - 0.9) evaluates to
    # 9.999999999999998, which a bare int() would turn into 9 instead of 10.
    index = int(round(len(features) * (1 - test_proportion), 9))
    # NOTE(review): the split follows input order -- confirm the rows are not
    # sorted by label before relying on the resulting test set.
    train_x, train_y = np.array(features[:index], dtype=object), np.array(labels[:index], dtype=object)
    test_x, test_y = np.array(features[index:], dtype=object), np.array(labels[index:], dtype=object)
    return (train_x, train_y), (test_x, test_y)

In [29]:
# Hold out the last 10% of the rows as the test set (positional split, no shuffling).
(x_train, y_train), (x_test, y_test) = split_data(features_data, labels_data, 0.1)

In [30]:
# split_data builds its arrays with dtype=object, so cast the encoded
# labels back to integers before they are used for training and scoring.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [31]:
# Sanity check: show the shape of each split, one per line.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')

(5669,)
(5669,)
(630,)
(630,)


# Training Machine Learning Model

In [32]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [33]:
# Model 1: multinomial naive Bayes on TF-IDF-weighted token counts.
pipe1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB()),
])
model1 = pipe1.fit(x_train, y_train)
result1 = model1.predict(x_test)
# NOTE(review): the recorded output shows micro-averaged F1 equal to
# accuracy -- confirm 'micro' is the intended average.
print("Accuracy: ", accuracy_score(y_test, result1))
print("F1 Score: ", f1_score(y_test, result1, average='micro'))

Accuracy:  0.8333333333333334
F1 Score:  0.8333333333333334


In [34]:
# Model 2: logistic regression with 5-fold cross-validated regularization
# strength, on TF-IDF-weighted token counts.
pipe2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, max_iter=300)),
])
model2 = pipe2.fit(x_train, y_train)
result2 = model2.predict(x_test)
# NOTE(review): the recorded output shows micro-averaged F1 equal to
# accuracy -- confirm 'micro' is the intended average.
print("Accuracy: ", accuracy_score(y_test, result2))
print("F1 Score: ", f1_score(y_test, result2, average='micro'))

Accuracy:  0.9317460317460318
F1 Score:  0.9317460317460318


In [35]:
# Model 3: 5-nearest-neighbours classifier on TF-IDF-weighted token counts.
pipe3 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', KNeighborsClassifier(n_neighbors=5)),
])
model3 = pipe3.fit(x_train, y_train)
result3 = model3.predict(x_test)
# NOTE(review): the recorded output shows micro-averaged F1 matching
# accuracy up to float rounding -- confirm 'micro' is the intended average.
print("Accuracy: ", accuracy_score(y_test, result3))
print("F1 Score: ", f1_score(y_test, result3, average='micro'))

Accuracy:  0.8698412698412699
F1 Score:  0.8698412698412697


In [36]:
# Model 4: linear-kernel support vector classifier on TF-IDF-weighted token counts.
pipe4 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', SVC(kernel='linear', random_state=1)),
])
model4 = pipe4.fit(x_train, y_train)
result4 = model4.predict(x_test)
# NOTE(review): the recorded output shows micro-averaged F1 equal to
# accuracy -- confirm 'micro' is the intended average.
print("Accuracy: ", accuracy_score(y_test, result4))
print("F1 Score: ", f1_score(y_test, result4, average='micro'))

Accuracy:  0.9333333333333333
F1 Score:  0.9333333333333333


In [37]:
# Model 5: decision tree on TF-IDF-weighted token counts.
# random_state is fixed so that re-running the cell reproduces the same tree
# and therefore the same scores; without it the fitted tree can change
# between runs.
pipe5 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', DecisionTreeClassifier(random_state=0)),
])
model5 = pipe5.fit(x_train, y_train)
result5 = model5.predict(x_test)
# NOTE(review): the recorded output shows micro-averaged F1 equal to
# accuracy -- confirm 'micro' is the intended average.
print("Accuracy: ", accuracy_score(y_test, result5))
print("F1 Score: ", f1_score(y_test, result5, average='micro'))

Accuracy:  0.7904761904761904
F1 Score:  0.7904761904761904
