# Overview of Notebook

This notebook is split into 3 sections:
- Importing of Libraries
- Processing the article(s) data and scoring it based on a predefined risk scoring framework
- Plotting the article(s) data in a 3D Risk Scoring Cube

Outputs:
- A CSV file, "test.csv", with 3 additional columns:
    - New Severity Score
    - New Human Control Score
    - New Likelihood Score
- An image of the 3D Risk Scoring Cube: "3D_Risk_Scoring_Cube.png"

In [1]:
# Import libraries

import pandas as pd
import re 

import nltk 
from nltk.tokenize import RegexpTokenizer
from collections import Counter 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost 
from sklearn.metrics  import classification_report
from sklearn import metrics
import time


In [2]:
# Load the labelled news-article dataset. Later cells read its
# 'content summary' (article text) and 'new_class_name' (target label) columns.
df = pd.read_csv("Consolidated_News_Articles_With_Sentiment_Labels.csv")

In [3]:
def _stop_word_set():
    """
    Function: return the stop words to drop during preprocessing.
    Return:
      (frozenset of str): NLTK's English stop words plus the extra words
      'said', 'us', 'also' and 'mr'. Built on first use and cached on the
      function object so the corpus is not re-read for every article.
    """
    if not hasattr(_stop_word_set, "cache"):
        _stop_word_set.cache = (
            frozenset(stopwords.words("english")) | {'said', 'us', 'also', 'mr'}
        )
    return _stop_word_set.cache


def preprocess(text):

    """
    Function: split text into words and return the root form of the words
    Args:
      text(str): the article (any other type is coerced with str())
    Return:
      lem(list of str): a list of the root form of the article words,
      lower-cased, with stop words removed
    """

    # Normalize text: lower-case, then replace every non-letter with a space
    text = re.sub(r"[^a-zA-Z]", " ", str(text).lower())

    # Tokenize text
    token = word_tokenize(text)

    # Remove stop words (set membership is O(1) per token, unlike a list scan)
    stop = _stop_word_set()
    words = [t for t in token if t not in stop]

    # Lemmatization: reuse a single lemmatizer instead of building one per word
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(w) for w in words]

    return lem

# Clean every article summary. Keep both the token list and its space-joined
# string form; the string form is what gets fed to the TF-IDF vectoriser below.
df["Preprocessed_Text"] = df['content summary'].apply(preprocess)
df['Preprocessed_Text2'] = df['Preprocessed_Text'].apply(' '.join)

X = df['Preprocessed_Text2']
y = df['new_class_name']

# Hold out 20% of the articles for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Learn the vocabulary/IDF weights from the training split only and vectorise
# it in the same pass (previously fit() and transform() were called separately,
# with the fit() result assigned to train_features and immediately overwritten).
tf_vec = TfidfVectorizer()
train_features = tf_vec.fit_transform(X_train)
test_features = tf_vec.transform(X_test)
train_features

<10997x34861 sparse matrix of type '<class 'numpy.float64'>'
	with 632921 stored elements in Compressed Sparse Row format>

In [4]:
# Preview the first rows of the DataFrame, including the columns added by the preprocessing cell
df.head()

Unnamed: 0,date,location,news title,news source(url),content summary,keywords,class_name,new_class_name,Preprocessed_Text,Preprocessed_Text2,sentiment score,compound score,sentiment label
0,Feb-22,MUMBAI (REUTERS),Himalayan avalanche kills seven Indian soldier...,https://www.straitstimes.com/asia/himalayan-av...,MUMBAI (REUTERS) - A Himalayan avalanche kille...,"kills, avalanche, defence, kameng, china, sold...",natural calamities,geophysical event,"[mumbai, reuters, himalayan, avalanche, killed...",mumbai reuters himalayan avalanche killed seve...,"{'neg': 0.148, 'neu': 0.773, 'pos': 0.079, 'co...",-0.7506,negative
1,Feb-22,ZURICH (REUTERS),Eight killed in two days after third deadly av...,https://www.straitstimes.com/world/europe/eigh...,ZURICH (REUTERS) - One person was killed and f...,"person, avalanche, tyrol, killed, days, skiers...",natural calamities,geophysical event,"[zurich, reuters, one, person, killed, four, o...",zurich reuters one person killed four others i...,"{'neg': 0.276, 'neu': 0.63, 'pos': 0.094, 'com...",-0.9665,negative
2,Feb-22,VIENNA (REUTERS),Avalanche in Austria near Swiss border kills five,https://www.straitstimes.com/world/europe/aval...,VIENNA (REUTERS) - An avalanche in an area of ...,"kills, avalanche, person, services, supervisor...",natural calamities,geophysical event,"[vienna, reuters, avalanche, area, austria, bo...",vienna reuters avalanche area austria borderin...,"{'neg': 0.258, 'neu': 0.607, 'pos': 0.135, 'co...",-0.91,negative
3,Feb-22,NEW DELHI (REUTERS),Himalayan avalanche traps Indian Army patrol t...,https://www.straitstimes.com/asia/south-asia/h...,NEW DELHI (REUTERS) - A Himalayan avalanche tr...,"arunachal, avalanche, army, defence, team, chi...",natural calamities,geophysical event,"[new, delhi, reuters, himalayan, avalanche, tr...",new delhi reuters himalayan avalanche trapped ...,"{'neg': 0.045, 'neu': 0.782, 'pos': 0.172, 'co...",0.8225,positive
4,Feb-21,SALT LAKE CITY (NYTIMES),"4 skiers killed in avalanche in Utah, official...",https://www.straitstimes.com/world/united-stat...,SALT LAKE CITY (NYTIMES) - Four back-country s...,"avalanche, states, killed, skiers, saidthe, sa...",natural calamities,geophysical event,"[salt, lake, city, nytimes, four, back, countr...",salt lake city nytimes four back country skier...,"{'neg': 0.258, 'neu': 0.703, 'pos': 0.039, 'co...",-0.9509,negative


In [5]:
import pickle

# Load the previously trained classifier from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load model
# files that come from a trusted source.
file_name = "gb_reg.pkl"
# The context manager closes the file handle once the model is loaded
# (the bare open() used before left it open).
with open(file_name, "rb") as model_file:
    gb = pickle.load(model_file)



In [6]:
def classify_article(path, min_confidence=0.56):

    """
    Function: classify an article.
    Args:
      path: the path of the article
      min_confidence (float): minimum top-class probability required to accept
        the prediction; below it the article is labelled "Not related".
        Defaults to 0.56.
    Return:
      (tuple): a pair (category, scores)
        category (str): the predicted category of the article, or
          "Not related" when the highest class probability is below
          min_confidence
        scores (array): the probabilities returned by gb.predict_proba for
          this article (presumably ordered like gb.classes_ - confirm)
    """
    # Read file; the with-block closes it even if reading raises
    with open(path, 'r') as file:
        artcl = file.read()

    # Text preprocessing: same cleaning as the training data, re-joined to a string
    artcl = ' '.join(preprocess(artcl))

    # Use TF_IDF: vectorise with the already-fitted vectoriser (single-document batch)
    test = tf_vec.transform([artcl])

    # Use the loaded model to classify the article
    predict = gb.predict(test)
    predict_score = gb.predict_proba(test)[0]

    # Reject low-confidence predictions instead of forcing a category
    if predict_score.max() < min_confidence:
        category = "Not related"
    else:
        category = predict[0]

    return category, predict_score

In [7]:
# Try the classifier on a sample article file; prints the (category, class probabilities) tuple
print(classify_article('art1.txt'))

('idiosyncratic', array([0.0082724 , 0.02015195, 0.00854072, 0.01126903, 0.01010479,
       0.0031645 , 0.00922051, 0.00380629, 0.00558697, 0.00239477,
       0.01321341, 0.88174973, 0.02252494]))
