# Libraries

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

import gdown

In [3]:
train_file_id = '1GmhnQd56OIV-K_umlGjmzWgGXQqjl6rH'
train_output_file = '/content/train.csv'
train_download_url = f'https://drive.google.com/uc?id={train_file_id}'
gdown.download(train_download_url, train_output_file, quiet=False, fuzzy = True)
df = pd.read_csv(train_output_file, encoding = 'ISO-8859-1')

Downloading...
From: https://drive.google.com/uc?id=1GmhnQd56OIV-K_umlGjmzWgGXQqjl6rH
To: /content/train.csv
100%|██████████| 12.4M/12.4M [00:00<00:00, 61.7MB/s]


In [5]:
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,positivity,positivity:confidence,relevance,relevance:confidence,articleid,date,headline,positivity_gold,relevance_gold,text
0,842613455,False,finalized,3,12/5/15 17:48,3.0,0.64,yes,0.64,wsj_398217788,8/14/91,Yields on CDs Fell in the Latest Week,,,NEW YORK -- Yields on most certificates of dep...
1,842613456,False,finalized,3,12/5/15 16:54,,,no,1.0,wsj_399019502,8/21/07,The Morning Brief: White House Seeks to Limit ...,,,The Wall Street Journal Online</br></br>The Mo...
2,842613457,False,finalized,3,12/5/15 1:59,,,no,1.0,wsj_398284048,11/14/91,Banking Bill Negotiators Set Compromise --- Pl...,,,WASHINGTON -- In an effort to achieve banking ...
3,842613458,False,finalized,3,12/5/15 2:19,,0.0,no,0.675,wsj_397959018,6/16/86,Manager's Journal: Sniffing Out Drug Abusers I...,,,The statistics on the enormous costs of employ...
4,842613459,False,finalized,3,12/5/15 17:48,3.0,0.3257,yes,0.64,wsj_398838054,10/4/02,Currency Trading: Dollar Remains in Tight Rang...,,,NEW YORK -- Indecision marked the dollar's ton...


In [4]:
df = df[df.relevance != "not sure"]
df.shape

(7991, 15)

In [6]:
df['relevance'] = df.relevance.map({'yes':1, 'no':0})
df = df[["text","relevance"]]
df = df[:1000]
df.shape

(1000, 2)

# Text Processing and Cleaning

In [10]:
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

nlp = spacy.load('en_core_web_sm')

stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean(df):
    text_no_namedentities = []
    document = nlp(df)
    ents = [e.text for e in document.ents]
    for item in document:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    df = (" ".join(text_no_namedentities))

    df = df.lower().strip()
    df = df.replace("</br>", " ")
    df = df.replace("-", " ")
    df = "".join([char for char in df if char not in string.punctuation and not char.isdigit()])
    df = " ".join([token for token in df.split() if token not in stopwords])
    df = " ".join([lemmatizer.lemmatize(word) for word in df.split()])
    return df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
df['text'] = df['text'].apply(clean)
df.head()

Unnamed: 0,text,relevance
0,new york yield certificate deposit offered maj...,1
1,wall street journal online br morning brief lo...,0
2,effort achieve banking reform negotiator admin...,0
3,statistic enormous cost employee drug abuse we...,0
4,new york indecision marked dollar tone trader ...,1


# TF-IDF

In [13]:
docs = list(df['text'])
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features = 20000)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
docs = tfidf_vectorizer_vectors.toarray()

In [14]:
X = docs
y = df['relevance']
print(X.shape, y.shape)

(1000, 11261) (1000,)


## Train Test Split

In [15]:
SEED=123
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(800, 11261) (800,)
(200, 11261) (200,)


In [16]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

y_pred_train = gnb.predict(X_train)
y_pred_test = gnb.predict(X_test)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 0.99625
Testing Accuracy score: 0.76


In [17]:
print(classification_report(y_test, y_pred_test, target_names=['not relevant', 'relevant']))

              precision    recall  f1-score   support

not relevant       0.77      0.98      0.86       151
    relevant       0.57      0.08      0.14        49

    accuracy                           0.76       200
   macro avg       0.67      0.53      0.50       200
weighted avg       0.72      0.76      0.68       200



In [21]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

y_pred_train = mnb.predict(X_train)
y_pred_test = mnb.predict(X_test)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 0.7575
Testing Accuracy score: 0.755


In [22]:
print(classification_report(y_test, y_pred_test, target_names=['not relevant', 'relevant']))

              precision    recall  f1-score   support

not relevant       0.76      1.00      0.86       151
    relevant       0.00      0.00      0.00        49

    accuracy                           0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.57      0.76      0.65       200



In [24]:
lr = LogisticRegression(random_state=SEED)
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 0.81375
Testing Accuracy score: 0.77


In [25]:
print(classification_report(y_test, y_pred_test, target_names=['not relevant', 'relevant']))

              precision    recall  f1-score   support

not relevant       0.77      0.99      0.87       151
    relevant       0.80      0.08      0.15        49

    accuracy                           0.77       200
   macro avg       0.78      0.54      0.51       200
weighted avg       0.78      0.77      0.69       200



In [27]:
svc =  LinearSVC(class_weight='balanced')
svc.fit(X_train, y_train)

y_pred_train = svc.predict(X_train)
y_pred_test = svc.predict(X_test)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 0.99875
Testing Accuracy score: 0.72


In [28]:
print(classification_report(y_test, y_pred_test, target_names=['not relevant', 'relevant']))

              precision    recall  f1-score   support

not relevant       0.81      0.83      0.82       151
    relevant       0.42      0.39      0.40        49

    accuracy                           0.72       200
   macro avg       0.61      0.61      0.61       200
weighted avg       0.71      0.72      0.72       200



In [29]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=SEED)
dt.fit(X_train, y_train)

y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 1.0
Testing Accuracy score: 0.73


In [30]:
print(classification_report(y_test, y_pred_test, target_names=['not relevant', 'relevant']))

              precision    recall  f1-score   support

not relevant       0.80      0.86      0.83       151
    relevant       0.43      0.33      0.37        49

    accuracy                           0.73       200
   macro avg       0.61      0.59      0.60       200
weighted avg       0.71      0.73      0.72       200



In [31]:
from sklearn.ensemble import VotingClassifier

classifiers = [('Decision Tree', dt),
               ('Logistic Regression', lr),
                ('Naive Bayes', gnb)
              ]
vc = VotingClassifier(estimators=classifiers)
# Fit 'vc' to the traing set and predict test set labels
vc.fit(X_train, y_train)
y_pred_train=vc.predict(X_train)
y_pred_test = vc.predict(X_test)
print("Training Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))

Training Accuracy score: 1.0
Testing Accuracy score: 0.775
