In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# NLP
import nltk
import spacy

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from utils import DataCleaner, model_metrics

In [None]:
sp = spacy.load('en_core_web_sm')

#### LOAD AND CLEAN DATA

In [2]:
import os
path = os.getcwd() + "/data.csv"

In [4]:
""" LOAD DATA SET INTO PANDAS DATAFRAME """
data_set = pd.read_csv(path)[['TITLE', 'TAG']]
data_set = data_set.drop_duplicates()
data_set = data_set.groupby('TAG').filter(lambda x: len(x) > 1000)
data_set['TAG'].value_counts()

Pakistan      11423
World          9103
Sports         6004
Newspaper      5859
Business       4187
Name: TAG, dtype: int64

In [5]:
dc = DataCleaner()
data_set['TITLE'] = data_set['TITLE'] = data_set['TITLE'].apply(lambda x: dc.data_cleaning(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


#### PREPARE DATA FOR MODELING

In [6]:
tf  = TfidfVectorizer(max_features=5000)
train = tf.fit_transform(data_set['TITLE']).toarray()
test = LabelEncoder().fit_transform(data_set['TAG'])

train_x, test_x, train_y, test_y = train_test_split(train, test, test_size=0.2, random_state=0)

#### Modeling and Evaluation (NAIVE BAYES)

In [38]:
mlnb = MultinomialNB()
mlnb.fit(train_x, train_y)

MultinomialNB()

In [39]:
model_metrics(mlnb, test_x, test_y, train_y, train_x)

Unnamed: 0,test,train
accuracy,72.279934,75.963773
precision,67.084981,73.922726
recall,67.238677,71.651733
f1,65.432465,70.174809
sensitivity,86.482085,92.380576
specificity,46.551724,61.173533
score,72.279934,75.963773


In [40]:
pickle.dump(mlnb, open('model_multinomial_nb.pkl', 'wb'))

#### Modeling and Evaluation (DECISION TREE)

In [42]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_features="auto", criterion="entropy")
dt.fit(train_x, train_y)

DecisionTreeClassifier(criterion='entropy', max_features='auto')

In [43]:
pickle.dump(dt, open('model_dt.pkl', 'wb'))

In [44]:
model_metrics(dt, test_x, test_y, train_y, train_x)

Unnamed: 0,test,train
accuracy,57.490432,99.521531
precision,54.919856,99.291883
recall,54.341774,99.522666
f1,54.54836,99.403321
sensitivity,77.568493,100.0
specificity,67.981439,99.893775
score,57.490432,99.521531
