In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import time

In [2]:
df = pd.read_csv(r"C:\Users\Gopinadh\Documents\uci_news_aggregator.csv", error_bad_lines=False)
df.head(5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [3]:
documents = df[['TITLE', 'CATEGORY']] 
documents['index'] = documents.index
documents.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


422419

In [4]:
documents.head(5)

Unnamed: 0,TITLE,CATEGORY,index
0,"Fed official says weak data caused by weather,...",b,0
1,Fed's Charles Plosser sees high bar for change...,b,1
2,US open: Stocks fall after Fed official hints ...,b,2
3,"Fed risks falling 'behind the curve', Charles ...",b,3
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b,4


In [5]:
print(documents.groupby('CATEGORY').size())
print("unique targets: " +documents.CATEGORY.unique())

CATEGORY
b    115967
e    152469
m     45639
t    108344
dtype: int64
['unique targets: b' 'unique targets: t' 'unique targets: e'
 'unique targets: m']


In [6]:
documents['CATEGORY'] = df.CATEGORY.map({'b':0,'e':1,'m':2,'t':3})
outcomes = documents['CATEGORY']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


**Performing data preprocessing**


*   Tokenization - splits the text into sentences and sentences into words


*   Lower case and remove punctuation
*    remove words that have fewer than 3 characters


*   remove stopwords

*   Lemmatization - words are lemmatized, which is third person are changed to single person and verbs in future and past are changed into present.
*  Stemming - words are reduced to its stem/root.



In [7]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2018)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gopinadh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
#function to perform lemmatize and stem preprocessing steps on the data set.
stemmer = PorterStemmer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
  
def preprocess(text):
    clean_words = [lemmatize_stemming(token) for token in gensim.utils.simple_preprocess(text) if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3]
    return ' '.join(clean_words)

In [9]:
#Selecting documents to preview after preprocessing
doc_sample = documents[documents['index'] == 50000].values[0][0]
print('original question: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized question: ')
print(preprocess(doc_sample))

original question: 
['Girl', 'Who', 'Shaved', 'Head', 'for', 'Friend', 'with', 'Cancer', 'is', 'Allowed', 'Back', 'in', 'School']


 tokenized and lemmatized question: 
girl shave head friend cancer allow school


In [10]:
processed_docs = documents['TITLE'].map(preprocess)

In [11]:
processed_docs.head(5)

0    offici say weak data caus weather slow taper
1         charl plosser see high chang pace taper
2       open stock fall offici hint acceler taper
3                risk fall curv charl plosser say
4               plosser nasti weather curb growth
Name: TITLE, dtype: object

In [12]:
#Document term matrix
from sklearn.feature_extraction.text import CountVectorizer
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_docs,outcomes, random_state=42)

print('Number of rows in the total set: {}'.format(documents.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 422419
Number of rows in the training set: 316814
Number of rows in the test set: 105605


In [14]:
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)
#count_vector.get_feature_names()
# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report, roc_auc_score
rfc = RandomForestClassifier()
nb = MultinomialNB()

**Multinomial NB**

In [18]:
nb.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
nb_predicitions_train = nb.predict(training_data)
nb_predicitions_test = nb.predict(testing_data)

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix
acc_nb_train = accuracy_score(nb_predicitions_train,y_train)
print("accuracy_nb_training:",acc_nb_train)
acc_nb_test = accuracy_score(nb_predicitions_test,y_test)
print("accuracy_nb_testing:",acc_nb_test)

accuracy_nb_training: 0.9195237584197669
accuracy_nb_testing: 0.9116140334264476


In [21]:
target_names = ['class 0','class 1','class 2','class 3']
print(classification_report(nb_predicitions_test, y_test, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.89      0.89      0.89     29311
     class 1       0.95      0.96      0.95     37692
     class 2       0.90      0.90      0.90     11228
     class 3       0.89      0.88      0.89     27374

   micro avg       0.91      0.91      0.91    105605
   macro avg       0.91      0.91      0.91    105605
weighted avg       0.91      0.91      0.91    105605



**Logistic Regression**

In [54]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class = 'ovr')
lr.fit(training_data, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [55]:
lr_predicitions_train = lr.predict(training_data)
lr_predicitions_test = lr.predict(testing_data)

In [56]:
acc_lr_train = accuracy_score(lr_predicitions_train,y_train)
print("accuracy_lr_training:",acc_lr_train)
acc_lr_test = accuracy_score(lr_predicitions_test,y_test)
print("accuracy_lr_testing:",acc_lr_test)

accuracy_lr_training: 0.9463691629789087
accuracy_lr_testing: 0.9323706263907959


In [57]:
target_names = ['class 0','class 1','class 2','class 3']
print(classification_report(lr_predicitions_test, y_test, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.92      0.90      0.91     29817
     class 1       0.97      0.97      0.97     38086
     class 2       0.91      0.95      0.93     10851
     class 3       0.91      0.92      0.91     26851

   micro avg       0.93      0.93      0.93    105605
   macro avg       0.93      0.93      0.93    105605
weighted avg       0.93      0.93      0.93    105605



**Random Forest Classifier**

In [25]:
t0 = time.time()
rfc.fit(training_data, y_train)
t1 = time.time()
print("--- %s seconds ---" % (t1 - t0))



--- 487.11896800994873 seconds ---


In [28]:
rfc_predictions_train = rfc.predict(training_data)
rfc_predictions_test = rfc.predict(testing_data)

In [29]:
acc_rfc_train = accuracy_score(rfc_predictions_train,y_train)
print("accuracy_rfc_training:",acc_rfc_train)
acc_rfc_test = accuracy_score(rfc_predictions_test,y_test)
print("accuracy_rfc_testing:",acc_rfc_test)

accuracy_rfc_training: 0.9921247167107514
accuracy_rfc_testing: 0.9207139813455802


In [31]:
target_names = ['class 0','class 1','class 2','class 3']
print(classification_report(rfc_predictions_test, y_test, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.92      0.88      0.90     30337
     class 1       0.96      0.96      0.96     37967
     class 2       0.88      0.92      0.90     10855
     class 3       0.89      0.92      0.91     26446

   micro avg       0.92      0.92      0.92    105605
   macro avg       0.91      0.92      0.91    105605
weighted avg       0.92      0.92      0.92    105605



In [61]:
#params = {'max_depth': range(1,10), 'criterion': ['gini', 'entropy'], 'n_estimators': [10,15,20], 'min_samples_leaf': range(1,10), 'min_samples_split': [2,5,10]}
params = {
  "estimator__n_estimators": [10,15,20],
  "estimator__criterion": ['gini', 'entropy'],
  "estimator__max_depth" : range(1,10),
  "estimator__min_samples_leaf" : range(1,10),
  "estimator__min_samples_split" : [2, 5, 10],
}
#scorers = {'f1_score': make_scorer(f1_score, average=None)}
model_to_tune = OneVsRestClassifier(estimator = RandomForestClassifier(random_state=0))
grid_obj = GridSearchCV(model_to_tune, params, n_jobs=2)
t0 = time.time()
grid_fit = grid_obj.fit(training_data, y_train)
t1 = time.time()
print("--- %s seconds ---" % (t1 - t0))
best_clf = grid_fit.best_estimator_
print(grid_fit.best_params_)
best_clf



KeyboardInterrupt: 

In [62]:
unseen_document = 'The Pale Red Dot --Distant Oort Cloud Planet Discovered Beyond Known Edge'
bow_vector = preprocess(unseen_document)
unseen_testing_data = count_vector.transform(bow_vector)
x = nb.predict(unseen_testing_data)
y = lr.predict(unseen_testing_data)
z = rfc.predict(unseen_testing_data)
print("{},{},{}".format(x,y,z))

ValueError: Iterable over raw text documents expected, string object received.