In [55]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [56]:
df = pd.read_csv(r"C:\Users\Gopinadh\Documents\uci_news_aggregator.csv", error_bad_lines=False)
df.head(5)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [57]:
documents = df[['TITLE', 'CATEGORY']] 
documents['index'] = documents.index
documents.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


422419

In [58]:
documents.head(5)

Unnamed: 0,TITLE,CATEGORY,index
0,"Fed official says weak data caused by weather,...",b,0
1,Fed's Charles Plosser sees high bar for change...,b,1
2,US open: Stocks fall after Fed official hints ...,b,2
3,"Fed risks falling 'behind the curve', Charles ...",b,3
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b,4


In [59]:
print(documents.groupby('CATEGORY').size())
print("unique targets: " +documents.CATEGORY.unique())

CATEGORY
b    115967
e    152469
m     45639
t    108344
dtype: int64
['unique targets: b' 'unique targets: t' 'unique targets: e'
 'unique targets: m']


In [60]:
documents['CATEGORY'] = df.CATEGORY.map({'b':0,'e':1,'m':2,'t':3})
outcomes = documents['CATEGORY']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


**Performing data preprocessing**


*   Tokenization - splits the text into sentences and sentences into words


*   Lower case and remove punctuation
*    remove words that have fewer than 3 characters


*   remove stopwords

*   Lemmatization - words are lemmatized, which is third person are changed to single person and verbs in future and past are changed into present.
*  Stemming - words are reduced to its stem/root.



In [61]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2018)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gopinadh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
#function to perform lemmatize and stem preprocessing steps on the data set.
stemmer = PorterStemmer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
  
def preprocess(text):
    clean_words = [lemmatize_stemming(token) for token in gensim.utils.simple_preprocess(text) if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3]
    return ' '.join(clean_words)

In [63]:
#Selecting documents to preview after preprocessing
doc_sample = documents[documents['index'] == 50000].values[0][0]
print('original question: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized question: ')
print(preprocess(doc_sample))

original question: 
['Girl', 'Who', 'Shaved', 'Head', 'for', 'Friend', 'with', 'Cancer', 'is', 'Allowed', 'Back', 'in', 'School']


 tokenized and lemmatized question: 
girl shave head friend cancer allow school


In [64]:
processed_docs = documents['TITLE'].map(preprocess)

In [65]:
processed_docs.head(5)

0    offici say weak data caus weather slow taper
1         charl plosser see high chang pace taper
2       open stock fall offici hint acceler taper
3                risk fall curv charl plosser say
4               plosser nasti weather curb growth
Name: TITLE, dtype: object

In [66]:
#Document term matrix
from sklearn.feature_extraction.text import CountVectorizer
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_docs,outcomes, random_state=42)

print('Number of rows in the total set: {}'.format(documents.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 422419
Number of rows in the training set: 316814
Number of rows in the test set: 105605


In [69]:
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)
#count_vector.get_feature_names()
# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report
rfc = RandomForestClassifier()

In [72]:
rfc.fit(training_data, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
prediction_train = rfc.predict(training_data)
prediction_test = rfc.predict(testing_data)

In [74]:
from sklearn.metrics import accuracy_score, confusion_matrix
acc_score_train = accuracy_score(prediction_train,y_train)
print("accuracy_score_training:",acc_score_train)
acc_score_test = accuracy_score(prediction_test,y_test)
print("accuracy_score_testing:",acc_score_test)
#f1_score_train = f1_score(prediction_train,y_train,average=None)
#print("f1_score_training:",f1_score_train)
#f1_score_test = f1_score(prediction_test,y_test,average=None)
#print("f1_score_testing:",f1_score_test)
cm_score = confusion_matrix(prediction,y_test)
print("cm_score: \n {}".format(cm_score))

accuracy_score_training: 0.9921247167107514
accuracy_score_testing: 0.9207139813455802
cm_score: 
 [[26700   944   735  1958]
 [  618 36282   382   685]
 [  370   261  9968   256]
 [ 1429   501   234 24282]]


In [75]:
target_names = ['class 0','class 1','class 2','class 3']
print(classification_report(prediction_test, y_test, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.92      0.88      0.90     30337
     class 1       0.96      0.96      0.96     37967
     class 2       0.88      0.92      0.90     10855
     class 3       0.89      0.92      0.91     26446

   micro avg       0.92      0.92      0.92    105605
   macro avg       0.91      0.92      0.91    105605
weighted avg       0.92      0.92      0.92    105605



In [None]:
params = {'max_depth': range(1,10), 'criterion': ['gini', 'entropy'], 'n_estimators': [5,10,15,20], 'min_samples_leaf': range(1,10), 'min_samples_split': [2,5,10]}
#scorer = make_scorer(f1_score, average=None)
grid_obj = GridSearchCV(rfc,params)
grid_fit = grid_obj.fit(training_data, y_train)
best_clf = grid_fit.best_estimator_
best_clf



In [None]:
unseen_document = 'The Pale Red Dot --Distant Oort Cloud Planet Discovered Beyond Known Edge'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))