# Capstone Modelling

In [1]:
# importing libraries
import numpy as np
import random
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Importing visualisation libraries
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

# NLP libraries
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Classifier libraries
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import roc_auc_score

## Loading datasets

In [2]:
# Loading train and validation datasets
y_train = pd.read_csv('./datasets/y_train.csv', index_col=0)
y_validate = pd.read_csv('./datasets/y_validate.csv', index_col=0)

clean_train_posts = pd.read_csv('./datasets/clean_train_posts.csv', index_col=0)['0'].to_list()
clean_validate_posts = pd.read_csv('./datasets/clean_validate_posts.csv', index_col=0)['0'].to_list()

In [3]:
y_train.head()

Unnamed: 0,target
375,1
1309,0
861,0
478,0
362,0


In [4]:
y_validate.head()

Unnamed: 0,target
2021,1
464,0
2027,1
853,0
359,1


In [5]:
len(clean_train_posts)

1856

In [6]:
len(clean_validate_posts)

232

In [7]:
# loading holdout dataset
X_holdout = pd.read_csv('./datasets/X_holdout.csv', index_col=0)
y_holdout = pd.read_csv('./datasets/y_holdout.csv', index_col=0)

In [8]:
X_holdout.head()

Unnamed: 0,name,ticker,dates,price_on_date,price_change_next_yr,sp_on_date,sp_price_change_next_year,reports,price_change_%,sp_price_change_%
1250,Hilton Worldwide Holdings Inc.,HLT,2018-02-14,84.63,-5.9,2698.63,54.4,DocumentUNITED STATES SECURITIES AND EXCHANGE ...,-0.069715,0.020158
1305,HUMANA INC,HUM,2017-02-17,199.83,64.72,2351.16,381.06,DocumentUNITED STATESSECURITIES AND EXCHANGE C...,0.323875,0.162073
178,AMERICAN ELECTRIC POWER CO INC,AEP,2016-02-24,54.51,5.52,1929.8,433.84,10-KUNITED STATESSECURITIES AND EXCHANGE COMMI...,0.101266,0.224811
1821,"NetApp, Inc.",NTAP,2016-06-22,22.83,12.65,2085.45,351.58,ntap-10k_20160429.htmUNITED STATESSECURITIES A...,0.554095,0.168587
592,"CISCO SYSTEMS, INC.",CSCO,2015-09-08,22.59,5.38,1969.41,211.89,10-KTable of ContentsUNITED STATES SECURITIES ...,0.238158,0.107591


In [9]:
y_holdout.head()

Unnamed: 0,target
1250,0
1305,1
178,0
1821,1
592,1


# Classifier 1: Naive Bayes
Baseline: Proportion of target variable: 53.4%

In [10]:
# Baseline accuracy score using proportion
y_validate['target'].value_counts(normalize=True)

1    0.534483
0    0.465517
Name: target, dtype: float64

In [11]:
# Instantiate pipeline 1 and 2
pipe1 = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

pipe2 = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('nb', MultinomialNB())
])


In [12]:
# Countvectorizer->Naive Bayes hyperparameters
pipe_params1 = {
    
    'cvec__max_features': [7500], # 10000
    'cvec__min_df': [2], #3
    'cvec__max_df': [0.9], #0.8, 0.95
    'cvec__ngram_range': [(1,1)], #(1,2)
    'nb__alpha' : [1] #0.9
    
    
}

pipe_params2 =  {
    
    'tfid__max_features': [10_000],
    'tfid__min_df': [3],
    'tfid__max_df': [0.95],
    'tfid__ngram_range': [(1,2)],
    'nb__alpha' : [0.9]
    
}

In [13]:
#Instantiating both gridsearchCV
gs1 = GridSearchCV(pipe1, param_grid=pipe_params1, cv=5)
gs2 = GridSearchCV(pipe2, param_grid=pipe_params2, cv=5)

### gs1

In [14]:
#Fitting gs1 countvectorizer->multinomial naive bayes
gs1.fit(clean_train_posts, np.ravel(y_train))

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [15]:
# best parameters for gs1
gs1.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 7500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'nb__alpha': 1}

In [16]:
# gs1 score of training data
gs1.score(clean_train_posts, y_train)

0.7015086206896551

In [34]:
# gs1 score of validation data
gs1.score(clean_validate_posts, y_validate)

0.49137931034482757

In [17]:
# prediction of gs1 model on test data
predict_1 = gs1.predict(clean_validate_posts)

In [18]:
# confusion matrix
confusion_matrix(predict_1, y_validate)

array([[44, 54],
       [64, 70]], dtype=int64)

In [19]:
tn, fp, fn, tp = confusion_matrix(predict_1, y_validate).ravel()

In [20]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 44
False Positives: 54
False Negatives: 64
True Positives: 70


In [21]:
# calculating precision and sensitivty of gs1
Precision = tp/(tp+fp)
Sensitivity = tp/(tp+fn)

print('Precision of gs1 is: %s' % round(Precision,2))
print('Sensitivity of gs1 is: %s' % round(Sensitivity,2))

Precision of gs1 is: 0.56
Sensitivity of gs1 is: 0.52


### gs2

In [22]:
#Fitting gs2 TfID-vectorizer->multinomial naive bayes
gs2.fit(clean_train_posts, np.ravel(y_train))

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfid',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [23]:
# best parameters for gs2
gs2.best_params_

{'nb__alpha': 0.9,
 'tfid__max_df': 0.95,
 'tfid__max_features': 10000,
 'tfid__min_df': 3,
 'tfid__ngram_range': (1, 2)}

In [24]:
# scoring gs2 on the training set
gs2.score(clean_train_posts, y_train)

0.6589439655172413

In [27]:
# scoring gs2 on the test set
gs2.score(clean_validate_posts, y_validate)

0.5258620689655172

In [28]:
# predicitng output on test set using gs2 model
predict_2 = gs2.predict(clean_validate_posts)

In [30]:
# confusion matrix of gs2
confusion_matrix(predict_2, y_validate)

array([[37, 39],
       [71, 85]], dtype=int64)

In [31]:
tn, fp, fn, tp = confusion_matrix(predict_2, y_validate).ravel()

In [32]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 37
False Positives: 39
False Negatives: 71
True Positives: 85


In [33]:
# calulcating precision and sensitivity of gs2
Precision = tp/(tp+fp)
Sensitivity = tp/(tp+fn)

print('Precision of gs1 is: %s' % round(Precision,2))
print('Sensitivity of gs1 is: %s' % round(Sensitivity,2))

Precision of gs1 is: 0.69
Sensitivity of gs1 is: 0.54
