In [None]:
# Importing libraries
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV, learning_curve)

%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
np.random.seed(42)

In [None]:
# English stop words from the NLTK corpus, materialised as a plain list so they
# can be handed to CountVectorizer(stop_words=...) in the pipelines below
stopwords_list = [word for word in stopwords.words('english')]
stopwords_list

In [None]:
# Helper function to display the evaluation metrics of the different models
def show_eval_scores(model, test_set, model_name):
    """Print accuracy, F1, precision and recall of a model on a test set.

    Parameters:
    -----------
    model: scikit-learn object
        The model to evaluate; its ``predict`` method is called on the
        ``'news'`` column of ``test_set``.
    test_set: pandas dataframe
        The evaluation data. The ``'news'`` column is fed to the model and
        the ``'label'`` column is used as the ground truth.
    model_name: string
        The name printed in the report header.
    """
    predicted = model.predict(test_set['news'])
    actual = test_set['label']

    # Every metric is computed before anything is printed, so a failing
    # metric never leaves a half-written report behind.
    metrics = (
        ('Accuracy is: {}', accuracy_score),
        ('F1 score is: {}', f1_score),
        ('Precision score is: {}', precision_score),
        ('Recall score is: {}', recall_score),
    )
    report = [template.format(scorer(actual, predicted)) for template, scorer in metrics]

    print('Report for ---> {}'.format(model_name))
    for line in report:
        print(line)

In [None]:
# Load the three pre-split CSV files (paths are relative to the working directory)
train_data, valid_data, test_data = map(
    pd.read_csv,
    ('../datasets/train.csv', '../datasets/valid.csv', '../datasets/test.csv'),
)

#### Viewing random rows of all the datasets

In [None]:
train_data.sample(5)

In [None]:
valid_data.sample(5)

In [None]:
test_data.sample(5)

In [None]:
# Report the (rows, columns) shape of each split with a single print call
print(
    'Train dataset size: {}'.format(train_data.shape),
    'Test dataset size: {}'.format(test_data.shape),
    'Valid dataset size: {}'.format(valid_data.shape),
    sep='\n',
)

Combining train_data and valid_data into a single training set: hyperparameters are tuned with GridSearchCV using 5-fold cross-validation, which creates its own validation folds, so a separate validation split is not needed.

In [None]:
# Stack the validation rows under the training rows and renumber the index
training_set = pd.concat(
    objs=[train_data, valid_data],
    axis=0,
    ignore_index=True,
)
print('Training set size: {}'.format(training_set.shape))
training_set.sample(5)

Creating a CountVectorizer object and analyzing the training set
Bag-of-Words


In [None]:
# Default CountVectorizer: learn the vocabulary from the training text and
# build the document-term count matrix in a single fit_transform call
countV = CountVectorizer()
train_count = countV.fit_transform(training_set['news'].values)

In [None]:
# vocabulary_ maps each token to its column index. The full mapping has one
# entry per distinct token and can be very large, so only a small preview is
# rendered instead of dumping the whole dictionary into the cell output.
dict(list(countV.vocabulary_.items())[:10])

In [None]:
# Vocabulary size. CountVectorizer.get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; the fitted vocabulary_ mapping holds one
# entry per feature, so its length is the same number on every version.
len(countV.vocabulary_)

#### Building and tuning Logistic Regression pipeline 

In [None]:
# lr_pipeline = Pipeline([
#     ('lrCV', CountVectorizer(stop_words=stopwords_list)),
#     ('lr_clf', LogisticRegression(random_state=42, n_jobs=-1))
# ])

In [None]:
# param_grid = [
#     {
#         'lrCV__lowercase': [True, False],
#         'lrCV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'lr_clf__C': [0.0001, 0.00005, 0.00001]
#     }
# ]

# lr_gs = GridSearchCV(lr_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# lr_gs.fit(training_set['news'], training_set['label'])

In [None]:
# lr_gs.best_params_

In [None]:
# lr_gs.best_score_

In [None]:
# Logistic-regression pipeline. Every setting below lies inside the
# commented-out grid search above (presumably its best parameters - TODO confirm).
lr_pipeline = Pipeline(steps=[
    (
        'lrCV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=True,
            ngram_range=(1, 1),
        ),
    ),
    (
        'lr_clf',
        LogisticRegression(C=0.0001, random_state=42, n_jobs=-1),
    ),
])

In [None]:
lr_pipeline.fit(training_set['news'], training_set['label'])

In [None]:
show_eval_scores(lr_pipeline, test_data, 'Logistic Regression Count Vectorizer')

#### Building and tuning Naive Bayes pipeline

In [None]:
# nb_pipeline = Pipeline([
#     ('nb_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('nb_clf', MultinomialNB())
# ])

In [None]:
# param_grid = {
#     'nb_clf__alpha': [i/10.0 for i in range(60, 71)],
#     'nb_CV__lowercase': [True, False],
#     'nb_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)]
# }

# nb_gs = GridSearchCV(nb_pipeline, param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=1)
# nb_gs.fit(training_set['news'], training_set['label'])

In [None]:
# nb_gs only exists when the grid-search cell above is uncommented and run;
# keep this disabled alongside it so a fresh "Run All" does not hit a NameError
# nb_gs.best_params_

In [None]:
# nb_gs only exists when the grid-search cell above is uncommented and run;
# keep this disabled alongside it so a fresh "Run All" does not hit a NameError
# nb_gs.best_score_

In [None]:
# Naive Bayes pipeline. Every setting below lies inside the commented-out
# grid search above (presumably its best parameters - TODO confirm).
nb_pipeline = Pipeline(steps=[
    (
        'nb_CV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=True,
            ngram_range=(1, 4),
        ),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=6.8),
    ),
])

In [None]:
nb_pipeline.fit(training_set['news'], training_set['label'])

In [None]:
show_eval_scores(nb_pipeline, test_data, 'Naive Bayes Count Vectorizer')

#### Building and tuning SVM classifier pipeline

In [None]:
# svm_pipeline = Pipeline([
#     ('svm_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('svm_clf', SVC(random_state=42))
# ])

In [None]:
# param_grid = [
#     {
#         'svm_CV__lowercase': [True, False],
#         'svm_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'svm_clf__kernel': ['poly'],
#         'svm_clf__degree': [1, 2, 3]
#     },
#     {
#         'svm_CV__lowercase': [True, False],
#         'svm_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'svm_clf__kernel': ['rbf'],
#         'svm_clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
#     }
# ]

# svm_gs = GridSearchCV(svm_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# svm_gs.fit(training_set['news'], training_set['label'])

In [None]:
# svm_gs.best_params_

In [None]:
# svm_gs.best_score_

In [None]:
# SVM pipeline. Every setting below lies inside the commented-out grid search
# above (presumably its best parameters - TODO confirm).
svm_pipeline = Pipeline(steps=[
    (
        'svm_CV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=False,
            ngram_range=(1, 1),
        ),
    ),
    (
        'svm_clf',
        SVC(kernel='rbf', gamma=1.0, random_state=42),
    ),
])

In [None]:
svm_pipeline.fit(training_set['news'], training_set['label'])

In [None]:
show_eval_scores(svm_pipeline, test_data, 'SVM Classifier Count Vectorizer')

#### Building and Tuning Random Forest Classifier pipeline 

In [None]:
# rf_pipeline = Pipeline([
#     ('rf_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('rf_clf', RandomForestClassifier(n_jobs=-1, random_state=42))
# ])

In [None]:
# param_grid = {
#     'rf_CV__lowercase': [True, False],
#     'rf_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'rf_clf__n_estimators': [200, 300, 400, 500],
#     'rf_clf__max_depth': [i for i in range(8, 13)],
#     'rf_clf__max_features': ['auto', 'sqrt', 'log2']
# }
# rf_gs = GridSearchCV(rf_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# rf_gs.fit(training_set['news'], training_set['label'])

In [None]:
# rf_gs.best_params_

In [None]:
# rf_gs.best_score_

In [None]:
# Random-forest pipeline. Every setting below lies inside the commented-out
# grid search above (presumably its best parameters - TODO confirm).
rf_pipeline = Pipeline(steps=[
    (
        'rf_CV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=False,
            ngram_range=(1, 1),
        ),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=300,
            max_depth=12,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

In [None]:
rf_pipeline.fit(training_set['news'], training_set['label'])

In [None]:
show_eval_scores(rf_pipeline, test_data, 'Random Forest Classifier Count Vectorizer')

#### Building a Voting Classifier using the above created models 

In [None]:
# Fresh, unfitted random-forest pipeline for the ensemble; same configuration
# as rf_pipeline
rf_voting_pipeline = Pipeline(steps=[
    (
        'rf_CV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=False,
            ngram_range=(1, 1),
        ),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=300,
            max_depth=12,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

In [None]:
# Fresh SVM pipeline for the ensemble. Same configuration as svm_pipeline
# except probability=True, which SVC needs in order to provide the class
# probabilities that soft voting combines.
svm_voting_pipeline = Pipeline(steps=[
    (
        'svm_CV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=False,
            ngram_range=(1, 1),
        ),
    ),
    (
        'svm_clf',
        SVC(kernel='rbf', gamma=1.0, probability=True, random_state=42),
    ),
])

In [None]:
# Fresh, unfitted Naive Bayes pipeline for the ensemble; same configuration
# as nb_pipeline
nb_voting_pipeline = Pipeline(steps=[
    (
        'nb_CV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=True,
            ngram_range=(1, 4),
        ),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=6.8),
    ),
])

In [None]:
# Fresh, unfitted logistic-regression pipeline for the ensemble; same
# configuration as lr_pipeline
lr_voting_pipeline = Pipeline(steps=[
    (
        'lrCV',
        CountVectorizer(
            stop_words=stopwords_list,
            lowercase=True,
            ngram_range=(1, 1),
        ),
    ),
    (
        'lr_clf',
        LogisticRegression(C=0.0001, random_state=42, n_jobs=-1),
    ),
])

In [None]:
# Soft voting combines the class probabilities predicted by the four pipelines
voting_classifier = VotingClassifier(
    estimators=[
        ('lr', lr_voting_pipeline),
        ('nb', nb_voting_pipeline),
        ('svm', svm_voting_pipeline),
        ('rf', rf_voting_pipeline),
    ],
    voting='soft',
    n_jobs=-1,
)

In [None]:
voting_classifier.fit(training_set['news'], training_set['label'])

In [None]:
show_eval_scores(voting_classifier, test_data, 'Voting Classifier(soft) Count Vectorizer')

#### Saving the voting classifier model for future use

In [None]:
# Persist the fitted ensemble. The context manager guarantees the file handle
# is flushed and closed even if pickling fails part-way through; the target
# directory is created first so a fresh checkout does not fail on a missing folder.
os.makedirs('../models', exist_ok=True)
with open(os.path.join('../models', 'voting_classifier_count_vectorizer.pkl'), 'wb') as model_file:
    pickle.dump(voting_classifier, model_file)

#### Building a Boosting Classifier Model

In [None]:
# XGBoost cannot consume raw text, so the classifier is wrapped in a pipeline
# that first turns the news strings into bag-of-words counts. This lets the
# later fit / evaluation cells keep passing the 'news' column directly, the
# same way every other model in this notebook is used.
# NOTE(review): the vectorizer and booster settings here are untuned defaults.
boosting_classfier = Pipeline([
    ('xgb_CV', CountVectorizer(stop_words=stopwords_list)),
    ('xgb_clf', xgb.XGBClassifier(objective='binary:logistic', random_state=42))
])

In [None]:
boosting_classfier.fit(training_set['news'], training_set['label'])

In [None]:
show_eval_scores(boosting_classfier, test_data, 'Boosting Classifier(XGBOOST) Count Vectorizer')

#### Saving the boosting classifier model for future use

In [None]:
# Persist the fitted boosting model. The context manager guarantees the file
# handle is flushed and closed even if pickling fails part-way through; the
# target directory is created first so a fresh checkout does not fail on a
# missing folder.
os.makedirs('../models', exist_ok=True)
with open(os.path.join('../models', 'boosting_classifier.pkl'), 'wb') as model_file:
    pickle.dump(boosting_classfier, model_file)