In [None]:
# Importing libraries
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV, learning_curve)
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)

# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Installs a blanket 'ignore' filter: every warning category is silenced for
# the rest of the session, including library deprecation warnings.
warnings.filterwarnings('ignore')

In [None]:
# Fix the state of NumPy's global random generator so draws made through it
# are repeatable after a kernel restart.
np.random.seed(seed=42)

In [None]:
# Creating a list of stopwords
try:
    stopwords_list = list(stopwords.words('english'))
except LookupError:
    # The stopwords corpus is downloaded separately from the nltk package.
    # Fetch it once so the notebook also runs on a fresh environment, then
    # retry the lookup.
    nltk.download('stopwords')
    stopwords_list = list(stopwords.words('english'))
stopwords_list

In [None]:
# Helper function to display the evaluation metrics of the different models
def show_eval_scores(model, test_set, model_name, text_col='news', label_col='label'):
    """Print accuracy, F1, precision and recall of a fitted model on a dataset.

    Parameters:
    -----------
    model: scikit-learn object
        The fitted model (or pipeline) whose scores are to be shown. Its
        ``predict`` method is called on the text column of ``test_set``.
    test_set: pandas dataframe
        The dataset on which the scores of the model are to be shown.
    model_name: string
        The name of the model, used in the report header.
    text_col: string, default 'news'
        Name of the column of ``test_set`` that is passed to ``model.predict``.
    label_col: string, default 'label'
        Name of the column of ``test_set`` holding the true labels.

    Returns:
    --------
    None. The report is written to standard output.

    Notes:
    ------
    The F1, precision and recall scorers are called with their default
    arguments. NOTE(review): those defaults presumably expect binary labels;
    confirm against the dataset before reusing this helper elsewhere.
    """
    # Predict once and reuse the predictions for every metric.
    y_pred = model.predict(test_set[text_col])
    y_true = test_set[label_col]

    report_lines = [
        ('Accuracy is: {}', accuracy_score(y_true, y_pred)),
        ('F1 score is: {}', f1_score(y_true, y_pred)),
        ('Precision score is: {}', precision_score(y_true, y_pred)),
        ('Recall score is: {}', recall_score(y_true, y_pred)),
    ]

    print('Report for ---> {}'.format(model_name))
    for template, value in report_lines:
        print(template.format(value))

In [None]:
# Importing the datasets
# One CSV per split, read in train / valid / test order.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/train.csv',
        '../datasets/valid.csv',
        '../datasets/test.csv',
    )
)

Viewing random rows of all the datasets

In [None]:
# Five randomly drawn rows of the training split.
train_data.sample(n=5)

In [None]:
# Five randomly drawn rows of the validation split.
valid_data.sample(n=5)

In [None]:
# Five randomly drawn rows of the test split.
test_data.sample(n=5)

In [None]:
# Report the (rows, columns) shape of each split, one per line.
print(
    'Train dataset size: {}'.format(train_data.shape),
    'Valid dataset size: {}'.format(valid_data.shape),
    'Test dataset size: {}'.format(test_data.shape),
    sep='\n',
)

Combining `train_data` and `valid_data` into a single training set: the hyperparameters of the different models are tuned with GridSearchCV using 5-fold cross-validation, so a separate validation split is not needed.

In [None]:
# Stack the train and validation rows and renumber the index from zero.
training_set = pd.concat(
    objs=[train_data, valid_data],
    axis=0,
    ignore_index=True,
)
print('Training set size: {}'.format(training_set.shape))
training_set.sample(n=5)

Creating a TfidfVectorizer object and analyzing the training set

In [None]:
# Stand-alone vectorizer used only to inspect the vocabulary; the model
# pipelines below build their own vectorizers.
tfidf_V = TfidfVectorizer(
    stop_words=stopwords_list,
    use_idf=True,
    smooth_idf=True,
)
# Learn the vocabulary from the training text and vectorize it in one pass.
train_count = tfidf_V.fit_transform(
    training_set['news'].values
)

In [None]:
# The vocabulary maps every learned term to its column index; printing all of
# it floods the cell output, so preview only the first few entries.
dict(list(tfidf_V.vocabulary_.items())[:10])

In [None]:
# Number of learned features. `get_feature_names()` was removed from recent
# scikit-learn releases; the fitted vocabulary has one entry per feature and
# exists on every version.
len(tfidf_V.vocabulary_)

#### Building and tuning Logistic Regression pipeline 

In [None]:
# Baseline TF-IDF + logistic regression pipeline. It is the estimator of the
# (commented-out) grid search below and is redefined with fixed settings
# further down.
lr_pipeline = Pipeline(steps=[
    (
        'lr_TF',
        TfidfVectorizer(
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'lr_clf',
        LogisticRegression(random_state=42, n_jobs=-1),
    ),
])

In [None]:
# param_grid = {
#     'lr_TF__lowercase': [True, False],
#     'lr_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'lr_clf__C': [i/10.0 for i in range(10, 21)]
# }

# lr_gs = GridSearchCV(lr_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# lr_gs.fit(training_set['news'], training_set['label'])

In [None]:
# lr_gs.best_score_

In [None]:
# lr_gs.best_params_

In [None]:
# Logistic regression pipeline with fixed hyperparameters. The values all lie
# inside the commented-out search grid above (presumably its best_params_).
# NOTE(review): with lowercase=False the stop-word list is matched
# case-sensitively -- confirm that keeping capitalised stop words is intended.
lr_pipeline = Pipeline(steps=[
    (
        'lr_TF',
        TfidfVectorizer(
            lowercase=False,
            ngram_range=(1, 5),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'lr_clf',
        LogisticRegression(C=1.0, random_state=42, n_jobs=-1),
    ),
])

In [None]:
# Train on the combined train + validation rows (raw text in, labels out).
lr_pipeline.fit(
    X=training_set['news'],
    y=training_set['label'],
)

In [None]:
# Score the fitted logistic regression pipeline on the held-out test split.
show_eval_scores(
    model=lr_pipeline,
    test_set=test_data,
    model_name='Logistic Regression TFIDF Vectorizer',
)

#### Building and tuning Naive Bayes pipeline 

In [None]:
# nb_pipeline = Pipeline([
#     ('nb_TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
#     ('nb_clf', MultinomialNB())
# ])

In [None]:
# param_grid = {
#     'nb_TF__lowercase': [True, False],
#     'nb_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'nb_clf__alpha': [i/10.0 for i in range(20, 31)]
# }

# nb_gs = GridSearchCV(nb_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# nb_gs.fit(training_set['news'], training_set['label'])

In [None]:
# nb_gs.best_score_

In [None]:
# nb_gs.best_params_

In [None]:
# Multinomial naive Bayes pipeline with fixed hyperparameters. The values all
# lie inside the commented-out search grid above (presumably its best_params_).
nb_pipeline = Pipeline(steps=[
    (
        'nb_TF',
        TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=2.0),
    ),
])

In [None]:
# Train on the combined train + validation rows (raw text in, labels out).
nb_pipeline.fit(
    X=training_set['news'],
    y=training_set['label'],
)

In [None]:
# Score the fitted naive Bayes pipeline on the held-out test split.
show_eval_scores(
    model=nb_pipeline,
    test_set=test_data,
    model_name='Naive Bayes TFIDF Vectorizer',
)

#### Building and tuning SVM classifier pipeline

In [None]:
# svm_pipeline = Pipeline([
#     ('svm_TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
#     ('svm_clf', SVC(random_state=42))
# ])

In [None]:
# param_grid = [
#     {
#         'svm_TF__lowercase': [True, False],
#         'svm_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#         'svm_clf__kernel': ['poly'],
#         'svm_clf__degree': [1, 2, 3]
#     },
#     {
#         'svm_TF__lowercase': [True, False],
#         'svm_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#         'svm_clf__kernel': ['rbf'],
#         'svm_clf__gamma': [i/100.0 for i in range(10, 21)]
#     }
# ]

# svm_gs = GridSearchCV(svm_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# svm_gs.fit(training_set['news'], training_set['label'])

In [None]:
# svm_gs.best_score_

In [None]:
# svm_gs.best_params_

In [None]:
# RBF-kernel SVM pipeline with fixed hyperparameters. The values all lie
# inside the commented-out search grid above (presumably its best_params_).
svm_pipeline = Pipeline(steps=[
    (
        'svm_TF',
        TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'svm_clf',
        SVC(gamma=0.2, kernel='rbf', random_state=42),
    ),
])

In [None]:
# Train on the combined train + validation rows (raw text in, labels out).
svm_pipeline.fit(
    X=training_set['news'],
    y=training_set['label'],
)

In [None]:
# Score the fitted SVM pipeline on the held-out test split.
show_eval_scores(
    model=svm_pipeline,
    test_set=test_data,
    model_name='SVM Classifier TFIDF Vectorizer',
)

#### Building and tuning Random Forest Classifier pipeline 

In [None]:
# rf_pipeline = Pipeline([
#     ('rf_TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
#     ('rf_clf', RandomForestClassifier(random_state=42, n_jobs=-1))
# ])

In [None]:
# param_grid = {
#     'rf_TF__lowercase': [True, False],
#     'rf_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'rf_clf__n_estimators': [100, 200, 300, 400, 500],
#     'rf_clf__max_depth': [i for i in range(8, 16)],
#     'rf_clf__max_features': ['auto', 'sqrt', 'log2']
# }

# rf_gs = GridSearchCV(rf_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# rf_gs.fit(training_set['news'], training_set['label'])

In [None]:
# rf_gs.best_score_

In [None]:
# rf_gs.best_params_

In [None]:
# Random forest pipeline with fixed hyperparameters. The values all lie inside
# the commented-out search grid above (presumably its best_params_).
rf_pipeline = Pipeline(steps=[
    (
        'rf_TF',
        TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [None]:
# Train on the combined train + validation rows (raw text in, labels out).
rf_pipeline.fit(
    X=training_set['news'],
    y=training_set['label'],
)

In [None]:
# Score the fitted random forest pipeline on the held-out test split.
show_eval_scores(
    model=rf_pipeline,
    test_set=test_data,
    model_name='Random Forest Classifier TFIDF Vectorizer',
)

#### Building a Voting Classifier using the above created models 

In [None]:
# Unfitted logistic regression member for the ensemble, configured exactly
# like lr_pipeline above.
# NOTE(review): with lowercase=False the stop-word list is matched
# case-sensitively -- confirm that keeping capitalised stop words is intended.
lr_voting_pipeline = Pipeline(steps=[
    (
        'lr_TF',
        TfidfVectorizer(
            lowercase=False,
            ngram_range=(1, 5),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'lr_clf',
        LogisticRegression(C=1.0, random_state=42, n_jobs=-1),
    ),
])

In [None]:
# Unfitted naive Bayes member for the ensemble, configured exactly like
# nb_pipeline above.
nb_voting_pipeline = Pipeline(steps=[
    (
        'nb_TF',
        TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=2.0),
    ),
])

In [None]:
# Unfitted SVM member for the ensemble. It differs from svm_pipeline above
# only by probability=True: soft voting averages class probabilities, and SVC
# exposes them only when probability estimation is switched on.
svm_voting_pipeline = Pipeline(steps=[
    (
        'svm_TF',
        TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'svm_clf',
        SVC(
            gamma=0.2,
            kernel='rbf',
            random_state=42,
            probability=True,
        ),
    ),
])

In [None]:
# Unfitted random forest member for the ensemble, configured exactly like
# rf_pipeline above.
rf_voting_pipeline = Pipeline(steps=[
    (
        'rf_TF',
        TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),
            stop_words=stopwords_list,
            use_idf=True,
            smooth_idf=True,
        ),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [None]:
# Soft-voting ensemble over the four text pipelines; each member vectorizes
# the raw text with its own TF-IDF settings.
voting_classifier = VotingClassifier(
    estimators=[
        ('lr', lr_voting_pipeline),
        ('nb', nb_voting_pipeline),
        ('svm', svm_voting_pipeline),
        ('rf', rf_voting_pipeline),
    ],
    voting='soft',
    n_jobs=-1,
)

In [None]:
# Train every ensemble member on the combined train + validation rows.
voting_classifier.fit(
    X=training_set['news'],
    y=training_set['label'],
)

In [None]:
# Score the fitted ensemble on the held-out test split.
show_eval_scores(
    model=voting_classifier,
    test_set=test_data,
    model_name='Voting Classifier(soft) TFIDF Vectorizer',
)

#### Saving the voting classifier for future use

In [None]:
# Make sure the target directory exists so a long training run is not lost to
# a missing folder at the very last step.
os.makedirs('../models', exist_ok=True)

# Use a context manager so the file handle is flushed and closed even if
# pickling fails part-way through.
with open(os.path.join('../models', 'voting_classifier_tfidf_vectorizer.pkl'), 'wb') as model_file:
    pickle.dump(voting_classifier, model_file)