# Group 27
# Dataset-1
##### fetch_20newsgroups

##### multiclassification

# Import Packages

In [None]:
import numpy as np
import pandas as pd
import nltk
import os
import glob
import io
import gensim
import pickle
import re
import string
import spacy
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from collections import defaultdict 
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report
from sklearn import metrics
from time import time
import logging
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# Load the Data

In [None]:
# Newsgroup names requested from the loader, one entry per line.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'misc.forsale',
    'talk.politics.misc',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.religion.misc',
    'alt.atheism',
    'soc.religion.christian',
]

In [None]:
# Both splits are fetched with the same options; only `subset` differs.
_fetch_options = dict(
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

twenty_train = fetch_20newsgroups(subset='train', **_fetch_options)
twenty_test = fetch_20newsgroups(subset='test', **_fetch_options)

# Preprocessing

In [None]:
def preprocessing(file):
    """Clean an iterable of raw documents before vectorization.

    Each item is converted with ``str()``; every ``<br /><br />`` pair is
    replaced by a space; every ASCII punctuation character except the hyphen
    is removed; and the result is lower-cased.

    Args:
        file: Iterable of documents (despite the name, not a file handle).

    Returns:
        list[str]: One cleaned string per input item, in input order.
    """
    exclude = string.punctuation.replace("-", "")
    # re.escape is required here: string.punctuation contains "\", "]" and
    # "^", which are special inside a character class.  Unescaped, the "\]"
    # pair was read as an escaped "]" and backslashes were never removed.
    punctuation = re.compile("[{}]".format(re.escape(exclude)))
    # Compiled once instead of being looked up for every document.
    break_pairs = re.compile(r"(<br\s*/><br\s*/>)")

    return [
        punctuation.sub("", break_pairs.sub(" ", str(data))).lower()
        for data in file
    ]

In [None]:
# Replace the raw documents of each split with their cleaned versions
# (training split first, then test split).
for _bunch in (twenty_train, twenty_test):
    _bunch.data = preprocessing(_bunch.data)

In [None]:
# Persist the cleaned corpora, writing each document followed by "\n".
# A document that itself contains newlines therefore spans several lines.
# The explicit encoding keeps the dump independent of the platform default
# codec, which may be unable to encode some characters in the posts.
with open('new_text_prep', 'w', encoding='utf-8') as f:
    f.writelines('%s\n' % p for p in twenty_train.data)

with open('new_text_prep1', 'w', encoding='utf-8') as f:
    f.writelines('%s\n' % p for p in twenty_test.data)

# Data Splitting (Training and Validation)

In [None]:
# Hold out 20% of the training documents for validation.  The split is
# seeded like every other randomized step in this file so reruns produce
# the same partition.
x_train, x_valid, y_train, y_valid = train_test_split(
    twenty_train.data,
    twenty_train.target,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# The official test split is used as-is.
x_test = twenty_test.data
y_test = twenty_test.target


# Vectorization for RandomSearch CV

In [None]:
# Word-level TF-IDF over unigrams and bigrams with sublinear tf scaling.
# min_df=1 keeps every term, exactly as the previous integer min_df=0 did,
# but is accepted by scikit-learn's parameter validation (integer min_df
# must be >= 1 in current releases).
tf_idf_vectorizer = TfidfVectorizer(min_df=1, analyzer='word', ngram_range=(1, 2), sublinear_tf=True)
# The vocabulary and idf weights are learned from the training split only;
# the validation and test splits are projected onto them.
Train_set = tf_idf_vectorizer.fit_transform(x_train)
Valid_set = tf_idf_vectorizer.transform(x_valid)
Test_set = tf_idf_vectorizer.transform(x_test)

# AdaBoost RandomSearchCV

In [None]:
# Seeded AdaBoost estimator; its default hyper-parameters are printed as a
# reference before the search.
# NOTE(review): the targets are newsgroup class indices, yet this is a
# regressor. AdaBoostClassifier (already imported) looks like the intended
# estimator -- confirm before trusting the search results.
ada = AdaBoostRegressor(random_state = 42)
print('Parameters currently in use:\n')
# Previously `rf.get_params()`: `rf` is only created in the random-forest
# section, so this cell raised NameError when run in order.
pprint(ada.get_params())

In [None]:
# Candidate values for the AdaBoost randomized search.
learning_rate = [0.01, 0.05, 0.1, 0.3, 1]
# Ten evenly spaced ensemble sizes from 100 to 2000, truncated to ints.
n_estimators = list(map(int, np.linspace(start=100, stop=2000, num=10)))
# Defined with the other candidates but not part of random_grid below.
loss = ['linear', 'square', 'exponential']

random_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
pprint(random_grid)

In [None]:
# Randomized hyper-parameter search over `random_grid` with 3-fold CV.
ada_random = RandomizedSearchCV(
    estimator=ada,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

# Run the search on the vectorized training split, then show the winner.
ada_random.fit(Train_set, y_train)
ada_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print error statistics for ``model`` and return ``100 - MAPE``.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_features: Features passed unchanged to ``model.predict``.
        test_labels: Ground-truth targets (converted with ``np.asarray``).

    Returns:
        float: ``100 - MAPE``, where MAPE is the mean absolute percentage
        error over the samples whose label is non-zero.  ``nan`` when every
        label is zero.
    """
    test_labels = np.asarray(test_labels)
    predictions = model.predict(test_features)
    errors = np.abs(predictions - test_labels)

    # A percentage error is undefined for a zero target.  Dividing by such
    # labels produced inf/nan for the whole score, so those samples are left
    # out of the percentage (they still count towards the average error).
    # With no zero labels the result is identical to the previous formula.
    nonzero = test_labels != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / test_labels[nonzero])
    else:
        mape = float('nan')
    accuracy = 100 - mape

    # NOTE(review): this is a regression-style score; for class labels a
    # classification accuracy may be what is actually wanted -- confirm.
    print('Model Performance')
    # The former " degrees" unit did not apply to these targets.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Fixed-parameter AdaBoost baseline to compare against the tuned search.
base_model = AdaBoostRegressor(n_estimators = 200, learning_rate= 0.1, random_state = 42)
# Previously fitted on `train_features` / `train_labels`, which are never
# defined in this file; the vectorized training split is Train_set / y_train.
base_model.fit(Train_set, y_train)
# Score on the vectorized test matrix: the estimators were trained on TF-IDF
# features and cannot predict from the raw strings held in x_test.
base_accuracy = evaluate(base_model, Test_set, y_test)
# The original call scored the search object here; keep that measurement
# under its own name so the baseline score is not overwritten by it.
random_accuracy = evaluate(ada_random, Test_set, y_test)

# GridSearch for AdaBoost

In [None]:
# Candidate values for the exhaustive AdaBoost search.
param_grid = dict(
    learning_rate=[0.1, 0.3],
    loss=['linear', 'square', 'exponential'],
    n_estimators=[200, 300, 1000],
)
# Fresh, unseeded estimator for the grid search.
ada = AdaBoostRegressor()
# 5-fold exhaustive search with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=ada,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the exhaustive search on the vectorized training split (Train_set,
# y_train), then show the best parameter combination it found.
grid_search.fit(Train_set, y_train)
grid_search.best_params_

In [None]:
# Score the best estimator found by the grid search.
best_grid = grid_search.best_estimator_
# Use the vectorized Test_set: the estimator was fitted on TF-IDF features
# and cannot predict from the raw strings held in x_test.
grid_accuracy = evaluate(best_grid, Test_set, y_test)

# RandomForest RandomSearchCV

In [None]:
# Seeded random forest; its default hyper-parameters are printed as a
# reference before the search.
# NOTE(review): the targets are class indices, yet this is a regressor;
# RandomForestClassifier (already imported) may be intended -- confirm.
rf = RandomForestRegressor(random_state = 42)
print('Parameters currently in use:\n')
pprint(rf.get_params())

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces the former 'auto' option, which meant the same
# thing for regressors but has been removed from scikit-learn.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Randomized hyper-parameter search over `random_grid` with 5-fold CV.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

# Run the search on the vectorized training split, then show the winner.
rf_random.fit(Train_set, y_train)
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print error statistics for ``model`` and return ``100 - MAPE``.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_features: Features passed unchanged to ``model.predict``.
        test_labels: Ground-truth targets (converted with ``np.asarray``).

    Returns:
        float: ``100 - MAPE``, where MAPE is the mean absolute percentage
        error over the samples whose label is non-zero.  ``nan`` when every
        label is zero.
    """
    test_labels = np.asarray(test_labels)
    predictions = model.predict(test_features)
    errors = np.abs(predictions - test_labels)

    # A percentage error is undefined for a zero target.  Dividing by such
    # labels produced inf/nan for the whole score, so those samples are left
    # out of the percentage (they still count towards the average error).
    # With no zero labels the result is identical to the previous formula.
    nonzero = test_labels != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / test_labels[nonzero])
    else:
        mape = float('nan')
    accuracy = 100 - mape

    # NOTE(review): this is a regression-style score; for class labels a
    # classification accuracy may be what is actually wanted -- confirm.
    print('Model Performance')
    # The former " degrees" unit did not apply to these targets.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Fixed-parameter random-forest baseline to compare against the tuned search.
# These keyword arguments (min_samples_split, min_samples_leaf, max_features,
# max_depth, bootstrap) belong to RandomForestRegressor, not to the
# AdaBoostRegressor that was constructed here before.  max_features=1.0
# replaces the bare, undefined name `auto`.  The seed matches the other
# seeded estimators in this file.
base_model = RandomForestRegressor(n_estimators = 400, min_samples_split= 5, min_samples_leaf = 4,
                                   max_features= 1.0, max_depth=10, bootstrap=True, random_state = 42)
# Previously fitted on `train_features` / `train_labels`, which are never
# defined in this file; the vectorized training split is Train_set / y_train.
base_model.fit(Train_set, y_train)
# Score on the vectorized test matrix rather than the raw strings in x_test.
base_accuracy = evaluate(base_model, Test_set, y_test)
# The original call scored the search object here; keep that measurement
# under its own name so the baseline score is not overwritten by it.
random_accuracy = evaluate(rf_random, Test_set, y_test)

# GridSearch for RandomForest

In [None]:
# Candidate values for the exhaustive random-forest search.
param_grid = dict(
    bootstrap=[True],
    max_depth=[100, 110, 120],
    max_features=[2],
    min_samples_leaf=[3, 4],
    min_samples_split=[8, 10, 12],
    n_estimators=[200, 300, 1000],
)
# Fresh, unseeded estimator for the grid search.
rf = RandomForestRegressor()
# 5-fold exhaustive search with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the exhaustive search on the vectorized training split (Train_set,
# y_train), then show the best parameter combination it found.
grid_search.fit(Train_set, y_train)
grid_search.best_params_

In [None]:
# Score the best estimator found by the grid search.
best_grid = grid_search.best_estimator_
# Use the vectorized Test_set: the estimator was fitted on TF-IDF features
# and cannot predict from the raw strings held in x_test.
grid_accuracy = evaluate(best_grid, Test_set, y_test)

# AdaBoost Model Building

In [None]:
# TF-IDF (unigrams + bigrams, sublinear tf) feeding an AdaBoost classifier.
# min_df=1 keeps every term, exactly as the previous integer min_df=0 did,
# but is accepted by scikit-learn's parameter validation.
# NOTE(review): n_estimators=11314 looks like a dataset size rather than a
# tuned value -- confirm it is intentional.
pipeline = Pipeline([('vect', TfidfVectorizer(min_df=1, analyzer='word', ngram_range=(1, 2), sublinear_tf=True )),
                     ('clf', AdaBoostClassifier(n_estimators=11314, random_state = 42))])

In [None]:
# Fit the AdaBoost pipeline on the full training set and report how long
# the fit took, in minutes rounded to two decimals.
t = time()
model = pipeline.fit(twenty_train.data, twenty_train.target)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to build: {} mins'.format(elapsed_mins))

In [None]:
# Serialize the fitted AdaBoost pipeline.  The context manager closes (and
# flushes) the file; the previous bare open() left the handle dangling.
with open('Adaboost_model_news_newprep', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Predict on the test documents.  The previous argument,
# `twenty_test.target.data`, is the `.data` attribute of the label array,
# not the text the pipeline's vectorizer expects.
predicted = model.predict(twenty_test.data)
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))

# RandomForest Model Building

In [None]:
# TF-IDF (unigrams + bigrams, sublinear tf) feeding a random-forest
# classifier.  min_df=1 keeps every term, exactly as the previous integer
# min_df=0 did, but is accepted by scikit-learn's parameter validation.
pipeline = Pipeline([('vect', TfidfVectorizer(min_df=1, analyzer='word', ngram_range=(1, 2), sublinear_tf=True )),
                     ('clf', RandomForestClassifier(n_estimators = 1000, max_features=None,
                            max_depth=None, min_samples_split=2, min_samples_leaf=1, oob_score =False ,n_jobs = 1,
                            bootstrap = True,random_state = 42 ))])

In [None]:
# Fit the random-forest pipeline on the full training set and time it.
t = time()
model_rf = pipeline.fit(twenty_train.data, twenty_train.target)

# The message used to read "Time to clean up everything", which does not
# describe a model fit; it now matches the AdaBoost cell's wording.
print('Time to build: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Serialize the fitted random-forest pipeline.  The context manager closes
# (and flushes) the file; the previous bare open() left the handle dangling.
with open('RandomForest_model_news_newprep', 'wb') as model_file:
    pickle.dump(model_rf, model_file)

In [None]:
from sklearn import metrics
# Predict with the random-forest pipeline before reporting.  Without this
# line the report reused `predicted` from the AdaBoost section and so
# described the wrong model.
predicted = model_rf.predict(twenty_test.data)
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))