<a href="https://colab.research.google.com/github/WilliamYkZhang/COMP551_A2/blob/master/model_selection_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Standard library (used to write out the final parameters)
import csv
import datetime
import pickle

# Data handling and NLP resources
import nltk
import pandas as pd
from nltk.corpus import stopwords

# Preprocessing and model selection
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

# Transformers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Download the NLTK stopword corpus that stopwords.words("english") reads.
# `nltk` is imported in the notebook's import cell, so this cell only
# performs the download. The call stays the last expression so its boolean
# result is still shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# Seed passed to every estimator below that accepts `random_state`, so that
# repeated runs of the notebook give the same results.
RANDOM_STATE = 42

# English stopword list.
# It is bound to its own name on purpose: assigning the list back to
# `stopwords` would replace the NLTK corpus reader, and running this cell a
# second time would then fail with AttributeError (a list has no `.words`).
english_stopwords = stopwords.words("english")

# Transformers
# Both vectorizers lowercase the text, strip accents, drop undecodable bytes,
# tokenize on words and remove the English stopwords.
c_vect = CountVectorizer(
    lowercase=True,
    encoding="utf-8",
    decode_error="ignore",
    strip_accents='unicode',
    stop_words=english_stopwords,
    analyzer="word",
)
tfidf_vect = TfidfVectorizer(
    lowercase=True,
    encoding="utf-8",
    decode_error='ignore',
    strip_accents='unicode',
    stop_words=english_stopwords,
    analyzer="word",
)
tfidf_trans = TfidfTransformer()
svd = TruncatedSVD(random_state=RANDOM_STATE)
nml = Normalizer()

# Estimators
log_reg = LogisticRegression()
svc = SVC()  # TODO: experiment with class_weight values
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', random_state=RANDOM_STATE)
decision_tree_clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
rff = RandomForestClassifier(random_state=RANDOM_STATE)
multi_NB = MultinomialNB()



In [0]:
# Candidate pipelines.
# Each one chains a text vectorizer, optional reduction / normalisation /
# selection steps, and a final classifier registered under the step name
# 'clf'. The step names are the prefixes used by the parameter grids.
# verbose=True makes every step report its elapsed time while fitting.
#
# NOTE(review): the two SVD pipelines end in MultinomialNB, which expects
# non-negative features, while TruncatedSVD output can contain negative
# values -- confirm these pipelines actually fit before searching over them.
pipeline_cvect = Pipeline(
    steps=[
        ('cvect', c_vect),
        ('clf', multi_NB),
    ],
    verbose=True,
)
pipeline_cvect_svd = Pipeline(
    steps=[
        ('cvect', c_vect),
        ('svd', svd),
        ("nml", nml),
        ('clf', multi_NB),
    ],
    verbose=True,
)
pipeline_tfidf = Pipeline(
    steps=[
        ('tfidf', tfidf_vect),
        ('clf', xgb_clf),
    ],
    verbose=True,
)
pipeline_tfidf_svd = Pipeline(
    steps=[
        ('tfidf', tfidf_vect),
        ('svd', svd),
        ("nml", nml),
        ('clf', multi_NB),
    ],
    verbose=True,
)
pipeline_cvect_tfidf = Pipeline(
    steps=[
        ('cvect', c_vect),
        ('tfidf', tfidf_trans),
        ('kbest', SelectKBest()),
        ('clf', multi_NB),
    ],
    verbose=True,
)

# Parameter grids for the pipelines.
# Keys follow scikit-learn's '<step name>__<parameter>' convention.

# Grid for pipeline_cvect (steps 'cvect' and 'clf').
# 'cvect__max_df' used to be listed twice in this literal; Python keeps only
# the last value for a repeated key, so the earlier (0.5, 0.75, 1.0) entry was
# never searched. The dead entry is removed and the effective value kept.
parameters_cvect = {
    'cvect__max_df': (0.5, 0.75, 0.9),  # ignore terms whose document frequency is strictly higher than the threshold
    'cvect__max_features': (None, 5000, 10000, 50000),
    'cvect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams plus bigrams
    'cvect__min_df': (0.025, 0.05, 0.1),  # ignore terms whose document frequency is strictly lower than the threshold
    'clf__alpha': (0.25, 0.5, 0.75),
    'clf__fit_prior': (True, False),
    }

# Grid for pipeline_cvect_svd (steps 'cvect', 'svd', 'nml' and 'clf').
# 'cvect__max_df' used to be listed twice in this literal; Python keeps only
# the last value for a repeated key, so the earlier (0.5, 0.75, 1.0) entry was
# never searched. The dead entry is removed and the effective value kept.
parameters_cvect_svd = {
    'cvect__max_df': (0.5, 0.75, 0.9),  # ignore terms whose document frequency is strictly higher than the threshold
    'cvect__max_features': (None, 5000, 10000, 50000),
    'cvect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams plus bigrams
    'cvect__min_df': (0.025, 0.05, 0.1),  # ignore terms whose document frequency is strictly lower than the threshold
    # NOTE(review): the number of SVD components cannot exceed the number of
    # features the vectorizer produces -- confirm the larger values are
    # reachable with the min_df / max_features settings above.
    'svd__n_components': (10, 500, 1000, 2500, 5000, 7500, 10000),
    'svd__algorithm': ("arpack", "randomized"),
    'nml__norm': ('l1', 'l2', 'max'),
    'clf__alpha': (0.25, 0.5, 0.75),
    'clf__fit_prior': (True, False),
    }

# Grid for pipeline_tfidf: only the 'tfidf' step is tuned, the classifier
# keeps the settings it was constructed with.
# 4 * 2 * 3 * 3 * 3 * 2 * 2 = 864 parameter combinations in total.
parameters_tfidf = dict(
    # cap on vocabulary size (None = no cap)
    tfidf__max_features=(None, 10000, 25000, 50000),
    # enable inverse-document-frequency reweighting
    tfidf__use_idf=(True, False),
    # ignore terms whose document frequency is strictly higher than the threshold
    tfidf__max_df=(0.5, 0.75, 0.9),
    # ignore terms whose document frequency is strictly lower than the threshold
    tfidf__min_df=(0.025, 0.05, 0.1),
    # norm applied to each output row (None = no normalisation)
    tfidf__norm=('l1', 'l2', None),
    # add one to document frequencies, as if an extra document containing
    # every term once had been seen; prevents zero divisions
    tfidf__smooth_idf=(True, False),
    # n-grams to extract: unigrams only, or unigrams plus bigrams
    tfidf__ngram_range=((1, 1), (1, 2)),
)

# Grid for pipeline_tfidf_svd (steps 'tfidf', 'svd', 'nml' and 'clf').
parameters_tfidf_svd = dict(
    # --- 'tfidf' step ---
    # cap on vocabulary size (None = no cap)
    tfidf__max_features=(None, 10000, 25000, 50000),
    # enable inverse-document-frequency reweighting
    tfidf__use_idf=(True, False),
    # ignore terms whose document frequency is strictly higher than the threshold
    tfidf__max_df=(0.5, 0.75, 0.9),
    # ignore terms whose document frequency is strictly lower than the threshold
    tfidf__min_df=(0.025, 0.05, 0.1),
    # norm applied to each output row (None = no normalisation)
    tfidf__norm=('l1', 'l2', None),
    # add one to document frequencies, as if an extra document containing
    # every term once had been seen; prevents zero divisions
    tfidf__smooth_idf=(True, False),
    # n-grams to extract: unigrams only, or unigrams plus bigrams
    tfidf__ngram_range=((1, 1), (1, 2)),
    # --- 'svd' step ---
    svd__n_components=(10, 500, 1000, 2500, 5000, 7500, 10000),
    svd__algorithm=("arpack", "randomized"),
    # --- 'nml' step ---
    nml__norm=('l1', 'l2', 'max'),
    # --- 'clf' step ---
    clf__alpha=(0.25, 0.5, 0.75),
    clf__fit_prior=(True, False),
)

# Grid for pipeline_cvect_tfidf (steps 'cvect', 'tfidf', 'kbest' and 'clf';
# the 'kbest' step is not tuned here).
parameters_cvect_tfidf = {
    # Stray characters in this tuple previously made the whole cell a SyntaxError.
    'cvect__max_features': (None, 1000, 5000, 10000),
    'cvect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams plus bigrams
    'cvect__max_df': (0.75, 0.9),  # ignore terms whose document frequency is strictly higher than the threshold
    'cvect__min_df': (0.05, 0.1),  # ignore terms whose document frequency is strictly lower than the threshold
    'tfidf__use_idf': (True, False),  # enable inverse-document-frequency reweighting
    'tfidf__norm': ('l1', 'l2', None),  # norm applied to each output row (None = no normalisation)
    'tfidf__smooth_idf': (True, False),  # add one to document frequencies, as if an extra document containing every term once had been seen; prevents zero divisions
    # The TfidfTransformer parameter is named `sublinear_tf`; the former key
    # 'tfidf__sublinear' does not match any parameter of that step.
    'tfidf__sublinear_tf': (True, False),
    'clf__alpha': (0.25, 0.5, 0.75),
    'clf__fit_prior': (True, False),
}

In [0]:
# Read the two preprocessed training sets (Snowball-stemmed and
# WordNet-lemmatized variants) from the project repository.
#
# The URLs no longer carry a `?token=...` query string: an access token is a
# credential and must not be hardcoded in a shared notebook.
# NOTE(review): this assumes the repository is publicly readable. If it is
# not, download the CSVs next to the notebook (or provide credentials outside
# the notebook) and point DATA_BASE_URL at that location instead.
DATA_BASE_URL = "https://raw.githubusercontent.com/WilliamYkZhang/COMP551_A2/master"
stemmed_df = pd.read_csv(DATA_BASE_URL + "/preprocessed_reddit_train_SnowballStemmer.csv")
lemmatized_df = pd.read_csv(DATA_BASE_URL + "/preprocessed_reddit_train_WordNetLemmatizer.csv")

# Separate X and Y: the "cleaned" column is the model input and the "label"
# column is the target, for each variant of the data.
X_stem = stemmed_df["cleaned"]
y_stem = stemmed_df["label"]
X_lemma = lemmatized_df["cleaned"]
y_lemma = lemmatized_df["label"]

In [6]:
# Use 5-fold grid-search cross validation over pipeline_tfidf to find the best
# feature-extraction settings, fitting on the lemmatized data.
#
# Cost warning: parameters_tfidf spans 864 combinations, i.e. 864 * 5 = 4320
# pipeline fits before the final refit. Each fit of the 'clf' step in the
# captured log took between roughly 40 seconds and 1.2 minutes.
gs_CV = GridSearchCV(pipeline_tfidf, param_grid=parameters_tfidf, cv=5)

# Announce the search before it starts; these lines used to be printed only
# after fit() had already returned, so nothing identified the run in progress.
print("Performing grid search...")
print("Pipeline: ", [name for name, _ in pipeline_tfidf.steps])

gs_CV.fit(X_lemma, y_lemma)

# Report the best mean cross-validated score, the winning grid point, and the
# full parameter set of the refitted best pipeline.
print("Best parameter (CV score={0:.3f}):".format(gs_CV.best_score_))
print("Best parameters set: {} \nBest estimator parameters {}.".format(gs_CV.best_params_, gs_CV.best_estimator_.get_params()))


[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.8s
[Pipeline] ............... (step 2 of 2) Processing clf, total= 1.1min
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.8s
[Pipeline] ............... (step 2 of 2) Processing clf, total= 1.2min
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.8s
[Pipeline] ............... (step 2 of 2) Processing clf, total= 1.2min
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.7s
[Pipeline] ............... (step 2 of 2) Processing clf, total= 1.1min
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.8s
[Pipeline] ............... (step 2 of 2) Processing clf, total= 1.1min
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.8s
[Pipeline] ............... (step 2 of 2) Processing clf, total=  37.1s
[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   1.8s
[Pipeline] ............... (step 2 of 2) Processing clf, total=  38.0s
[Pipel

KeyboardInterrupt: ignored

KeyboardInterrupt: ignored