<a href="https://colab.research.google.com/github/WilliamYkZhang/COMP551_A2/blob/master/model_selection_log_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Standard library (used to write out the final params)
import csv
import datetime
import pickle

# Preprocessing
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

# Transformers
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# Models
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Fetch the NLTK stop-word corpus so the English stop-word list can be read
# in the next cell. `nltk` is imported with the other dependencies in the
# first cell. The call is left as the last expression so its result is shown.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# English stop-word list shared by both vectorizers.
# The corpus is read through `nltk.corpus` instead of the imported
# `stopwords` name: this cell rebinds `stopwords` to the resulting list, so
# calling `stopwords.words(...)` on a second run of the cell would raise
# AttributeError. Going through `nltk.corpus` keeps the cell re-runnable while
# leaving `stopwords` bound to the list for any later cell that relies on it.
stopwords = nltk.corpus.stopwords.words("english")

# Decoding / tokenisation options that the two vectorizers have in common.
_vectorizer_options = dict(
    lowercase=True,
    encoding="utf-8",
    decode_error="ignore",
    strip_accents="unicode",
    stop_words=stopwords,
    analyzer="word",
)

# Transformers
c_vect = CountVectorizer(**_vectorizer_options)
tfidf_vect = TfidfVectorizer(**_vectorizer_options)
tfidf_trans = TfidfTransformer()
svd = TruncatedSVD()
nml = Normalizer()

# Estimators (all created with library defaults unless noted)
log_reg = LogisticRegression()
svc = SVC()  # TODO: experiment with class_weight values
xgb_clf = xgb.XGBClassifier(objective='multi:softmax')
decision_tree_clf = DecisionTreeClassifier()
rff = RandomForestClassifier()
multi_NB = MultinomialNB()



In [0]:
# Two-step pipeline: TF-IDF vectorisation of the raw text followed by the
# XGBoost classifier. verbose=True makes the pipeline report each step as it
# is fitted.
pipeline_tfidf = Pipeline(
    steps=[
        ("tfidf", tfidf_vect),
        ("clf", xgb_clf),
    ],
    verbose=True,
)

# Candidate values for the "tfidf" step, keyed as "<step>__<parameter>".
parameters_tfidf = {
    # Vocabulary size cap; None keeps every term.
    "tfidf__max_features": (None, 10000, 25000, 50000),
    # Enable inverse-document-frequency reweighting.
    "tfidf__use_idf": (True, False),
    # Ignore terms whose document frequency is strictly higher than this threshold.
    "tfidf__max_df": (0.5, 0.75, 0.9),
    # Ignore terms whose document frequency is strictly lower than this threshold.
    "tfidf__min_df": (0.025, 0.05, 0.1),
    # Norm applied to each output vector (None leaves the vectors unnormalised).
    "tfidf__norm": ("l1", "l2", None),
    # Smooth idf weights by adding one to document frequencies, as if an extra
    # document containing every term exactly once had been seen; prevents
    # zero divisions.
    "tfidf__smooth_idf": (True, False),
    # n-gram sizes to extract: unigrams only, or unigrams plus bigrams.
    "tfidf__ngram_range": ((1, 1), (1, 2)),
}