In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

np.random.seed(93)

In [0]:
import sys
sys.path.append('/content/drive/My Drive/Adv_PY/Final_Project')
from util import preprocess_text, shuffle_dataset, split_data

In [0]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
filename = '/content/drive/My Drive/Adv_PY/Final_Project/data.txt'

read_file = df = pd.read_table(filename, sep='\t',header=None,names=['label','msg'])
read_file.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
# create train, test sets
with open(filename, encoding='utf-8') as f:
  texts = f.read().splitlines()

labels = []
corpus = []
for text in texts:
    label, msg = preprocess_text(text)
    labels.append(label)
    corpus.append(msg)

train, test = split_data(corpus, labels, 0.2)
y_train = np.asarray(train[1]).astype('int32').reshape((-1,1))
y_test = np.asarray(test[1]).astype('int32').reshape((-1,1))

In [0]:
# feture extraction using countvectorizer and tfidfvectorizer

def count_vec(train, test):
    count_vector = CountVectorizer()
    train_vec = count_vector.fit_transform(train)
    test_vec = count_vector.transform(test)
    return train_vec, test_vec


def tfidf_vec(train, test):
    tfidf_vec = TfidfVectorizer()
    train_vec = tfidf_vec.fit_transform(train)
    test_vec = tfidf_vec.transform(test)
    return train_vec, test_vec

In [0]:
# define models

def MN_NB():
    clf = MultinomialNB()

    return clf


def RF():
    parameters1 = {'n_estimators':[n for n in range(50, 300, 50)], 
                   'criterion':["gini", "entropy"], 
                   'max_depth':(None, 4, 8, 12, 16, 20, 24, 50),
                   'min_samples_split': (2, 4, 6, 8, 10, 20, 30),
                   'min_samples_leaf': (16, 4, 12)}

    clf = GridSearchCV(RandomForestClassifier(), 
                       parameters1, 
                       cv=5, 
                       n_jobs=-1, 
                       scoring="accuracy")
    
    return clf


def xgb(X_train, y_train, X_test):
    num_of_runs = 10
    if not os.path.exists('optimization_result.csv'):
        os.mknod('optimization_result.csv')
        
    optimization_output_path = "optimization_result.csv"
    
    hyper_list = []
    n = 1
    for i in range(num_of_runs):
        print(f"Training model {n} out of {num_of_runs}")
        learning_rate = np.random.uniform(0.001, 0.15)
        max_depth = np.random.choice([3, 4, 5, 6])
        n_estimators = np.random.randint(low=50, high=180)
        subsample = min(np.random.uniform(0.6, 1.1), 1.0)
        colsample_bytree = min(np.random.uniform(0.6, 1.1), 1.0)

        params = {'learning_rate': learning_rate,                                   
                  'max_depth': max_depth,                                           
                  'n_estimators': n_estimators,
                  'subsample': subsample,                                           
                  'colsample_bytree': colsample_bytree}                  
        print(params)

        clf = XGBClassifier(learning_rate=learning_rate,
                                 objective='binary:logistic',
                                 random_state=42,
                                 n_jobs=8,
                                 n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 subsample=subsample,
                                 colsample_bytree=colsample_bytree)
        
        clf.fit(X_train, y_train)

        preds = clf.predict(X_test)
        print(classification_report(y_test, preds))
        acc = accuracy_score(y_test, preds)
        params['acc'] = acc

        hyper_list.append(pd.DataFrame(params, index=[0]))
        n = n + 1

    hyper_df = pd.concat(hyper_list)
    hyper_df.sort_values('acc', inplace=True, ascending=False)
    hyper_df.reset_index(drop=True, inplace=True)
    hyper_df.to_csv(optimization_output_path)

    best_clf = XGBClassifier(learning_rate=hyper_df['learning_rate'][0],
                             objective='binary:logistic', 
                             random_state=42,
                             n_jobs=8, 
                             n_estimators=hyper_df['n_estimators'][0], 
                             max_depth=hyper_df['max_depth'][0], 
                             subsample=hyper_df['subsample'][0],
                             colsample_bytree=hyper_df['colsample_bytree'][0])
    
    return best_clf

In [0]:
# fit and evaluate model
def fit_eval(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    print(f"accuracy score: {accuracy_score(y_test, preds)}")
    print("\n=========================\n")

In [0]:
train_count, test_count = count_vec(train[0], test[0])
train_tf, test_tf = tfidf_vec(train[0], test[0])

### Train and evaluate models

In [0]:
naive_bayes = MN_NB()
print("CountVectorizer:")
fit_eval(naive_bayes, train_count, y_train, test_count, y_test)
print("TfidfVectorizer:")
fit_eval(naive_bayes, train_tf, y_train, test_tf, y_test)

CountVectorizer:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       984
           1       0.96      0.96      0.96       130

    accuracy                           0.99      1114
   macro avg       0.98      0.98      0.98      1114
weighted avg       0.99      0.99      0.99      1114

accuracy score: 0.9910233393177738
TfidfVectorizer:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       984
           1       1.00      0.74      0.85       130

    accuracy                           0.97      1114
   macro avg       0.98      0.87      0.92      1114
weighted avg       0.97      0.97      0.97      1114

accuracy score: 0.9694793536804309


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [0]:
random_forest = RF()
print("CountVectorizer:")
fit_eval(random_forest, train_count, y_train, test_count, y_test)
print("TfidfVectorizer:")
fit_eval(random_forest, train_tf, y_train, test_tf, y_test)

CountVectorizer:


  self.best_estimator_.fit(X, y, **fit_params)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       984
           1       1.00      0.82      0.90       130

    accuracy                           0.98      1114
   macro avg       0.99      0.91      0.94      1114
weighted avg       0.98      0.98      0.98      1114

accuracy score: 0.9784560143626571
TfidfVectorizer:


  self.best_estimator_.fit(X, y, **fit_params)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       984
           1       1.00      0.82      0.90       130

    accuracy                           0.98      1114
   macro avg       0.99      0.91      0.95      1114
weighted avg       0.98      0.98      0.98      1114

accuracy score: 0.9793536804308797


In [0]:
xgb = xgb(train_count, y_train, test_count)
print("CountVectorizer:")
fit_eval(xgb, train_count, y_train, test_count, y_test)

Training model 1 out of 10
{'learning_rate': 0.03415269705840413, 'max_depth': 3, 'n_estimators': 85, 'subsample': 0.6961938113769657, 'colsample_bytree': 0.8260392830802796}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97       984
           1       0.96      0.58      0.72       130

    accuracy                           0.95      1114
   macro avg       0.95      0.79      0.85      1114
weighted avg       0.95      0.95      0.94      1114

Training model 2 out of 10
{'learning_rate': 0.0841867313434725, 'max_depth': 4, 'n_estimators': 60, 'subsample': 0.8140993803492083, 'colsample_bytree': 0.9015587924211335}
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       984
           1       0.96      0.69      0.80       130

    accuracy                           0.96      1114
   macro avg       0.96      0.84      0.89      1114
weighted avg       0.96      0.96      0.96      1114

Training model 3 out of 10
{'learning_rate': 0.1012357493531761, 'max_depth': 3, 'n_estimators': 175, 'subsample': 0.770782216545549, 'colsample_bytree': 1.0}
             

In [0]:
xgb = xgb(train_tf, y_train, test_tf)
print("CountVectorizer:")
fit_eval(xgb, train_tf, y_train, test_tf, y_test)

Training model 1 out of 10
{'learning_rate': 0.05881570231084641, 'max_depth': 4, 'n_estimators': 174, 'subsample': 1.0, 'colsample_bytree': 0.8419389924711405}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.97      1.00      0.98       984
           1       0.98      0.78      0.87       130

    accuracy                           0.97      1114
   macro avg       0.98      0.89      0.93      1114
weighted avg       0.97      0.97      0.97      1114

Training model 2 out of 10
{'learning_rate': 0.028490552790031938, 'max_depth': 6, 'n_estimators': 167, 'subsample': 0.7656194119285389, 'colsample_bytree': 1.0}
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       984
           1       0.95      0.78      0.86       130

    accuracy                           0.97      1114
   macro avg       0.96      0.89      0.92      1114
weighted avg       0.97      0.97      0.97      1114

Training model 3 out of 10
{'learning_rate': 0.0315608273925203, 'max_depth': 6, 'n_estimators': 179, 'subsample': 0.6429352779375384, 'colsample_bytree': 1.0}
              precision 