In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which entries are available under the input directory.
input_listing = os.listdir("../input")
print(input_listing)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the preprocessed dataset; later cells read its `Comment` and
# `Agg_Level` columns.
df = pd.read_csv(
    filepath_or_buffer="../input/preprocessed-with-emoji/preprocessed_with_emoji.csv"
)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Starting with the CountVectorizer/TfidfTransformer approach...
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Bag-of-words vectorizer: English stop words removed, uni- to tri-grams,
# ignoring terms present in fewer than 0.005% or more than 50% of documents.
cvec = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    min_df=5e-05,
    max_df=0.5,
)
cvec

In [None]:
# Learn the vocabulary from the raw comment text.
cvec.fit(df["Comment"])

In [None]:
# Number of distinct terms in the vocabulary learned by the fitted vectorizer.
len(cvec.vocabulary_)

In [None]:
# Transform the documents into a sparse "bag of words" count matrix
# (one row per comment, one column per vocabulary term).
cvec_counts = cvec.transform(df.Comment)

n_docs, n_terms = cvec_counts.shape
total_cells = n_docs * n_terms
# Density is the share of non-zero cells; sparsity is its complement.
# The original printed the density under the label "sparsity".
# Guard against an empty matrix so the report never divides by zero.
density_pct = 100.0 * cvec_counts.nnz / total_cells if total_cells else 0.0

print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('density: %.2f%%' % density_pct)
print('sparsity: %.2f%%' % (100.0 - density_pct))

In [None]:
# Top 20 most common terms across the whole corpus.

# Column sums of the document-term matrix give each term's total count.
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out` and fall back only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    cvec_terms = cvec.get_feature_names_out()
else:
    cvec_terms = cvec.get_feature_names()

counts_df = pd.DataFrame({'term': cvec_terms, 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

In [None]:
# Re-weight the raw term counts with TF-IDF, one weight per (document, term).

transformer = TfidfTransformer()
transformed_weights = transformer.fit(cvec_counts).transform(cvec_counts)
transformed_weights

In [None]:
# Top 20 terms by average tf-idf weight.

# Column means of the tf-idf matrix give each term's average weight.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out` and fall back only on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    cvec_terms = cvec.get_feature_names_out()
else:
    cvec_terms = cvec.get_feature_names()

weights_df = pd.DataFrame({'term': cvec_terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

In [None]:
# using the TfidfVectorizer class
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize and tf-idf weight the raw text in one step (unigrams only), then
# rank terms by their mean weight across all documents.
tvec = TfidfVectorizer(min_df=0.00005, max_df=.5, stop_words='english', ngram_range=(1,1))
tvec_weights = tvec.fit_transform(df.Comment)
weights = np.asarray(tvec_weights.mean(axis=0)).ravel().tolist()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out` and fall back only on releases that predate it.
if hasattr(tvec, 'get_feature_names_out'):
    tvec_terms = tvec.get_feature_names_out()
else:
    tvec_terms = tvec.get_feature_names()

weights_df = pd.DataFrame({'term': tvec_terms, 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(20)

In [None]:
# Inspect the tf-idf representation of the first document (row 0).
tvec_weights[0]

In [None]:
# Positional hold-out split: the first TRAIN_SIZE rows train the models and
# the remainder is kept for evaluation. The cut-off lives in one constant so
# the feature and label slices cannot drift apart.
# NOTE(review): rows are taken in file order with no shuffling or
# stratification — confirm the CSV is not sorted by label.
TRAIN_SIZE = 10000

X_train = tvec_weights[:TRAIN_SIZE]
y_train = df["Agg_Level"].iloc[:TRAIN_SIZE]

X_test = tvec_weights[TRAIN_SIZE:]
y_test = df["Agg_Level"].iloc[TRAIN_SIZE:]

# Features and labels must stay row-aligned on both sides of the split.
assert X_train.shape[0] == len(y_train)
assert X_test.shape[0] == len(y_test)

In [None]:
import sklearn
# `import sklearn` alone does not guarantee the submodule is loaded.
import sklearn.preprocessing

# Scale every row to unit L2 norm. With copy=True `normalize` returns a new
# matrix instead of modifying its input, so the result must be assigned back;
# the original discarded it, leaving the cell without effect.
# TfidfVectorizer already applies L2 row normalisation by default, so this is
# expected to be a no-op on its output; it is kept as an explicit safeguard.
X_train = sklearn.preprocessing.normalize(X_train, norm='l2', axis=1, copy=True, return_norm=False)
X_test = sklearn.preprocessing.normalize(X_test, norm='l2', axis=1, copy=True, return_norm=False)


In [None]:
# Naive bayes Model
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default settings, trained on the tf-idf rows.
clf = MultinomialNB()
clf = clf.fit(X_train, y_train)  # fit() hands back the fitted estimator

In [None]:
# Predict labels for the held-out rows with the fitted naive Bayes model.
predicted = clf.predict(X_test)


In [None]:
from sklearn import metrics
# Fraction of held-out rows the naive Bayes model labelled correctly.
metrics.accuracy_score(y_true=y_test, y_pred=predicted)

In [None]:
# # Support Vector Machine Model
# from sklearn import svm
# clf = svm.SVC().fit(X_train, y_train)


In [None]:
# predicted_svm = clf.predict(X_test)
# np.mean(predicted_svm == y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Small random forest (10 trees, entropy split criterion) with a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

In [None]:
# Held-out accuracy of the random forest.
predicted_RandomForestClassifier = classifier.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=predicted_RandomForestClassifier)

In [None]:
# using xgboost classifier
from xgboost import XGBClassifier
# fit model into training data
# Gradient-boosted trees with library defaults, scored on the held-out rows.
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
import xgboost

In [None]:
# #LightGBM Regressor
# import lightgbm
# from lightgbm import LGBMRegressor
# from lightgbm import LGBMClassifier
# model = LGBMRegressor(boosting_type='gbdt', objective='multiclass',num_class=3,
#                       num_iteration=10000,num_leaves=31,is_enable_sparse='true',
#                       tree_learner='data',min_data_in_leaf=600,max_depth=4, learning_rate=0.1, 
#                       n_estimators=675, max_bin=255, subsample_for_bin=50000, min_split_gain=5, 
#                       min_child_weight=5, min_child_samples=10, subsample=0.995, subsample_freq=1, 
#                       colsample_bytree=1, reg_alpha=.5, reg_lambda=.5, seed=0, nthread=-1, silent=True)

# #Fit to training data
# model.fit(X_train, y_train)
# #Generate Predictions
# y_pred=model.predict(X_test)

In [None]:
# classes = "0,1,2".split(',')
# subm = pd.DataFrame(y_pred, columns=classes)

In [None]:
# y_prediction = []
# for i in range(len(subm)):
#     if max(subm['0'][i], subm['1'][i], subm['2'][i]) == subm['0'][i]:
#         y_prediction.append(0)
#     elif max(subm['0'][i], subm['1'][i], subm['2'][i]) == subm['1'][i]:
#         y_prediction.append(1)
#     elif max(subm['0'][i], subm['1'][i], subm['2'][i]) == subm['2'][i]:
#         y_prediction.append(2)
        
# np.mean(y_prediction == y_test)        

In [None]:
from sklearn.linear_model import LogisticRegression
# Multinomial logistic regression fitted with the L-BFGS solver.
smreg_model = LogisticRegression(solver='lbfgs', multi_class="multinomial")
smreg_model.fit(X_train, y_train)

# Held-out accuracy of the logistic regression model.
predicted_LR = smreg_model.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=predicted_LR)

In [None]:
# def cross_validate_xgb(params, x_train, y_train, kf, verbose=True, verbose_eval=50):
#     start_time=time.time()
#     nround=[]
#     # the prediction matrix need to contains 3 columns, one for the probability of each class
#     train_pred = np.zeros((x_train.shape[0],3))
    
#     # use the k-fold object to enumerate indexes for each training and validation fold
#     for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
#         x_train_kf, x_val_kf = x_train[train_index, :], x_train[val_index, :]
#         y_train_kf, y_val_kf = y_train[train_index], y_train[val_index]
        
#         d_train = xgboost.DMatrix(x_train_kf, y_train_kf)
#         d_val=xgboost.DMatrix(x_val_kf, y_val_kf)

#         watchlist= [(d_train, "train"), (d_val, 'val')]
#         bst = xgboost.train(params=params, dtrain=d_train, num_boost_round=3000, early_stopping_rounds=100,
#                             evals=watchlist, verbose_eval=verbose_eval)        
        
#         y_val_kf_preds=bst.predict(d_val, ntree_limit=bst.best_ntree_limit)
#         nround.append(bst.best_ntree_limit)
        
#         train_pred[val_index] += y_val_kf_preds
        
#         fold_cv = log_loss(y_val_kf, y_val_kf_preds)
#         if verbose:
#             print('fold cv {} log_loss score is {:.6f}'.format(i, fold_cv))
        
#     cv_score = log_loss(y_train, train_pred)
    
#     if verbose:
#         print('cv log_loss score is {:.6f}'.format(cv_score))    
#         end_time = time.time()
#         print("it takes %.3f seconds to perform cross validation" % (end_time - start_time))
#     return cv_score # for the purpose of bayesian optimisation, we only need to return the CV score

In [None]:
# from sklearn.metrics import log_loss
# from sklearn.model_selection import cross_val_score, cross_val_predict
# from sklearn.model_selection import StratifiedKFold
# from bayes_opt import BayesianOptimization
# import time
# # the bayesian optimisation library throws a lot of warning message, so for readability we disable warning in this notebook.
# # *NOT* encouraged if you want to find out what is going on under the cover :) 
# import warnings
# warnings.filterwarnings("ignore") 

# # Input data files are available in the "../input/" directory.
# # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

# # Any results you write to the current directory are saved as output.

In [None]:
# params={'max_depth':(9,15),
#         'learning_rate':(0.0525,0.0544),
#         'subsample': (0.760, 0.790),
#         'colsample_bytree': (0.4087, 0.4163)
#        }

In [None]:
# # reload(xgb_wrapper)
# def xgbcv_func(max_depth, learning_rate, subsample, colsample_bytree, nthread=4, seed=0):
#     params = {
#         "objective" : "multi:softprob",
#         "num_class" : 3,
#         "tree_method" : "auto",
#         "eval_metric" : "mlogloss",
#         "nthread": nthread,
#         "seed" : 0,
#         'silent': 1,

#         "eta":learning_rate,  # default 0.3
#         "max_depth" : int(max_depth), # default 6
#         "subsample" : subsample, # default 1
#         "colsample_bytree" : colsample_bytree, # default 1
#     }
    
#     # for a more ideal out-of-fold model prediction for this dataset, we use 10-fold CV
#     kf=StratifiedKFold(n_splits=10, shuffle=True, random_state=2017)
    
#     # we will disable all the verbose setting in this functional call, so that we don't have too much information 
#     # to read during the bayesian optimisation process.
#     return 1-cross_validate_xgb(params, X_train, y_train, kf, verbose=False, verbose_eval=False)

In [None]:
# xgb_bo=BayesianOptimization(xgbcv_func, params)

In [None]:
# xgb_bo.maximize(init_points=5, n_iter=50)

In [None]:
#  Value  |   colsample_bytree |   learning_rate |   max_depth |   subsample |
# 0.07329 |             0.4163 |          0.0525 |      9.4621 |      0.7490 |
# 0.07432 |             0.4045 |          0.0522 |      8.7714 |      0.7610 |

In [None]:
# Hyper-parameters handed to XGBClassifier in the cells that follow.
xgb_parameters = dict(
    # fixed settings
    objective="multi:softprob",
    num_class=3,
    tree_method="auto",
    eval_metric="mlogloss",
    nthread=5,
    seed=0,
    silent=1,
    # tuned settings
    eta=0.0522,               # default 0.3
    max_depth=9,              # default 6
    subsample=0.7610,         # default 1
    colsample_bytree=0.4045,  # default 1
    gamma=0.5,
)

In [None]:
# Train XGBoost with the parameters above and score it on the held-out rows.
model = XGBClassifier(**xgb_parameters)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Same configuration, scored on the rows it was trained on (training accuracy).
model = XGBClassifier(**xgb_parameters)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
np.mean(y_train == y_pred)