In [102]:
import time
import numpy as np
import pandas as pd
import warnings
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import text
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import tree, svm, naive_bayes,neighbors
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [103]:
### Models
# One pre-configured estimator per candidate algorithm. Every estimator that
# draws random numbers gets a fixed random_state so that repeated runs of the
# notebook produce the same scores and predictions.

# Neural network (multi-layer perceptron)
# Alternative configuration kept for reference:
# MLP = MLPClassifier(solver='adam',activation = 'relu', alpha=0.001, tol=0.002, max_iter = 200, hidden_layer_sizes = (100,10),random_state = 1,verbose = True)
MLP = MLPClassifier(solver='adam',activation = 'logistic', tol=0.015, hidden_layer_sizes = (70,), random_state = 1,verbose = True)

# Linear Support Vector Machine (primal formulation)
LSVC = svm.LinearSVC(dual = False)

# Gaussian Naive Bayes
GNB = naive_bayes.GaussianNB()

# Multinomial Naive Bayes
MNB = naive_bayes.MultinomialNB()

# Logistic Regression
LRC = LogisticRegression(multi_class='multinomial')

# K-Nearest Neighbors (votes weighted by inverse distance)
KNN = neighbors.KNeighborsClassifier(weights='distance')

# Decision Tree
DTC = tree.DecisionTreeClassifier(random_state=0)

# Random Forest
RDTC = RandomForestClassifier(n_estimators=50, random_state=0)

# AdaBoost
ADA = AdaBoostClassifier(n_estimators=50, random_state=0)

# Gradient Boosting classifier (depth-1 trees, i.e. decision stumps)
GBRT = GradientBoostingClassifier(n_estimators=50, learning_rate=1.0,max_depth=1, random_state=0)

# Bagged Decision Trees (each tree sees half of the samples and half of the features)
BDTC = BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5,max_features=0.5, random_state=0)

In [104]:
### Data Preprocessing
def data_preprocess(train_file, test_data):
    """Load the train/test CSV files and convert their text to dense TF-IDF features.

    Parameters
    ----------
    train_file : str or file-like
        Passed to ``pd.read_csv``. Column 0 is used as the text and column 1
        as the label.
    test_data : str or file-like
        Passed to ``pd.read_csv``. Column 1 is used as the text. Note that
        this differs from the training file, where the text is column 0.

    Returns
    -------
    vectors_train : numpy.ndarray
        Dense TF-IDF matrix of the training text (at most 20000 columns).
    y_train : numpy.ndarray
        Training labels taken from column 1 of ``train_file``.
    vectors_test : numpy.ndarray
        Dense TF-IDF matrix of the test text, built with the vocabulary
        learned from the training text.
    """
    # load
    data_train = np.array(pd.read_csv(train_file))
    data_test = np.array(pd.read_csv(test_data))
    X_train = data_train[:,0]
    y_train = data_train[:,1]
    X_test = data_test[:,1]

    # Vectorizer and Normalizer
    # 'english' selects scikit-learn's built-in English stop-word list (the
    # same words as text.ENGLISH_STOP_WORDS). The documented values for
    # stop_words are 'english', a list or None, so the string form is used
    # instead of handing over the frozenset itself.
    vectorizer = text.TfidfVectorizer(max_features = 20000, binary = False, stop_words = 'english')
    # Normalizer is stateless, so it can transform without a prior fit.
    # NOTE(review): TfidfVectorizer already L2-normalises rows by default, so
    # this pass is expected to be a no-op; kept to preserve the pipeline.
    normalizer_train = preprocessing.Normalizer()

    # The vocabulary and IDF weights are learned from the training text only.
    vectors_train = vectorizer.fit_transform(X_train)
    vectors_test = vectorizer.transform(X_test)
    # toarray() is the explicit spelling of the `.A` shorthand. The result is
    # dense, so memory grows with n_samples * n_features.
    vectors_train = normalizer_train.transform(vectors_train).toarray()
    vectors_test = normalizer_train.transform(vectors_test).toarray()

    return vectors_train, y_train, vectors_test

In [105]:
def main(model, train_file, test_data, numFolds = 10):
    """Cross-validate, fit and apply one model, printing a report along the way.

    Parameters
    ----------
    model : estimator
        Object that works with ``cross_val_score`` and offers ``fit`` and
        ``predict``.
    train_file, test_data :
        Forwarded unchanged to ``data_preprocess``.
    numFolds : int, default 10
        Value passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    The array returned by ``model.predict`` on the test features.
    """
    t_begin = time.time()

    print(model)

    # Build the feature matrices and the training labels.
    features_train, labels_train, features_test = data_preprocess(train_file, test_data)

    # Mean accuracy over the cross-validation folds.
    fold_scores = cross_val_score(model, features_train, labels_train, cv=numFolds, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    print('\nThe average accuracy of {}-fold cross validation is: {:.5f}'.format(numFolds, mean_accuracy))

    # Refit on all training rows; the report below is computed on those same
    # rows, so it describes training-set performance.
    model.fit(features_train, labels_train)
    train_report = metrics.classification_report(labels_train, model.predict(features_train))
    print('\nPerformance metrics:\n', train_report)

    # Label the test rows.
    predictions = model.predict(features_test)
    print('\nPredicted labels of the test dataset:\n', predictions)

    # Wall-clock time for everything above, including preprocessing.
    elapsed = time.time() - t_begin
    print('\nRun time: {:.2f}s\n'.format(elapsed))

    return predictions

In [None]:
# Run the full pipeline once per selected model; Test_pred holds one array of
# test-set predictions per model, in the same order.
Test_pred = [
    main(model, train_file='train.csv', test_data='test.csv', numFolds=10)
    for model in (MLP,)
]

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(70,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.015, validation_fraction=0.1, verbose=True,
              warm_start=False)
Iteration 1, loss = 1.99026767
Iteration 2, loss = 1.87940739
Iteration 3, loss = 1.79450551
Iteration 4, loss = 1.67974020
Iteration 5, loss = 1.52881570
Iteration 6, loss = 1.35088668
Iteration 7, loss = 1.16670612
Iteration 8, loss = 0.99724847
Iteration 9, loss = 0.85256788
Iteration 10, loss = 0.73324030
Iteration 11, loss = 0.63494955
Iteration 12, loss = 0.55431074
Iteration 13, loss = 0.48650422
Iteration 14, loss = 0.42897034
Iteration 15, loss = 0.37962112
Itera

In [96]:
# Save to CSV
# Write the first model's test predictions as (id, subreddit) rows, where id is
# the 0-based row position. Test_pred is read but not overwritten, so this cell
# can be re-run safely, and the built-in id() is not shadowed. Building the
# frame column by column also keeps the ids as integers instead of forcing
# ids and labels into one common dtype.
first_model_pred = Test_pred[0]
save = pd.DataFrame({
    'id': np.arange(len(first_model_pred)),
    'subreddit': first_model_pred,
})
save.to_csv("Test_pred.csv", index=False)