In [32]:
# Reset variables: clear the interactive namespace so the notebook starts from
# a clean state even when the kernel has been used before.
# `run_line_magic` replaces the deprecated `InteractiveShell.magic` entry
# point; the magic name and its flags (`-sf`) are unchanged.
from IPython import get_ipython
get_ipython().run_line_magic('reset', '-sf')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing # To get MinMax Scaler function

# enable multiple outputs per cell: with ast_node_interactivity set to "all",
# every top-level expression in a cell is displayed, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# hide warnings
# NOTE(review): this filter ignores every warning category for the rest of the
# session; presumably that includes any convergence warnings raised by the
# estimators fitted later in the notebook -- consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [33]:
# test out various models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [34]:
# Load the three pre-extracted feature tables (train / test / validation).
def _load_split(csv_path):
    """Read one extracted-feature CSV and drop every row that contains a NaN."""
    return pd.read_csv(csv_path, sep=',', encoding='utf-8').dropna()


df_train = _load_split('./datasets/train_extracted.csv')
df_test = _load_split('./datasets/test_extracted.csv')
df_valid = _load_split('./datasets/valid_extracted.csv')

# `df` is a second name for the training frame (same object, not a copy).
df = df_train

# Seed NumPy's global random generator.
seed = 5
np.random.seed(seed)

In [35]:
# Change the label into a binary classification target: values >= 0.5 become
# 1, values < 0.5 become 0. The assignment uses a single `.loc[mask, column]`
# call on the frame itself. The previous chained form
# (`frame.label.loc[mask] = value`) writes into an intermediate Series and is
# not guaranteed to update the frame (it does not under pandas Copy-on-Write).
for split in (df_train, df_test, df_valid):
    # Both masks are computed before any value is overwritten.
    is_positive = split['label'] >= 0.5
    is_negative = split['label'] < 0.5
    split.loc[is_positive, 'label'] = 1
    split.loc[is_negative, 'label'] = 0

# Pick out the target column of each split.
df_train_target = df_train['label']
df_test_target = df_test['label']
df_valid_target = df_valid['label']

In [36]:
# Columns kept for prediction; the list was generated by the
# feature-selection-wrapper forward selection method.
prediction_feature = [
    'num_-', 'num_"', 'num_$',
    'count_uppercase', 'count_pronoun', 'count_article',
    'count_discrep', 'count_affiliation', 'count_focuspast', 'count_money',
]

# Take the same column subset from every split.
df_train_feature, df_valid_feature, df_test_feature = (
    frame[prediction_feature] for frame in (df_train, df_valid, df_test)
)

In [37]:
# Min-max scale the selected features.
# The scaler is fitted on the training split ONLY and then applied unchanged
# to the validation and test splits. Re-fitting on each split (the previous
# `fit_transform` calls) gave every split its own min/max, so the same raw
# value mapped to different scaled values and evaluation data leaked into the
# preprocessing step.
min_max_scaler = preprocessing.MinMaxScaler()

# Column names are kept, and so is the original row index, so the scaled
# frames stay aligned with the label Series taken from the same data frames
# (whose index is no longer contiguous after `dropna`).
df_train_feature_scaled = pd.DataFrame(
    min_max_scaler.fit_transform(df_train_feature),
    columns=df_train_feature.columns,
    index=df_train_feature.index,
)
df_valid_feature_scaled = pd.DataFrame(
    min_max_scaler.transform(df_valid_feature),
    columns=df_valid_feature.columns,
    index=df_valid_feature.index,
)
df_test_feature_scaled = pd.DataFrame(
    min_max_scaler.transform(df_test_feature),
    columns=df_test_feature.columns,
    index=df_test_feature.index,
)

In [38]:
# Second random-forest configuration: 100 depth-limited gini trees that
# consider all features at each split, with a fixed random state.
rf2 = RandomForestClassifier(n_estimators=100, criterion='gini',
                             max_depth=10, random_state=0, max_features=None)

# Candidate classifiers as (display name, estimator) pairs, in evaluation order.
models = [
    ("LogisticRegression", LogisticRegression()),
    ("SVC", SVC()),
    ("LinearSVC", LinearSVC()),
    ("KNeighbors", KNeighborsClassifier()),
    ("DecisionTree", DecisionTreeClassifier()),
    ("RandomForest", RandomForestClassifier()),
    ("RandomForest2", rf2),
    ("MLPClassifier", MLPClassifier(solver='lbfgs', random_state=0)),
]

In [39]:
# Score every candidate with 3-fold cross-validation on the scaled training
# features, keeping the per-fold scores next to the model name.
names, results = [], []
for name, model in models:
    result = cross_val_score(model, df_train_feature_scaled, df_train_target, cv=3)
    names.append(name)
    results.append(result)

# Report the mean cross-validation score of each model.
for name, result in zip(names, results):
    print(name, result.mean())

LogisticRegression 0.5615754194132748
SVC 0.5619672881457146
LinearSVC 0.5620651545543526
KNeighbors 0.5278741325693361
DecisionTree 0.5328696959012258
RandomForest 0.5388457372599905
RandomForest2 0.5503081424218547
MLPClassifier 0.5636320536299858
