### CS4622 Mini Kaggle Competition - Classification
#### Author: Sabine Hollatz

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn import preprocessing, metrics
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
# %tensorflow_version 2.x
import tensorflow as tf

import seaborn as sns



ModuleNotFoundError: No module named 'tensorflow'

### Loading the Data and Preprocessing

In [None]:
# Mount Google Drive at /content/drive/ so the CSV files read in the following cells
# are reachable. Relies on the google.colab package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Read the labelled training set from the mounted Drive folder and preview it.
train_csv_path = '/content/drive/My Drive/Colab Notebooks/data/stock_XY_train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Remove the 'Unnamed: 0', 'Ticker' and 'Yr' columns so they are not carried into the features.
df_train = df_train.drop(columns=['Unnamed: 0', 'Ticker', 'Yr'])
df_train.head()

In [None]:
# Read the unlabelled test features from the mounted Drive folder and preview them.
val_csv_path = '/content/drive/My Drive/Colab Notebooks/data/stock_X_test.csv'
df_val = pd.read_csv(val_csv_path)
df_val.head()

In [None]:
# Remove the two leftover 'Unnamed' columns plus 'Ticker' and 'Yr' from the test frame.
df_val = df_val.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Ticker', 'Yr'])
df_val.head()

In [None]:
# Show the dtype pandas inferred for every column of the training frame.
df_train.dtypes

In [None]:
# Store the qualitative 'Sector' column with pandas' categorical dtype.
df_train = df_train.astype({'Sector': 'category'})

In [None]:
# Re-display the dtypes to confirm 'Sector' is now categorical.
df_train.dtypes

In [None]:
# Count the missing values in each column of the training frame.
df_train.isna().sum()

In [None]:
# Separate the response ('Buy') from the predictors (every other column).
y_train = df_train['Buy']
X_train = df_train.drop(columns='Buy')

In [None]:
# Impute missing values in the quantitative features with each feature's own mean.
# The qualitative feature 'Sector' has no missing values and is left untouched.
X_train_quant = X_train.drop(columns='Sector')
columns_quant = X_train_quant.columns.tolist()

# fillna() with a Series of column means fills each column with its own mean.
quant_means = X_train[columns_quant].mean()
X_train[columns_quant] = X_train[columns_quant].fillna(quant_means)

In [None]:
# Per-column count of values that are still missing after the mean imputation.
X_train.isna().sum()

In [None]:
# isnull() is pandas' alias for isna(), so this repeats the check from the previous cell.
X_train.isnull().sum()

In [None]:
# fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,10))
# ax.boxplot(X_train[columns_quant[:56]])
# # ax[1].boxplot(X_train[columns_quant[56:111]], labels=columns_quant[56:111])
# # ax[2].boxplot(X_train[columns_quant[111:167]], labels=columns_quant[111:167])
# # ax[3].boxplot(X_train[columns_quant[167:]], labels=columns_quant[167:])
# plt.xticks(rotation=90);

In [None]:
def outlierCheck(df, attr):
    """Split the rows of ``df`` into inliers and outliers of column ``attr``.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of the column. Values that compare
    False against both fences (including NaN) are counted as inliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    attr : str
        Name of the numeric column to check.

    Returns
    -------
    inlier_indices : list
        Index labels of the rows inside the fences, in row order.
    outlier_indices : list
        Index labels of the rows outside the fences, in row order.
    outlier_values : list
        Column values of the outlier rows, aligned with ``outlier_indices``.
    """
    column = df[attr]
    q1 = column.quantile(.25)
    q3 = column.quantile(.75)
    iqr = q3 - q1

    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    # Vectorised comparison instead of a Python-level iterrows() loop: same
    # result, but without building a Series object for every single row.
    outlier_mask = ((column > upper_bound) | (column < lower_bound)).to_numpy()

    outlier_indices = column.index[outlier_mask].tolist()
    outlier_values = column.to_numpy()[outlier_mask].tolist()
    inlier_indices = column.index[~outlier_mask].tolist()
    return inlier_indices, outlier_indices, outlier_values

In [None]:
# outliers = []
# for feature in columns_quant:
#     inlier, outlier, _ = outlierCheck(X_train, feature)
#     print("In train_X feature {} are {} outliers".format(feature, len(outlier)))
# print("")

In [None]:
# normalize quantitative features
# MinMaxScaler rescales every column independently, so a single fit over all
# quantitative columns yields the same values as fitting column by column.
# Fitting once also leaves `min_max_scaler` fitted on every feature (instead of
# only the last column of the loop), so it can be reused to transform other
# data with the training minima and maxima.
min_max_scaler = preprocessing.MinMaxScaler()
X_train_scaled = X_train.copy(deep=True)
X_train_scaled[columns_quant] = min_max_scaler.fit_transform(X_train[columns_quant])
X_train_scaled.head()

### Training Binary Classifiers

#### Stochastic Gradient Descent classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent on the unscaled
# quantitative features; random_state is fixed so the run is repeatable.
sgd_clf = SGDClassifier(random_state=7).fit(X_train[columns_quant], y_train)
sgd_clf

In [None]:
# 5-fold cross-validated accuracy of the SGD classifier on the unscaled quantitative features.
cross_val_score(sgd_clf, X_train[columns_quant], y_train, cv=5, scoring="accuracy")

In [None]:
# Out-of-fold predictions from 5-fold cross-validation, summarised as a confusion matrix.
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train[columns_quant], y=y_train, cv=5)
metrics.confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
from sklearn.linear_model import SGDClassifier
# Same SGD classifier as before, now trained on the min-max scaled features.
sgd_clf = SGDClassifier(random_state=7).fit(X_train_scaled[columns_quant], y_train)
sgd_clf

In [None]:
# 5-fold cross-validated accuracy of the SGD classifier on the min-max scaled features.
cross_val_score(sgd_clf, X_train_scaled[columns_quant], y_train, cv=5, scoring="accuracy")

In [None]:
# Logistic regression with built-in cross-validated regularisation search,
# fitted on the scaled features; max_iter is raised for the lbfgs solver.
log_reg = linear_model.LogisticRegressionCV(solver='lbfgs', max_iter=10000)
log_reg.fit(X_train_scaled[columns_quant], y_train)

# Out-of-fold predictions from 5-fold cross-validation, summarised as a confusion matrix.
y_hat_log_reg = cross_val_predict(estimator=log_reg, X=X_train_scaled[columns_quant], y=y_train, cv=5)
metrics.confusion_matrix(y_true=y_train, y_pred=y_hat_log_reg)

In [None]:
# 5-fold cross-validated accuracy of the logistic regression on the min-max scaled features.
cross_val_score(log_reg, X_train_scaled[columns_quant], y_train, cv=5, scoring="accuracy")

In [None]:
# Display the installed TensorFlow version (requires the `import tensorflow as tf` in the first cell to have succeeded).
tf.__version__

In [None]:
# Fully connected binary classifier: seven hidden ReLU layers of 512 units
# followed by one sigmoid output unit.
# The input width is taken from the feature list the model is trained on,
# instead of a hard-coded 221, so it stays correct if the columns change.
n_features = len(columns_quant)

model_01 = tf.keras.Sequential()
model_01.add(tf.keras.layers.Dense(512, activation=tf.nn.relu, input_shape=[n_features]))
for _ in range(6):  # the remaining six hidden layers are identical
    model_01.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
model_01.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Adam with a small learning rate; binary cross-entropy matches the single sigmoid output.
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
model_01.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
model_01.summary()

In [None]:
# Train on the min-max scaled features: model_01 is evaluated on X_train_scaled
# afterwards, so fitting on the unscaled frame would not be comparable.
model_01.fit(X_train_scaled[columns_quant], y_train, epochs=100)

In [None]:
y_hat_model_01 = cross_val_predict(model_01, X_train_scaled[columns_quant], y_train, cv=5)
metrics.confusion_matrix(y_train, y_hat_log_reg)

In [None]:
cross_val_score(model_01, X_train_scaled[columns_quant], y_train, cv=5, scoring="accuracy")