In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from scipy.stats import chi2_contingency

import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical

# Silence library warnings so the cell output stays readable.
import warnings
warnings.filterwarnings(action='ignore')
print(25 * '-')

# Fraction of rows held out by each train_test_split call in the notebook.
TEST_SIZE = 0.2

In [2]:
# Load the dataset: the target column 'sii' is renamed to 'label' and rows
# without a label are discarded.
train_df = (
    pd.read_csv('datasets/InternetAddiction.csv')
    .rename(columns={'sii': 'label'})
    .dropna(subset=['label'])
)

# Hold out the test set first, then carve a validation set out of what remains.
# `train` (train + validation together) is kept for the final retraining.
train, test_df = train_test_split(train_df, test_size=TEST_SIZE, random_state=42)
train_df, val_df = train_test_split(train, test_size=TEST_SIZE, random_state=42)

# Separate the features from the integer-cast target for every split.
train_x, train_y = train_df.drop(columns=['label']), train_df['label'].astype(int)

val_x, val_y = val_df.drop(columns=['label']), val_df['label'].astype(int)

test_x, test_y = test_df.drop(columns=['label']), test_df['label'].astype(int)

In [4]:
def nan_values(dataset):
    """Drop the 'id' column and impute missing values column by column.

    Object and bool columns are filled with their most frequent value; every
    other column is filled with its mean. A column that contains only missing
    values has no mode/mean to fall back on and is returned unchanged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains an 'id' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'id' and with the missing values imputed.

    Raises
    ------
    KeyError
        If `dataset` has no 'id' column.
    """
    dataset = dataset.drop(columns=['id'])
    for col in dataset.columns:
        column = dataset[col]
        if not column.isnull().any():
            continue
        if column.dtype == 'object' or column.dtype == 'bool':
            modes = column.mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to impute with.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.mean()
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `dataset[col]`: the in-place form writes to a temporary object and
        # is a silent no-op under pandas Copy-on-Write.
        dataset[col] = column.fillna(fill_value)
    return dataset

def fs(dataset, dataset_y, str_path='objects/features_internet.npy', threshold=0.1):
    """Select the features whose correlation with the label is strong enough.

    The selected feature names are cached at `str_path`. When that file already
    exists it is reused as-is and `dataset_y` is ignored, so delete the file to
    force a new selection after the data changes.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric feature frame. It is not modified.
    dataset_y : array-like or pandas.Series
        One label per row; only used when the cache file is missing.
    str_path : str, optional
        Location of the cached feature-name array.
    threshold : float, optional
        Minimum absolute correlation with the label for a feature to be kept.

    Returns
    -------
    pandas.DataFrame
        `dataset` restricted to the selected feature columns.
    """
    if not os.path.exists(str_path):
        # Work on a copy so the caller's frame does not silently gain a
        # 'label' column.
        labelled = dataset.assign(label=dataset_y)
        label_corr = labelled.corr()['label']
        features = label_corr[label_corr.abs() >= threshold].index
        features = features.drop('label', errors='ignore')
        print(label_corr.sort_values(ascending=False))
        print(len(features))
        # np.save does not create missing parent directories.
        cache_dir = os.path.dirname(str_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        np.save(str_path, features)
    # NOTE(review): allow_pickle=True executes pickled content on load; only
    # point str_path at files this notebook wrote itself.
    features = np.load(str_path, allow_pickle=True)
    return dataset[features]

def encode(dataset, dataset_y):
    """Fit the preprocessing on the training data and apply it.

    Steps: impute missing values (`nan_values`), standard-scale the non-object
    columns, ordinal-encode the object columns, then keep only the selected
    features (`fs`).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw training features, including the 'id' column.
    dataset_y : array-like or pandas.Series
        One label per row, forwarded to `fs` for feature selection.

    Returns
    -------
    tuple
        `(features, scaler, encoder, fill_dict)`: the processed frame cast to
        float, the fitted StandardScaler and OrdinalEncoder, and the per-column
        fill values (mean for non-object columns, mode for object columns) to
        reuse when imputing other splits.
    """
    dataset = nan_values(dataset)

    # Computed after imputation; kept so validation/test data are filled with
    # training-set statistics rather than their own.
    fill_dict = {}
    for col in dataset.columns:
        if dataset[col].dtype != 'object':
            fill_dict[col] = dataset[col].mean()
        else:
            fill_dict[col] = dataset[col].mode()[0]

    cat_cols = dataset.select_dtypes(include='object').columns
    num_cols = dataset.select_dtypes(exclude='object').columns

    scaler = StandardScaler()
    dataset[num_cols] = scaler.fit_transform(dataset[num_cols])

    # Categories that only occur in validation/test data are encoded as -1
    # instead of raising at transform time.
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    dataset[cat_cols] = encoder.fit_transform(dataset[cat_cols])

    dataset = fs(dataset, dataset_y)

    return dataset.astype(float), scaler, encoder, fill_dict




In [5]:
def preproc_test_val(dataset, scaler, encoder, mean, features_path='objects/features_internet.npy'):
    """Apply the already-fitted preprocessing to a validation or test split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw features, including the 'id' column. It is not modified.
    scaler : object
        Fitted transformer applied to the non-object columns via `transform`.
    encoder : object
        Fitted transformer applied to the object columns via `transform`.
    mean : dict
        Column name -> value used to fill that column's missing entries.
    features_path : str, optional
        Location of the saved array of selected feature names.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns, cast to float.

    Raises
    ------
    KeyError
        If 'id' or any key of `mean` is not a column of `dataset`.
    """
    dataset = dataset.drop(columns=['id'])
    for key, fill_value in mean.items():
        # Assign the result back: fillna(inplace=True) on `dataset[key]` writes
        # to a temporary object and is a silent no-op under pandas
        # Copy-on-Write.
        dataset[key] = dataset[key].fillna(fill_value)

    cat_cols = dataset.select_dtypes(include='object').columns
    num_cols = dataset.select_dtypes(exclude='object').columns
    dataset[num_cols] = scaler.transform(dataset[num_cols])
    dataset[cat_cols] = encoder.transform(dataset[cat_cols])

    # NOTE(review): allow_pickle=True executes pickled content on load; only
    # point features_path at files this notebook wrote itself.
    features = np.load(features_path, allow_pickle=True)
    return dataset[features].astype(float)

In [None]:
# Fit the preprocessing on the training split only, then reuse the fitted
# scaler, encoder and fill values for the validation and test splits.
train_x, scaler, encoder, mean = encode(dataset=train_x, dataset_y=train_y)
val_x = preproc_test_val(dataset=val_x, scaler=scaler, encoder=encoder, mean=mean)
test_x = preproc_test_val(dataset=test_x, scaler=scaler, encoder=encoder, mean=mean)

In [None]:
# Training hyper-parameters.
EPOCHS = 1000
BATCH_SIZE = 512
DROPOUT = 0.2
classes = 4

# One-hot encode the targets. num_classes is pinned so every split gets
# `classes` columns even when the highest class is absent from one of them;
# without it the width is inferred from the largest label in each array.
train_y = to_categorical(train_y, num_classes=classes)
val_y = to_categorical(val_y, num_classes=classes)
test_y = to_categorical(test_y, num_classes=classes)

# Seed NumPy and TensorFlow for repeatable runs.
np.random.seed(42)
tf.random.set_seed(42)
def model_fn():
    """Build the (uncompiled) dense classifier and print its summary.

    Reads the module-level `train_x` (for the input width), `DROPOUT` and
    `classes` at call time.

    Returns
    -------
    The Sequential model: Dense(112) -> Dropout -> Dense(272) -> Dropout ->
    Dense(classes, softmax).
    """
    n_features = train_x.shape[-1]
    # NOTE(review): softmax as a hidden-layer activation is unusual — confirm
    # it is intentional (e.g. the outcome of a hyper-parameter search).
    stack = [
        Dense(112, activation='softmax', input_dim=n_features),
        Dropout(DROPOUT),
        Dense(272, activation='tanh'),
        Dropout(DROPOUT),
        Dense(classes, kernel_initializer='normal', activation='softmax'),
    ]
    model = Sequential(stack)

    model.summary()
    return model

model = model_fn()

# Learning-rate schedule: exponential decay from the initial to the final rate
# over DECAY_EPOCHS epochs, stepping once per epoch (staircase).
# NOTE(review): the schedule keeps decaying past DECAY_EPOCHS, so the rate
# drops below final_learning_rate if training runs longer — confirm intended.
initial_learning_rate = 0.01
final_learning_rate = 0.0001
DECAY_EPOCHS = 100
learning_rate_decay_factor = (final_learning_rate / initial_learning_rate)**(1/DECAY_EPOCHS)
# fit() runs ceil(n_samples / batch_size) optimizer steps per epoch (the last
# batch may be partial). Flooring made the decay fire more often than once per
# epoch and produced decay_steps=0 for datasets smaller than one batch.
steps_per_epoch = max(1, int(np.ceil(train_x.shape[0] / BATCH_SIZE)))

lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=steps_per_epoch,
    decay_rate=learning_rate_decay_factor,
    staircase=True)
model.compile(optimizer = keras.optimizers.Adam(learning_rate=lr_schedule), loss = 'categorical_crossentropy', metrics = ['accuracy'])

# Stop once val_loss (the default monitored quantity) has not improved by at
# least min_delta for 20 epochs, and roll back to the best weights.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=20, min_delta=0.0001, restore_best_weights=True)

history = model.fit(train_x, train_y,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    callbacks=[early_stopping_cb],
                    validation_data=(val_x, val_y))

# Epoch budget for the final retraining: the epoch with the lowest validation
# loss. len(history.history['loss']) would also count the `patience` epochs
# that ran after the best one and whose weights were discarded.
EPOCHS = int(np.argmin(history.history['val_loss'])) + 1

In [None]:
# Training curves: validation vs. training accuracy (left) and loss (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
for ax, (val_key, train_key) in zip(axs, (('val_accuracy', 'accuracy'), ('val_loss', 'loss'))):
    ax.plot(history.history[val_key])
    ax.plot(history.history[train_key])
    ax.legend([val_key, train_key])
    # Label every panel so the figure can be read on its own.
    ax.set_title(train_key + ' per epoch')
    ax.set_xlabel('epoch')
    ax.set_ylabel(train_key)
fig.tight_layout()

In [None]:
# Loss and accuracy of the early-stopped model on the held-out test split.
score = model.evaluate(x=test_x, y=test_y)

In [None]:
# Rebuild the inputs from the raw splits for the final retraining: `train` is
# the training and validation data together.
train_x = train.drop(columns=['label'])
train_labels = train['label'].astype(int)

test_x = test_df.drop(columns=['label'])
test_labels = test_df['label'].astype(int)

# encode() forwards the labels to the correlation-based feature selection,
# which needs one integer label per row. Passing the one-hot matrix only
# worked while the cached feature file already existed.
train_x, scaler, encoder, mean = encode(train_x, train_labels)
test_x = preproc_test_val(test_x, scaler, encoder, mean)

# num_classes is pinned so both arrays have `classes` columns even if a class
# is missing from one split.
train_y = to_categorical(train_labels, num_classes=classes)
test_y = to_categorical(test_labels, num_classes=classes)

In [None]:
# Retrain a fresh model on the full training data for the tuned number of epochs.
model = model_fn()

# The earlier schedule's decay_steps was computed for the smaller training
# split. Rebuild it for the current train_x so the learning rate still decays
# once per epoch.
retrain_steps_per_epoch = max(1, int(np.ceil(train_x.shape[0] / BATCH_SIZE)))
retrain_lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=retrain_steps_per_epoch,
    decay_rate=learning_rate_decay_factor,
    staircase=True)

model.compile(optimizer = keras.optimizers.Adam(learning_rate=retrain_lr_schedule), loss = 'categorical_crossentropy', metrics = ['accuracy'])
history = model.fit(train_x, train_y,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE)

score = model.evaluate(test_x, test_y)

In [None]:
# Loss and accuracy of the retrained model on the held-out test split.
score = model.evaluate(x=test_x, y=test_y)

In [None]:
# Write the preprocessed splits to CSV with their labels re-attached
# (pd.concat aligns the feature frame and the label column on the row index).
train_df = pd.concat(objs=[train_x, train['label']], axis='columns')
train_df.to_csv(path_or_buf='datasets/train_internet.csv', index=False)
test_df = pd.concat(objs=[test_x, test_df['label']], axis='columns')
test_df.to_csv(path_or_buf='datasets/test_internet.csv', index=False)

# Store the trained model and, separately, only its weights.
model.save('models/internet_model')
model.save_weights('models/internet_weights')