In [None]:
import pandas as pd
import numpy as np
# Load the raw export; later cells read its 'contact_email' and
# 'group_concat(status_to)' columns.
data = pd.read_csv('status_changes.csv')

In [None]:

# Keep only the first address when several are comma-separated, lower-cased
# so that duplicates differing only in case collapse below.
data['contact_email'] = data['contact_email'].str.split(',', expand=True, n=1)[0].str.lower()

# Drop exact duplicate rows, then renumber the index. Without the reset the
# index keeps gaps, and frames built positionally later in the notebook
# (from `.values.tolist()`) would no longer line up with `data` when they are
# concatenated on the index.
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)

# contact_email is already lower case, so no second .str.lower() is needed.
# Split once on '@': text before it is the username, the rest is the hostname.
data[['username','hostname']] = data['contact_email'].str.split('@', expand=True, n=1)

# Split once on the first '.': 'domain' is the first label and 'tld' is
# everything after it (e.g. 'co.uk' for 'example.co.uk').
data[['domain', 'tld']] = data['hostname'].str.split('.', expand=True, n=1)

In [None]:
# A hostname/username seen fewer than this many times is flagged as "unique".
RARE_THRESHOLD = 10

# Parse the comma-separated status history into a list of code strings per row.
# Rows with a missing history stay NaN (a float), hence the guard below.
status_lists = data['group_concat(status_to)'].str.split(',')


def _has_status(codes, wanted):
    """Return True when `wanted` is among the parsed status codes.

    A missing history (anything that is not a list) counts as False instead
    of raising TypeError on `wanted in nan`.
    """
    return isinstance(codes, list) and wanted in codes


# Status '17' sets the free_trial flag; status '2' sets the client flag.
data['free_trial'] = status_lists.apply(_has_status, wanted='17')
data['client'] = status_lists.apply(_has_status, wanted='2')

# Frequency-based rarity flags plus raw string lengths.
hostname_counts = data['hostname'].value_counts()
username_counts = data['username'].value_counts()
data['hostname_unique'] = data['hostname'].map(hostname_counts) < RARE_THRESHOLD
data['hostname_length'] = data['hostname'].str.len()
data['username_unique'] = data['username'].map(username_counts) < RARE_THRESHOLD
data['username_length'] = data['username'].str.len()
#data['username_non_alpha'] = data['username'].apply(lambda x: sum(1 for char in str(x) if not char.isalnum()))

# Cast only the boolean flags created above to 0/1 rather than running a
# frame-wide replace over every column (including the string columns).
for flag in ('free_trial', 'client', 'hostname_unique', 'username_unique'):
    data[flag] = data[flag].astype(int)

# Free the intermediates; nothing later in the notebook uses them.
del status_lists, hostname_counts, username_counts

In [None]:
def _encode_chars(value, length):
    """Ordinal-encode a string into a fixed-length list of integers.

    Letters a-z map to 1..26 and every other character (digits, punctuation,
    non-ASCII letters) maps to 30. The result is truncated to `length` and
    right-padded with 0 so every row has exactly `length` entries.

    The value is passed through `str()`, so a missing value is encoded as the
    text 'nan'; such rows are removed by the later `dropna()` on the feature
    frame, because the source text columns are still present there.
    """
    codes = [ord(ch) - 96 if 'a' <= ch <= 'z' else 30 for ch in str(value).lower()]
    return codes[:length] + [0] * (length - len(codes))


# Ordinal (not one-hot) character encoding: usernames and hostnames are capped
# at 20 characters, the tld at 3.
data['username_encoded'] = data['username'].apply(_encode_chars, length=20)
data['hostname_encoded'] = data['hostname'].apply(_encode_chars, length=20)
data['tld_encoded'] = data['tld'].apply(_encode_chars, length=3)

In [None]:
from sklearn.preprocessing import StandardScaler

# Created here, fitted in a later cell once the feature matrix is final.
scaler = StandardScaler()

X = data

encoded_names = ['hostname_encoded', 'username_encoded', 'tld_encoded']
numeric_columns = X.drop(encoded_names, axis=1)
categorical_columns = X[encoded_names]


def _expand_encoded(encoded, width):
    """Spread a column of `width`-long code lists into `<name>_1..<name>_width` columns.

    The result reuses the source index. A default RangeIndex would not match
    `data` once rows have been dropped, and `pd.concat(axis=1)` aligns on index
    labels, so features would be paired with the wrong rows or filled with NaN.
    """
    names = [f'{encoded.name}_{i}' for i in range(1, width + 1)]
    return pd.DataFrame(encoded.tolist(), index=encoded.index, columns=names)


# transform the encoded list columns into columns of their own
hostname_columns = _expand_encoded(categorical_columns['hostname_encoded'], 20)
username_columns = _expand_encoded(categorical_columns['username_encoded'], 20)
tld_columns = _expand_encoded(categorical_columns['tld_encoded'], 3)

# merge all the columns together (aligned row-for-row through the shared index)
X = pd.concat([numeric_columns, hostname_columns, username_columns, tld_columns], axis=1)

# Drop rows with any missing value (e.g. no hostname or no tld).
X = X.dropna()

# Target is the free_trial flag; remove it, the client flag and the raw text
# columns from the feature matrix.
y = X['free_trial']
X = X.drop(['username','hostname','domain','tld','contact_email', 'group_concat(status_to)', 'free_trial', 'client'], axis=1)

In [None]:

# Release the intermediate frames used to assemble X.
del numeric_columns, categorical_columns, hostname_columns, username_columns, tld_columns

# Standardise every feature column; X becomes the array returned by the scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# a later cell, so test rows influence the scaling statistics - confirm this is
# acceptable.
X = scaler.fit_transform(X)

# The fitted scaler is discarded here.
del scaler

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Size the input layer from the feature matrix instead of hard-coding 47, so
# the network still matches X if the CSV gains a column or an encoding width
# changes.
n_features = X.shape[1]

# One hidden ReLU layer followed by a single sigmoid unit for the binary target.
model = Sequential()
model.add(Dense(32, input_dim=n_features, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [None]:
from keras.callbacks import TensorBoard

# Make sure both feature splits are plain numpy arrays before training.
X_train, X_test = np.array(X_train), np.array(X_test)

# Log training to ./logs for TensorBoard (graph plus per-epoch histograms).
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=1,
    write_graph=True,
    write_images=False,
)

# Train for 10 epochs in mini-batches of 10 rows.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=10,
    callbacks=[tensorboard],
)

# evaluate() returns the loss first; index 1 is the metric that follows it
# (accuracy, given the compile call earlier in this notebook).
test_metrics = model.evaluate(X_test, y_test)
accuracy = test_metrics[1]
print(f"Test Accuracy: {accuracy}")