In [None]:
import pandas as pd
import numpy as np
# Load the raw status-change export. Later cells read its 'contact_email' and
# 'group_concat(status_to)' columns.
data = pd.read_csv('status_changes.csv')

In [None]:
# Clean the contact addresses and derive the username/hostname features.
# Rows without an address cannot be encoded, so they are dropped first.
data = data.dropna(subset=['contact_email'])

# A field may hold several addresses separated by ',' or ';' -- keep only the
# first one, then normalise whitespace and case so that duplicates collapse.
first_email = (
    data['contact_email']
    .str.split(',', n=1).str[0]
    .str.split(';', n=1).str[0]
    .str.strip()
    .str.lower()
)
data = data.assign(contact_email=first_email).drop_duplicates(subset=['contact_email'])

# partition() always yields three columns (before, separator, after), so this
# keeps working even when no address in the frame contains an '@'
# (split(..., expand=True) would return a single column and the assignment
# to two columns would raise).
email_parts = data['contact_email'].str.partition('@')
data = data.assign(
    username=email_parts[0],
    # Leave the hostname missing when the address had no '@' at all.
    hostname=email_parts[2].where(email_parts[1] == '@'),
)

del first_email, email_parts


In [None]:
# Derive the binary training label from the comma-separated status history.
# NOTE(review): the column is named 'free_trial', but the rule below flags rows
# whose history contains status 17 or 2 (described as "paying customer" by the
# original author) -- confirm which meaning is intended.
PAID_STATUS_CODES = {'17', '2'}

# Missing histories become an empty string so they produce label 0 instead of
# raising a TypeError on `'17' in nan`.
status_history = data['group_concat(status_to)'].fillna('').str.split(',')
data['free_trial'] = status_history.apply(
    lambda codes: int(any(code.strip() in PAID_STATUS_CODES for code in codes))
)

# The label is stored as 0/1 directly, so no frame-wide True/False replacement
# is needed (that would also rewrite unrelated columns).
# The raw history and the address are not needed once the label and the
# username/hostname columns exist.
data = data.drop(columns=['group_concat(status_to)', 'contact_email'])

del status_history

In [None]:
# After reviewing the data it is fine to truncate both fields to 23 characters.
MAX_FIELD_LENGTH = 23
data['username'] = data['username'].str[:MAX_FIELD_LENGTH]
data['hostname'] = data['hostname'].str[:MAX_FIELD_LENGTH]


def encode_characters(value):
    """Map a string to a list of integer character codes (ordinal encoding).

    ASCII letters map to 1-26 (case-insensitive), numeric characters map to 27
    and every other character (punctuation, non-ASCII letters, ...) maps to 0.

    Parameters
    ----------
    value : str or missing value
        The text to encode.

    Returns
    -------
    list of int
        One code per character; an empty list for a missing value, so that a
        missing field is not encoded as the letters of the string 'nan'.
    """
    if pd.isna(value):
        return []
    codes = []
    for char in str(value):
        if char.isascii() and char.isalpha():
            # ord('a') == 97, so 'a' -> 1 ... 'z' -> 26.
            codes.append(ord(char.lower()) - 96)
        elif char.isnumeric():
            codes.append(27)
        else:
            codes.append(0)
    return codes


data['username_encoded'] = data['username'].apply(encode_characters)
data['hostname_encoded'] = data['hostname'].apply(encode_characters)



In [None]:
# Number of character positions kept per field; each becomes one feature column.
ENCODED_WIDTH = 23


def expand_encoded(encoded, prefix, width=ENCODED_WIDTH):
    """Spread a Series of per-character code lists into one column per position.

    Parameters
    ----------
    encoded : pandas.Series
        Each element is a list of integer codes (lists may differ in length).
    prefix : str
        Column name prefix; columns are named '<prefix>_1' ... '<prefix>_<width>'.
    width : int
        Number of columns to produce.

    Returns
    -------
    pandas.DataFrame
        Exactly `width` columns on a fresh 0..n-1 index. Positions past the end
        of a list are NaN; positions past `width` are dropped.
    """
    # reindex() guarantees `width` columns even when the longest list in the
    # data is shorter, where passing a fixed list of column names would raise.
    frame = pd.DataFrame(encoded.tolist()).reindex(columns=range(width))
    frame.columns = [f'{prefix}_{position}' for position in range(1, width + 1)]
    return frame


# transform the encoded lists into columns of their own
hostname_columns = expand_encoded(data['hostname_encoded'], 'hostname_encoded')
username_columns = expand_encoded(data['username_encoded'], 'username_encoded')

# merge all the columns together; padding positions become 0
X = pd.concat([hostname_columns, username_columns], axis=1).fillna(0)
y = data['free_trial']

assert len(X) == len(data)

In [None]:
# The per-field frames have already been merged into X; drop the extra names.
del hostname_columns, username_columns

# Standardise the feature matrix; X becomes the array returned by the scaler.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split performed in a later cell -- confirm this is acceptable.
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Small feed-forward binary classifier. The input width is read from the
# feature matrix rather than hard-coded, so it cannot drift out of sync with
# the number of encoded columns.
n_features = X.shape[1]

model = Sequential([
    Dense(64, input_dim=n_features, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),  # single output in [0, 1]
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed seed keeps the same rows in
# the held-out set every time the notebook is re-run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)


In [None]:
from keras.callbacks import TensorBoard
import time

# Make sure both feature matrices are plain NumPy arrays before training.
X_train, X_test = np.array(X_train), np.array(X_test)

# Each run writes its TensorBoard logs to a directory named after the current
# timestamp.
log_dir = f"./logs/{time.time()}"
tensorboard = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=True,
    write_images=False,
)

model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    callbacks=[tensorboard],
)

# Element 1 of the evaluation result is reported as the accuracy.
evaluation = model.evaluate(X_test, y_test)
accuracy = evaluation[1]
print(f"Test Accuracy: {accuracy}")

In [None]:
# Score every row of the full feature matrix (not just the held-out split) and
# store the model output next to the original records.
predicted = model.predict(X)
data['prediction'] = predicted
del predicted

In [None]:
# Overall mean prediction, shown with two decimal places.
global_mean = data['prediction'].mean()
print(f"Global Prediction: {global_mean:.2f}")

# Shared formatter: show the mean prediction with two decimal places.
two_decimals = {'prediction_mean': '{:.2f}'.format}

# Per-hostname mean prediction and row count, most frequent hostnames first.
stats = (
    data.groupby(['hostname'])
    .agg(prediction_mean=('prediction', 'mean'), count=('prediction', 'count'))
    .sort_values(by=['count'], ascending=False)
)

# The 20 most and the 20 least frequent hostnames.
print(stats.head(20).to_string(formatters=two_decimals))
print(stats.tail(20).to_string(formatters=two_decimals))

# Among the 100 most frequent hostnames, rank by mean prediction and print the
# 20 highest.
top_100 = stats.head(100).sort_values(by=['prediction_mean'], ascending=False)
print(top_100.head(20).to_string(formatters=two_decimals))

