In [6]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.utils import class_weight
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix 
print("TensorFlow version:", tf.__version__)
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from datetime import datetime
from sklearn.metrics import accuracy_score, classification_report
import keras
import gc
import warnings
warnings.filterwarnings('ignore')

TensorFlow version: 2.12.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8491533649543682592
xla_global_id: -1
]


In [7]:
!python --version

Python 3.11.5


In [8]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

RANDOM_SEED = 42

In [9]:
tf.test.is_built_with_cuda()

False

In [10]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print('GPU name: ', tf.config.experimental.list_physical_devices('GPU'))

Num GPUs Available:  0
GPU name:  []


In [11]:
BYTES_TO_MB_DIV = 0.000001
def print_memory_usage_of_data_frame(df):
        mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
        print("Memory usage is " + str(mem) + " MB")

In [12]:
def nan_click_viewed__deleting(X, y = None):
    X = X[X['click'].isna() == False]
    X = X[X['view'] != 0]
    return X 

In [13]:
def dub_dropping(X, y = None):
    X =  X.drop_duplicates()                              
    return X

In [14]:
def paused_status_dropping(X, y = None):
    X.drop(X.loc[X['status'] == 'paused'].index, inplace=True)
    return X

In [15]:
def place_number_decrease(X, y = None):
    X = X[X['place_number'] > 0]
    X['place_number'] = X['place_number'] - 1
    return X

In [16]:
def nan_filling(X, y = None):
    nan_cols = {}
    for col in X.columns:
        nan_count = X[col].isna().sum()
        if nan_count != 0:
            per = np.round((nan_count / df.shape[0]) * 100, 2)
            nan_cols[col] = per
                       
    for col, per in nan_cols.items():
        if per < 5:
            X = X[X[col].isna() == False]
        else:
            X[col] = X[col].fillna('unknown')
            
    return X

In [17]:
df_clickhouse = pd.read_csv('../internship/clickhouse.csv')
df_clickhouse.shape

(5889167, 39)

In [18]:
req_df_columns = df_clickhouse.columns.drop('click')
prep_tools_dict = {'req_df_columns' : req_df_columns}

In [19]:
df_creatives = pd.read_csv('../internship/creatives.csv')
df_creatives.shape

(132, 15)

In [20]:
df_creatives = df_creatives.rename(columns={'id': 'creative_id'})

In [21]:
df = pd.merge(df_clickhouse, df_creatives, on="creative_id")
df.shape

(5889167, 53)

In [22]:
df = paused_status_dropping(df)

In [23]:
df = nan_filling(dub_dropping(nan_click_viewed__deleting(df)))
df.shape

(2797034, 53)

In [24]:
df = place_number_decrease(df)
df.shape

(2793412, 53)

In [25]:
df.head()

Unnamed: 0,auction_date_time,impression_hash,ssp,auction_id,impression_id,bid_id,auction_type,bid_floor,bid_price,loss_reason,is_win,pay_price,is_pay,view,place_number,click,ssp_user_id,creative_id,campaign_id_x,stream_id_x,link_id,format,device,OS,browser,geo_country,geo_city,ip_v4,ip_v6,site_id,tag_id,iab_category_x,ab_test,enter_utm_source,enter_utm_campaign,enter_utm_medium,enter_utm_content,enter_utm_term,event_date_time,status,is_deleted,campaign_id_y,user_id,stream_id_y,theme,second_theme,iab_category_y,image,image_extension,mime_type,image_tag,created_at,updated_at
0,2024-01-10 15:15:34,3595129331408507136,7,cmf8kpg76rmanjkfhlh0,mobicbottomx2002_1,3Y1MGDSF4DZ45TYK3J8BXRZH,fst_price,0.01,107056393,10,1,107056,1,1,1,0,cmf8km876rmanjkfhgn0:cmf8km876rmanjkfhgmg,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,551487,85.249.31.254,unknown,01gm531atx,mobicbottomx2002,[],automl,rtb_24smi,86,bidml,7f73d0f3b178,13926_25077,2024-01-10 15:15:45,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
4,2024-01-15 17:14:51,-3336184151420619993,7,cmijrmo76rmanjm1rqd0,ou9j4n9sqtfe8400_0,3YEQAEAH9SCKP3FHT4DH38SE,fst_price,0.01,172606393,6,1,172606,1,1,0,0,cmijrkg76rmanjm1rmqg:cmijrkg76rmanjm1rmq0,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,524901,188.94.32.123,unknown,01gm531atx,ou9j4n9sqtfe8400,[],automl,rtb_24smi,86,bidml,8d65123ad67e,15314_15024,2024-01-15 17:15:25,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
8,2024-01-14 13:50:40,-4845328304293519401,7,cmhrp0076rmanjhro4cg,my0fq2bg0twts35c_3,3YBS7VCA4NRNFNQ7HXA8DC89,fst_price,0.01,55438822,2,1,55438,1,1,3,0,cmhrp0076rmanjhro45g:cmhrp0076rmanjhro450,6b6df5b1f938,77,111,4362,native,2,Win10,Yandex Browser,RU,581049,95.53.35.121,unknown,01hew0ev25,my0fq2bg0twts35c,[],automl,rtb_24smi,86,bidml,04376a608224,30347_27769,2024-01-14 13:50:42,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
9,2024-01-18 12:20:31,1927312752573112664,7,cmkeqno76rmanjnmjdgg,mobicbottomx2002_1,3YNXNNJC7HM8GFKZXE92T412,fst_price,0.01,76715517,10,1,76715,1,1,1,0,cmkeqi876rmanjnmj5u0:cmkeqi876rmanjnmj5tg,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,2022890,80.83.239.41,unknown,01gm531atx,mobicbottomx2002,[],automl,rtb_24smi,86,bidml,7f73d0f3b178,13926_25077,2024-01-18 12:21:00,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02
10,2024-01-10 01:18:04,-4294347528276567097,7,cmesc7076rmanjikaivg,ou9j4n9sqtfe8400_1,3Y04JY55N0MZ3C2BXAES4C2W,fst_price,0.01,79799215,2,1,79799,1,1,1,1,cmesc5g76rmanjikafdg:cmesc5g76rmanjikafd0,6b6df5b1f938,77,111,4362,native,4,Android,Chrome,RU,498817,31.134.189.196,unknown,01gm531atx,ou9j4n9sqtfe8400,[],automl,rtb_24smi,86,bidml,9b209a09a68a,13926_25077,2024-01-10 01:25:11,active,0,77,9,111,default,default,IAB7,b22f3655f7be6cd9d37e9b44,jpg,image/jpeg,default,2023-10-11 12:32:35,2023-11-08 13:23:02


In [26]:
df_train, df_test = train_test_split(df, stratify = df['click'], test_size=0.2, random_state=RANDOM_SEED)

X_train = df_train.drop('click', axis = 1)
y_train = df_train['click']

X_test = df_test.drop('click', axis = 1)
y_test = df_test['click']

In [27]:
X_train = df.drop('click', axis = 1)
y_train = df['click']

In [28]:
def column_deleting(X, y = None):
    columns_for_drop = ['creative_id', 'auction_date_time', 'impression_hash', 'ssp', 'auction_id', 'impression_id', 'bid_id', 'auction_type', 'bid_floor', 'bid_price', 'is_win', 'pay_price', 'is_pay', 'view', 'ssp_user_id', 'campaign_id_x', 'stream_id_x', 'link_id', 'format', 'ip_v4', 'ip_v6', 'site_id', 'iab_category_x', 'ab_test', 'enter_utm_campaign', 'status', 'is_deleted', 'campaign_id_y', 'user_id', 'stream_id_y', 'image', 'created_at', 'updated_at']
    X = X.drop(columns_for_drop, axis = 1)
    X.reset_index(drop=True, inplace=True)
    return X

In [29]:
def feature_reduction(X, y = None):
    X['browser'] = X['browser'].apply(lambda x : 'Chrome' if x == 'Google' else x)
    X['browser'] = X['browser'].apply(lambda x : 'Yandex Browser' if x == 'Yandex' else x)
    X.reset_index(drop=True, inplace=True)
    return X

In [30]:
def date_time_col_transformation(X, y = None):
    X['req_date_time'] = pd.to_datetime(X['event_date_time'], format='%Y-%m-%d %H:%M:%S')
    X['day'] =  X['req_date_time'].dt.day
    X['month'] =  X['req_date_time'].dt.month
    X['year'] =  X['req_date_time'].dt.year
    X['hour'] =  X['req_date_time'].dt.hour
    X['day_of_week'] = X['event_date_time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").weekday())
    X['is_holyday'] = X['day_of_week'].apply(lambda x : 1 if x in [5 , 6] else 0)
    X['is_friday'] = X['day_of_week'].apply(lambda x : 1 if x == 4 else 0)
    X['is_monday'] = X['day_of_week'].apply(lambda x : 1 if x == 0 else 0)
    X['day_part'] = X['hour'].apply(lambda x : 'Night' if x >= 22 and x < 5 else ('Morning' if x >= 5 and x < 12 else ('Day' if x >= 12 and x < 17 else 'Evening')))
    X['year_part'] = X['month'].apply(lambda x : 'Winter' if x in [12, 1, 2] else ('Spring' if x in [3, 4, 5] else ('Summer' if x in [6, 7, 8] else 'Autumn')))
    X = X.drop(['event_date_time', 'req_date_time', 'year'], axis = 1)
    X.reset_index(drop=True, inplace=True)
    return X

In [31]:
preprocessor = Pipeline(
    steps=
    [
        ('column_deleting', FunctionTransformer(func=column_deleting)),
        ('feature_reduction', FunctionTransformer(func=feature_reduction)),
        ('date_time_col_transformation', FunctionTransformer(func=date_time_col_transformation)),
        #('onehot', OneHotEncoder(sparse = True, handle_unknown='ignore'))
        
    ]
)

In [32]:
X_train_prep = preprocessor.fit_transform(X_train).A
X_train_prep.shape

AttributeError: 'DataFrame' object has no attribute 'A'

In [None]:
X_train_prep.head()

In [25]:
y_train.shape

(2793412,)

In [26]:
prep_tools_dict['preprocessor'] = preprocessor

In [27]:
X_test_prep = preprocessor.transform(X_test).A
#X_test_prep = preprocessor.transform(X_test)
X_test_prep.shape

NameError: name 'X_test' is not defined

In [44]:
y_test.shape

(558683,)

In [106]:
class_weights = class_weight.compute_class_weight(
    class_weight = "balanced",
                                        classes = np.unique(y_train),
                                        y = y_train  
   )
class_weights = dict(enumerate(class_weights))

In [29]:
prep_tools_dict['model_metadata'] = 'Модель: tf, ROC_AUC: 0.6238609122646263'

In [30]:
import dill

In [31]:
with open('../pickles/prep_tools_dict_keras.pkl', 'wb') as f:
    dill.Pickler(f, recurse=True).dump(prep_tools_dict)

In [32]:
from tensorflow.keras.models import Sequential, model_from_yaml
from tensorflow.keras.layers import Dense

In [33]:
n_features_in_ = 100
X_shape_ = X_train_prep.shape
n_classes_ = len(y_train.unique())

In [34]:
model = Sequential()
model.add(Dense(n_features_in_, input_dim=X_shape_[1], activation='relu'))
model.add(Dense(128, activation='relu'))
#model.add(Dense(n_classes_, activation='softmax'))
model.add(Dense(n_classes_, activation='sigmoid'))

In [35]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

In [36]:
model.compile(loss=loss_fn, optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy']) #tf.keras.metrics.AUC

# Fit the model
hist = model.fit(X_train_prep, y_train,  epochs=10, batch_size=1024, verbose=0) #validation_data=(X_test, y_test),

In [56]:
predict_prob=model.predict([X_test_prep])

predict_classes=np.argmax(predict_prob, axis=1)



In [57]:
predict_prob

array([[0.8877628 , 0.06889197],
       [0.840201  , 0.11178938],
       [0.83560264, 0.12438573],
       ...,
       [0.893299  , 0.06780455],
       [0.95242804, 0.02863356],
       [0.90292054, 0.08608793]], dtype=float32)

In [28]:
0.8877628 + 0.06889197

0.95665477

In [58]:
predict_classes

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [59]:
def get_predict_prob(number):
  return [number[0],1-number[0]]

y_prob = np.array(list(map(get_predict_prob, predict_prob)))
y_prob

array([[0.88776278, 0.11223722],
       [0.84020102, 0.15979898],
       [0.83560264, 0.16439736],
       ...,
       [0.89329898, 0.10670102],
       [0.95242804, 0.04757196],
       [0.90292054, 0.09707946]])

In [146]:
roc_auc_test = roc_auc_score(y_test, predict_prob[:, 1])
        
print(f'Модель: {type(model).__name__}\nROC_AUC: {roc_auc_test}\n')

Модель: Sequential
ROC_AUC: 0.6238609122646263



In [161]:
print("Classification Report:\n", classification_report(y_test, predict_classes))

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98    542785
           1       0.08      0.01      0.01     15898

    accuracy                           0.97    558683
   macro avg       0.53      0.50      0.50    558683
weighted avg       0.95      0.97      0.96    558683



In [128]:
print(hist.history) 

{'loss': [0.12604697048664093, 0.1240595206618309, 0.12347030639648438, 0.12287656962871552, 0.1222110167145729, 0.12154526263475418, 0.12081686407327652, 0.12010344862937927, 0.1193896234035492, 0.11861502379179001, 0.11794382333755493, 0.11728426814079285, 0.11669963598251343, 0.11613663285970688, 0.11565133929252625, 0.11512080579996109, 0.11470918357372284, 0.11424527317285538, 0.11387033015489578, 0.11345171183347702, 0.11313983798027039, 0.1128397285938263, 0.11245504021644592, 0.11216623336076736, 0.11188262701034546, 0.11159098148345947, 0.11135032773017883, 0.1110314279794693, 0.11079353094100952, 0.11059063673019409, 0.11031700670719147, 0.11010891199111938, 0.10992804169654846, 0.10966244339942932, 0.10946425795555115, 0.10928695648908615, 0.10909567028284073, 0.1089644804596901, 0.1087542399764061, 0.10860811918973923, 0.10838786512613297, 0.10825825482606888, 0.10816805064678192, 0.10802344977855682, 0.10783310979604721, 0.10770755261182785, 0.10758669674396515, 0.10744882

In [129]:
best_score = max(hist.history['accuracy'])

print(best_score)

0.9735302329063416


In [37]:
# serialize model to JSON
model_json = model.to_json()
with open("../pickles/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("../pickles/model.h5")
print("Saved model to disk")

Saved model to disk
