## Click-through rate Prediction
#### Predict whether a mobile ad will be clicked

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

pd.options.display.max_rows = None
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')

In [None]:
temp_data = pd.DataFrame()
path = r'datasets\Click-Through Rate Prediction.gz'
chunksize = 50000

for chunk in pd.read_csv(path, compression='gzip', chunksize=chunksize):
    temp_data = pd.concat([temp_data, chunk], ignore_index=True)
    if temp_data.shape[0] >=10000000:
        temp_data.to_csv("datasets/ctr-prediction_1000K.gz", header=True, index=False)
        break

print(temp_data.shape)

In [25]:
data = pd.read_csv(r"datasets/ctr-prediction_1000K.gz")
data.shape

(1000000, 24)

In [26]:
data['click'].value_counts()

0    839781
1    160219
Name: click, dtype: int64

In [27]:
# To get the balanced data from click features:

df = pd.concat([data[data['click']==0].sample(n=150000, random_state=42), 
                data[data['click']==1].sample(n=150000, random_state=42)], ignore_index=True)

print(df['click'].value_counts())

0    150000
1    150000
Name: click, dtype: int64


In [28]:
print(df.columns)
print("------------------------------------------------------")
print(df.dtypes)
print("------------------------------------------------------")
print(df.shape)

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')
------------------------------------------------------
id                  float64
click                 int64
hour                  int64
C1                    int64
banner_pos            int64
site_id              object
site_domain          object
site_category        object
app_id               object
app_domain           object
app_category         object
device_id            object
device_ip            object
device_model         object
device_type           int64
device_conn_type      int64
C14                   int64
C15                   int64
C16                   int64
C17                   int64
C18                   int64
C19                   int64
C20                

In [29]:
def detail_info(data):
    temp_df = pd.DataFrame(index= data.columns)
    
    temp_df['data_type'] = data.dtypes
    temp_df['unique_val'] = data.nunique()
    temp_df['duplicate_val'] = data.duplicated().sum()
    temp_df['missing_val'] = data.isnull().sum()
    temp_df['missing_val_%'] = round(data.isnull().mean()*100,2)
    
    return temp_df

detail_info(df)

Unnamed: 0,data_type,unique_val,duplicate_val,missing_val,missing_val_%
id,float64,300000,0,0,0.0
click,int64,2,0,0,0.0
hour,int64,6,0,0,0.0
C1,int64,7,0,0,0.0
banner_pos,int64,7,0,0,0.0
site_id,object,1554,0,0,0.0
site_domain,object,1388,0,0,0.0
site_category,object,20,0,0,0.0
app_id,object,1622,0,0,0.0
app_domain,object,111,0,0,0.0


In [30]:
df['hour'] = pd.to_datetime(df['hour'].astype(str), format='%y%m%d%H', errors='coerce')
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.566319e+19,0,2014-10-21 03:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,95b5e741,c6824def,cef3e649,a99f214a,72e604b9,894fb406,1,0,21611,320,50,2480,3,299,100111,61
1,1.160269e+19,0,2014-10-21 02:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,45aff1a2,5c620f04,d1327cf5,8aa3f9a9,8bcedcdc,7ac6007f,1,2,21647,320,50,2487,1,547,-1,51
2,8.972551e+18,0,2014-10-21 04:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,4b94f1c1,2347f47a,f95efa07,a99f214a,2d249883,77663e88,1,0,21665,320,50,2493,3,35,100160,117
3,3.866849e+18,0,2014-10-21 04:00:00,1005,1,5b4d2eda,16a36ef3,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,f8210e0c,8a4875bd,1,0,19950,320,50,1800,3,167,100075,23
4,1.841741e+19,0,2014-10-21 04:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,a7e9e8e3,4ea23a13,1,0,15703,320,50,1722,0,35,-1,79


In [31]:
# Lets find the day is weekday or weekend:

df['is_weekend'] = pd.to_datetime(df['hour']).dt.day_name()
df['is_weekend'] = df['is_weekend'].apply(lambda x: 1 if (x == "Saturday" or x == "Sunday") else 0)
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,is_weekend
0,1.566319e+19,0,2014-10-21 03:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,95b5e741,c6824def,cef3e649,a99f214a,72e604b9,894fb406,1,0,21611,320,50,2480,3,299,100111,61,0
1,1.160269e+19,0,2014-10-21 02:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,45aff1a2,5c620f04,d1327cf5,8aa3f9a9,8bcedcdc,7ac6007f,1,2,21647,320,50,2487,1,547,-1,51,0
2,8.972551e+18,0,2014-10-21 04:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,4b94f1c1,2347f47a,f95efa07,a99f214a,2d249883,77663e88,1,0,21665,320,50,2493,3,35,100160,117,0
3,3.866849e+18,0,2014-10-21 04:00:00,1005,1,5b4d2eda,16a36ef3,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,f8210e0c,8a4875bd,1,0,19950,320,50,1800,3,167,100075,23,0
4,1.841741e+19,0,2014-10-21 04:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,a7e9e8e3,4ea23a13,1,0,15703,320,50,1722,0,35,-1,79,0


In [32]:
df['hour_of_day'] = df['hour'].dt.hour
df['time_of_day'] = pd.cut(df['hour_of_day'], bins=[0, 6, 12, 18, 24], labels=['night', 'morning', 'afternoon', 'evening'])
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,is_weekend,hour_of_day,time_of_day
0,1.566319e+19,0,2014-10-21 03:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,95b5e741,c6824def,cef3e649,a99f214a,72e604b9,894fb406,1,0,21611,320,50,2480,3,299,100111,61,0,3,night
1,1.160269e+19,0,2014-10-21 02:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,45aff1a2,5c620f04,d1327cf5,8aa3f9a9,8bcedcdc,7ac6007f,1,2,21647,320,50,2487,1,547,-1,51,0,2,night
2,8.972551e+18,0,2014-10-21 04:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,4b94f1c1,2347f47a,f95efa07,a99f214a,2d249883,77663e88,1,0,21665,320,50,2493,3,35,100160,117,0,4,night
3,3.866849e+18,0,2014-10-21 04:00:00,1005,1,5b4d2eda,16a36ef3,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,f8210e0c,8a4875bd,1,0,19950,320,50,1800,3,167,100075,23,0,4,night
4,1.841741e+19,0,2014-10-21 04:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,a7e9e8e3,4ea23a13,1,0,15703,320,50,1722,0,35,-1,79,0,4,night


In [33]:
for feat in df.columns:
    if df[feat].nunique()>=200:
        df.drop([feat], axis=1, inplace=True)  

In [34]:
detail_info(df)

Unnamed: 0,data_type,unique_val,duplicate_val,missing_val,missing_val_%
click,int64,2,277918,0,0.0
hour,datetime64[ns],6,277918,0,0.0
C1,int64,7,277918,0,0.0
banner_pos,int64,7,277918,0,0.0
site_category,object,20,277918,0,0.0
app_domain,object,111,277918,0,0.0
app_category,object,23,277918,0,0.0
device_type,int64,4,277918,0,0.0
device_conn_type,int64,4,277918,0,0.0
C15,int64,8,277918,0,0.0


In [35]:
label_encoded_features = ['C1', 'site_category', 'app_category',  'C15', 'C16', 'time_of_day' ]

onehot_encoded_features = ['app_domain', 'app_category', 'C17','C19', 'C20', 'C21']

label_encoder = LabelEncoder()
df[label_encoded_features] = df[label_encoded_features].apply(label_encoder.fit_transform)
df.head()

onehot_encoder = OneHotEncoder(sparse=False, drop='first')
onehot_encoded = onehot_encoder.fit_transform(df[onehot_encoded_features])

onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names(onehot_encoded_features))
df.drop(onehot_encoded_features, axis=1, inplace=True)
df = pd.concat([df, onehot_encoded_df], axis=1)

df.drop(['hour','hour_of_day'], axis=1, inplace=True)

df.head()

Unnamed: 0,click,C1,banner_pos,site_category,device_type,device_conn_type,C15,C16,C18,is_weekend,time_of_day,app_domain_0654b444,app_domain_0b7d3d7d,app_domain_0d79ee56,app_domain_0e8616ad,app_domain_1438d51f,app_domain_15c23f8e,app_domain_15ec7f39,app_domain_18eb4e75,app_domain_1cb641ec,app_domain_1cbecd39,app_domain_1d5e09f4,app_domain_1ddc989f,app_domain_1ea19ec4,app_domain_1ed56ded,app_domain_2022d54e,app_domain_20ab8b07,app_domain_2347f47a,app_domain_24f896e0,app_domain_27ee373d,app_domain_298309ba,app_domain_2b627705,app_domain_2c1c31c6,app_domain_323f3fe5,app_domain_337b74ad,app_domain_33da2e74,app_domain_3feeed1e,app_domain_43cf4f06,app_domain_44324ff4,app_domain_449e219f,app_domain_45a51db4,app_domain_47464e95,app_domain_47db8711,app_domain_48aec236,app_domain_4e007635,app_domain_4fafd4c4,app_domain_52d64e90,app_domain_55240cf0,app_domain_56eabb45,app_domain_5ac0b939,app_domain_5b9c592b,app_domain_5bcedd7d,app_domain_5c5a694b,app_domain_5c620f04,app_domain_5daf29b2,app_domain_63f57be0,app_domain_64ae80a5,app_domain_6a0a3a9d,app_domain_6a90b0cb,app_domain_6bfb9168,app_domain_6cf43c3b,app_domain_6f7ca2ba,app_domain_700adbf0,app_domain_713c0c91,app_domain_73fc6786,app_domain_7801e8d9,app_domain_7b833eb9,app_domain_7bbb38df,app_domain_800100e0,app_domain_813f3323,app_domain_828da833,app_domain_82e27996,app_domain_86aa8fec,app_domain_88293ffa,app_domain_885c7f3f,app_domain_8d87821d,app_domain_90706f5d,app_domain_916026d9,app_domain_97efe5c6,app_domain_9830a8fb,app_domain_999f1fac,app_domain_9acfe436,app_domain_9ec164d3,app_domain_9ecca2dd,app_domain_a271c340,app_domain_ad63ec9b,app_domain_ae637522,app_domain_aefc06bd,app_domain_af201489,app_domain_afdf1f54,app_domain_b0920d40,app_domain_b12ff13e,app_domain_b1ab9955,app_domain_b2816726,app_domain_b398ab59,app_domain_b408d42a,app_domain_b5f3b24a,app_domain_b7af3e0a,app_domain_b8d325c3,app_domain_b9528b13,app_domain_ba275770,app_domain_c6824def,app_domain_c72257c6,app_domain_cb36afb8,app_domain_d18c63a1,app_domain_d3e7c965,app_domain_d6feb1a4,app_domain_d9b5648e,app_domain_db829551,app_domain_dcb74110,app_domain_df32afa9,app_domain_e25eea83,app_domain_e51135b7,app_domain_e5d5313f,app_domain_e787a6bc,app_domain_ef1fc174,app_domain_f2f777fb,app_domain_f3ad7798,app_domain_f5a7c834,app_domain_fc41b20c,app_domain_fd5f0ee2,app_category_1,app_category_2,app_category_3,app_category_4,app_category_5,app_category_6,app_category_7,app_category_8,app_category_9,app_category_10,app_category_11,app_category_12,app_category_13,app_category_14,app_category_15,app_category_16,app_category_17,app_category_18,app_category_19,app_category_20,app_category_21,app_category_22,C17_122,C17_153,C17_178,C17_196,C17_394,C17_423,C17_479,C17_544,C17_547,C17_549,C17_550,C17_571,C17_572,C17_576,C17_613,C17_686,C17_761,C17_768,C17_827,C17_898,C17_901,C17_906,C17_937,C17_1008,C17_1092,C17_1107,C17_1141,C17_1149,C17_1174,C17_1248,C17_1255,C17_1272,C17_1401,C17_1447,C17_1507,C17_1515,C17_1528,C17_1637,C17_1685,C17_1698,C17_1722,C17_1740,C17_1752,C17_1769,C17_1780,C17_1784,C17_1800,C17_1835,C17_1863,C17_1872,C17_1873,C17_1882,C17_1884,C17_1887,C17_1895,C17_1899,C17_1921,C17_1926,C17_1932,C17_1934,C17_1939,C17_1955,C17_1960,C17_1965,C17_1973,C17_1974,C17_1991,C17_1993,C17_1994,C17_1996,C17_2036,C17_2039,C17_2043,C17_2060,C17_2083,C17_2084,C17_2101,C17_2153,C17_2154,C17_2158,C17_2161,C17_2162,C17_2187,C17_2201,C17_2206,C17_2218,C17_2225,C17_2227,C17_2229,C17_2242,C17_2250,C17_2253,C17_2260,C17_2263,C17_2264,C17_2270,C17_2271,C17_2278,C17_2279,C17_2281,C17_2282,C17_2283,C17_2284,C17_2285,C17_2286,C17_2292,C17_2295,C17_2303,C17_2304,C17_2306,C17_2307,C17_2312,C17_2316,C17_2323,C17_2325,C17_2331,C17_2333,C17_2339,C17_2346,C17_2348,C17_2351,C17_2371,C17_2372,C17_2374,C17_2390,C17_2394,C17_2397,C17_2418,C17_2420,C17_2421,C17_2424,C17_2425,C17_2427,C17_2434,C17_2438,C17_2443,C17_2446,C17_2449,C17_2459,C17_2465,C17_2467,C17_2471,C17_2476,C17_2478,C17_2480,C17_2481,C17_2483,C17_2485,C17_2487,C17_2489,C17_2492,C17_2493,C17_2494,C17_2495,C17_2496,C17_2497,C19_35,C19_39,C19_41,C19_43,C19_47,C19_161,C19_163,C19_167,C19_169,C19_171,C19_175,C19_291,C19_295,C19_297,C19_299,C19_303,C19_423,C19_425,C19_427,C19_431,C19_547,C19_551,C19_559,C19_673,C19_679,C19_681,C19_683,C19_687,C19_801,C19_813,C19_943,C19_1063,C19_1071,C19_1315,C19_1319,C19_1327,C19_1451,C19_1711,C19_1835,C20_100000,C20_100001,C20_100002,C20_100003,C20_100004,C20_100005,C20_100010,C20_100012,C20_100013,C20_100016,C20_100019,C20_100020,C20_100021,C20_100022,C20_100024,C20_100025,C20_100026,C20_100028,C20_100029,C20_100031,C20_100032,C20_100033,C20_100034,C20_100037,C20_100039,C20_100040,C20_100041,C20_100043,C20_100046,C20_100048,C20_100049,C20_100050,C20_100051,C20_100052,C20_100053,C20_100054,C20_100055,C20_100056,C20_100057,C20_100058,C20_100060,C20_100061,C20_100062,C20_100063,C20_100064,C20_100065,C20_100068,C20_100070,C20_100071,C20_100072,C20_100073,C20_100074,C20_100075,C20_100076,C20_100077,C20_100079,C20_100081,C20_100083,C20_100084,C20_100086,C20_100087,C20_100088,C20_100090,C20_100091,C20_100093,C20_100094,C20_100095,C20_100096,C20_100097,C20_100099,C20_100101,C20_100103,C20_100105,C20_100106,C20_100107,C20_100108,C20_100109,C20_100111,C20_100112,C20_100113,C20_100114,C20_100117,C20_100119,C20_100121,C20_100122,C20_100123,C20_100124,C20_100126,C20_100128,C20_100130,C20_100131,C20_100133,C20_100135,C20_100137,C20_100138,C20_100139,C20_100141,C20_100143,C20_100144,C20_100148,C20_100149,C20_100150,C20_100151,C20_100152,C20_100155,C20_100156,C20_100160,C20_100161,C20_100162,C20_100163,C20_100165,C20_100166,C20_100168,C20_100170,C20_100171,C20_100172,C20_100173,C20_100176,C20_100177,C20_100178,C20_100179,C20_100181,C20_100182,C20_100183,C20_100185,C20_100186,C20_100188,C20_100189,C20_100190,C20_100191,C20_100192,C20_100193,C20_100194,C20_100195,C20_100199,C20_100200,C20_100202,C20_100205,C20_100206,C20_100210,C20_100212,C20_100213,C20_100215,C20_100217,C20_100221,C20_100224,C20_100225,C20_100228,C20_100229,C20_100233,C20_100241,C20_100244,C20_100248,C21_15,C21_16,C21_17,C21_20,C21_23,C21_32,C21_33,C21_42,C21_43,C21_46,C21_48,C21_51,C21_52,C21_61,C21_68,C21_69,C21_70,C21_71,C21_79,C21_82,C21_85,C21_91,C21_93,C21_94,C21_95,C21_100,C21_101,C21_111,C21_112,C21_116,C21_117,C21_156,C21_157,C21_195
0,0,2,0,5,1,0,3,2,3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,0,5,1,2,3,2,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,0,5,1,0,3,2,3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,2,1,18,1,0,3,2,3,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2,0,1,1,0,3,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
df.shape

(300000, 525)

In [37]:
X = df.drop("click", axis=1)
y = df["click"]

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

confusion_matrix(y_test, y_pred)

Accuracy: 0.6746333333333333


array([[17881, 12026],
       [ 7496, 22597]], dtype=int64)