In [1]:
import pandas as pd

# Load the ad-click dataset from a CSV file given by a relative path.
df = pd.read_csv('ad_click.csv')

In [2]:
# Summarise column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB


In [3]:
# Summary statistics for the numeric columns (id, age, click).
df.describe()

Unnamed: 0,id,age,click
count,10000.0,5234.0,10000.0
mean,5060.2114,40.197363,0.65
std,2861.758265,13.12642,0.476993
min,5.0,18.0,0.0
25%,2529.0,29.0,0.0
50%,5218.0,39.5,1.0
75%,7466.0,52.0,1.0
max,10000.0,64.0,1.0


In [4]:
# Cast the low-cardinality text columns to pandas' categorical dtype.
categorical_columns = (
    'gender',
    'device_type',
    'ad_position',
    'browsing_history',
    'time_of_day',
)
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype('category')

In [5]:
# Display the frame after the dtype conversion (pandas truncates the middle rows).
df

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0
...,...,...,...,...,...,...,...,...,...
9995,8510,User8510,,,Mobile,Top,Education,,0
9996,7843,User7843,,Female,Desktop,Bottom,Entertainment,,0
9997,3914,User3914,,Male,Mobile,Side,,Morning,0
9998,7924,User7924,,,Desktop,,Shopping,Morning,1


I am not sure whether dropping the rows with null values will give better results for predicting ad clicks, or whether it is better to keep them. I therefore create two datasets, one with the null values and one without, so that both can be tested.

In [6]:
# Keep only the rows that have no missing value in any column
# (dropna() with its default arguments drops a row if any field is null).
df_no_nulls = df.dropna()

In [7]:
# Without any nulls, the dataset has 816 records.
# NOTE(review): id 5912 (index 9952 in this output) also appears at index 2 of
# the full frame with device_type missing, so the data appears to contain
# repeated ids; worth checking df['id'].duplicated() before modelling.
df_no_nulls

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
17,188,User188,56.0,Female,Tablet,Bottom,News,Morning,1
25,4890,User4890,43.0,Male,Tablet,Bottom,Education,Afternoon,1
33,4985,User4985,37.0,Male,Mobile,Top,News,Evening,0
52,9888,User9888,49.0,Male,Mobile,Top,News,Morning,1
102,8201,User8201,59.0,Female,Desktop,Bottom,Social Media,Morning,0
...,...,...,...,...,...,...,...,...,...
9951,7268,User7268,28.0,Female,Desktop,Bottom,News,Evening,1
9952,5912,User5912,41.0,Non-Binary,Mobile,Side,Education,Night,1
9960,9638,User9638,64.0,Non-Binary,Desktop,Top,Entertainment,Morning,0
9986,5574,User5574,52.0,Female,Desktop,Bottom,Shopping,Afternoon,1


In [8]:
# Write both variants (with and without null rows) to CSV.
# NOTE(review): to_csv writes the index as an unnamed first column by default,
# and CSV does not store the category dtype, so the category conversions have
# to be re-applied after reloading. Confirm downstream readers account for this.
df.to_csv('ad_click_cleaned.csv')
df_no_nulls.to_csv('ad_click_no_nulls_cleaned.csv')

In [9]:
# Split the rows by outcome: ad clicked (click == 1) versus not clicked (click == 0).
click_df = df.loc[df['click'].eq(1)]
not_click_df = df.loc[df['click'].eq(0)]

In [10]:
# Gender mix (in %) for all rows versus rows where the ad was clicked.
# Missing genders are excluded by value_counts, so each list sums to 100.
print(
    'overall:',
    df[['gender']].value_counts(normalize=True) * 100,
    '',
    'Clicked on ad:',
    click_df[['gender']].value_counts(normalize=True) * 100,
    sep='\n',
)

overall:
gender    
Female        34.558131
Male          34.105898
Non-Binary    31.335971
Name: proportion, dtype: float64

Clicked on ad:
gender    
Female        34.966657
Male          34.531748
Non-Binary    30.501595
Name: proportion, dtype: float64


In [11]:
# Device-type mix (in %) for all rows versus rows where the ad was clicked.
# Missing device types are excluded by value_counts, so each list sums to 100.
print(
    'overall:',
    df[['device_type']].value_counts(normalize=True) * 100,
    '',
    'Clicked on ad:',
    click_df[['device_type']].value_counts(normalize=True) * 100,
    sep='\n',
)

overall:
device_type
Desktop        34.4250
Mobile         33.1125
Tablet         32.4625
Name: proportion, dtype: float64

Clicked on ad:
device_type
Desktop        35.240848
Tablet         32.427746
Mobile         32.331407
Name: proportion, dtype: float64


In [12]:
# Ad-position mix (in %) for all rows versus rows where the ad was clicked.
# Missing positions are excluded by value_counts, so each list sums to 100.
print(
    'overall:',
    df[['ad_position']].value_counts(normalize=True) * 100,
    '',
    'Clicked on ad:',
    click_df[['ad_position']].value_counts(normalize=True) * 100,
    sep='\n',
)

overall:
ad_position
Bottom         35.2125
Top            32.4625
Side           32.3250
Name: proportion, dtype: float64

Clicked on ad:
ad_position
Bottom         37.102338
Top            31.602146
Side           31.295516
Name: proportion, dtype: float64


In [13]:
# Browsing-history mix (in %) for all rows versus rows where the ad was clicked.
# Missing histories are excluded by value_counts, so each list sums to 100.
print(
    'overall:',
    df[['browsing_history']].value_counts(normalize=True) * 100,
    '',
    'Clicked on ad:',
    click_df[['browsing_history']].value_counts(normalize=True) * 100,
    sep='\n',
)

overall:
browsing_history
Entertainment       22.518206
Social Media        20.199310
Education           19.720199
Shopping            18.857800
News                18.704484
Name: proportion, dtype: float64

Clicked on ad:
browsing_history
Entertainment       24.220624
Social Media        20.083933
Education           19.364508
Shopping            19.034772
News                17.296163
Name: proportion, dtype: float64


In [14]:
# Time-of-day mix (in %) for all rows versus rows where the ad was clicked.
# Missing times are excluded by value_counts, so each list sums to 100.
print(
    'overall:',
    df[['time_of_day']].value_counts(normalize=True) * 100,
    '',
    'Clicked on ad:',
    click_df[['time_of_day']].value_counts(normalize=True) * 100,
    sep='\n',
)

overall:
time_of_day
Morning        26.575
Afternoon      25.200
Evening        24.475
Night          23.750
Name: proportion, dtype: float64

Clicked on ad:
time_of_day
Morning        27.108896
Afternoon      26.495399
Evening        23.638804
Night          22.756902
Name: proportion, dtype: float64


## Chi-Square

I want to test whether any of the categorical columns are significantly associated with whether a user clicks on the ad. I will use the chi-square test of independence.

In [15]:
# Cross-tabulate each categorical column against `click` to get, per category,
# the number of rows where the ad was not clicked (0) and was clicked (1).
# Rows where the column is missing are left out of the counts, so each table
# only covers the rows with a known value for that column.

ad_position_counts = pd.crosstab(df['ad_position'], df['click'])
gender_counts = pd.crosstab(df['gender'], df['click'])
device_type_counts = pd.crosstab(df['device_type'], df['click'])
browsing_history_counts = pd.crosstab(df['browsing_history'], df['click'])
time_of_day_counts = pd.crosstab(df['time_of_day'], df['click'])

In [16]:
# Observed not-clicked (0) / clicked (1) counts for each ad position.
ad_position_counts

click,0,1
ad_position,Unnamed: 1_level_1,Unnamed: 2_level_1
Bottom,881,1936
Side,953,1633
Top,948,1649


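Chi-square statistic: This measures how far the observed counts deviate from the counts we would expect if clicking on the ad were completely independent of the variable we are looking into. A higher value means stronger evidence against independence. Note that the statistic also grows with the sample size and the number of categories, so it should be read together with the degrees of freedom and the p-value rather than as a direct measure of how important the variable is.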

P-value: The significance level is 0.05. If the p-value is less than this, we reject the null hypothesis that the variable and ad clicks are independent, which means the variable is significantly associated with whether an ad is clicked.

In [17]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between ad position and click.
chi2, p, dof, expected = chi2_contingency(ad_position_counts)

# Report the statistic, p-value, degrees of freedom and the counts expected
# under independence.
print(
    f"Chi-Square Statistic: {chi2}",
    f"P-Value: {p}",
    f"Degrees of Freedom: {dof}",
    "Expected Frequencies:",
    expected,
    sep="\n",
)

Chi-Square Statistic: 23.56024306000087
P-Value: 7.65522964467335e-06
Degrees of Freedom: 2
Expected Frequencies:
[[ 979.61175 1837.38825]
 [ 899.2815  1686.7185 ]
 [ 903.10675 1693.89325]]


The statistic is quite high, so the observed results are quite different from the results we would expect if ad position and ad clicks were independent of each other.
The p-value is less than 0.05, so ad position is significant in determining whether an ad was clicked.

Overall, ad position is significant in determining whether an ad is clicked or not.

In [18]:
# Chi-square test of independence between gender and click.
chi2, p, dof, expected = chi2_contingency(gender_counts)

# Report the statistic, p-value, degrees of freedom and the counts expected
# under independence.
print(
    f"Chi-Square Statistic: {chi2}",
    f"P-Value: {p}",
    f"Degrees of Freedom: {dof}",
    "Expected Frequencies:",
    expected,
    sep="\n",
)

Chi-Square Statistic: 3.18823440942699
P-Value: 0.20308773428446925
Degrees of Freedom: 2
Expected Frequencies:
[[ 642.09006972 1191.90993028]
 [ 633.68758244 1176.31241756]
 [ 582.22234784 1080.77765216]]


In [19]:
# Chi-square test of independence between device type and click.
chi2, p, dof, expected = chi2_contingency(device_type_counts)

# Report the statistic, p-value, degrees of freedom and the counts expected
# under independence.
print(
    f"Chi-Square Statistic: {chi2}",
    f"P-Value: {p}",
    f"Degrees of Freedom: {dof}",
    "Expected Frequencies:",
    expected,
    sep="\n",
)

Chi-Square Statistic: 5.58488201060546
P-Value: 0.061271467246694744
Degrees of Freedom: 2
Expected Frequencies:
[[ 967.3425  1786.6575 ]
 [ 930.46125 1718.53875]
 [ 912.19625 1684.80375]]


In [20]:
# Chi-square test of independence between browsing history and click.
chi2, p, dof, expected = chi2_contingency(browsing_history_counts)

# Report the statistic, p-value, degrees of freedom and the counts expected
# under independence.
print(
    f"Chi-Square Statistic: {chi2}",
    f"P-Value: {p}",
    f"Degrees of Freedom: {dof}",
    "Expected Frequencies:",
    expected,
    sep="\n",
)

Chi-Square Statistic: 22.52012529260599
P-Value: 0.0001578736377495927
Degrees of Freedom: 4
Expected Frequencies:
[[371.13415102 657.86584898]
 [423.79264086 751.20735914]
 [352.01839785 623.98160215]
 [354.90379456 629.09620544]
 [380.15101571 673.84898429]]


In [21]:
# Chi-square test of independence between time of day and click.
chi2, p, dof, expected = chi2_contingency(time_of_day_counts)

# Report the statistic, p-value, degrees of freedom and the counts expected
# under independence.
print(
    f"Chi-Square Statistic: {chi2}",
    f"P-Value: {p}",
    f"Degrees of Freedom: {dof}",
    "Expected Frequencies:",
    expected,
    sep="\n",
)

Chi-Square Statistic: 22.09465022539211
P-Value: 6.23389067217068e-05
Degrees of Freedom: 3
Expected Frequencies:
[[ 701.568 1314.432]
 [ 681.384 1276.616]
 [ 739.848 1386.152]
 [ 661.2   1238.8  ]]


From the chi-square statistics and p-values, the variables that are actually significant in determining whether an ad is clicked or not are:
- Ad position
- Browsing history 
- Time of day

We can exclude device type and gender, since their p-values (about 0.061 and 0.203) are above the 0.05 significance level.

Now that we know which variables are relevant, we can use them to build a model that attempts to predict whether users will click on an ad.