In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
# Load the dataset
data = pd.read_csv('traffic.csv')

In [3]:
data.head()

Unnamed: 0,event,date,country,city,artist,album,track,isrc,linkid
0,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
1,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
2,click,2021-08-21,India,Ludhiana,Reyanna Maria,So Pretty,So Pretty,USUM72100871,23199824-9cf5-4b98-942a-34965c3b0cc2
3,click,2021-08-21,France,Unknown,"Simone & Simaria, Sebastian Yatra",No Llores Más,No Llores Más,BRUM72003904,35573248-4e49-47c7-af80-08a960fa74cd
4,click,2021-08-21,Maldives,Malé,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226278 entries, 0 to 226277
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   event    226278 non-null  object
 1   date     226278 non-null  object
 2   country  226267 non-null  object
 3   city     226267 non-null  object
 4   artist   226241 non-null  object
 5   album    226273 non-null  object
 6   track    226273 non-null  object
 7   isrc     219157 non-null  object
 8   linkid   226278 non-null  object
dtypes: object(9)
memory usage: 15.5+ MB


In [5]:
data.describe(include='all')

Unnamed: 0,event,date,country,city,artist,album,track,isrc,linkid
count,226278,226278,226267,226267,226241,226273,226273,219157,226278
unique,3,7,211,11993,2419,3254,3562,709,3839
top,pageview,2021-08-19,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
freq,142015,35361,47334,22791,40841,40841,40841,40841,40841


In [6]:
# Convert the 'date' column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Step 1: Total and Daily Pageview Events

# Total number of 'pageview' events
total_pageviews = data[data['event'] == 'pageview'].shape[0]

# Filter data for 'click' events
total_clicks = data[data['event'] == 'click'].shape[0]

# Calculate daily 'pageview' events
daily_pageviews = data[data['event'] == 'pageview'].groupby(data['date'].dt.date).size()
average_daily_pageviews = daily_pageviews.mean()

In [7]:
print("Total pageview events:", total_pageviews)
print("Total clicks:", total_clicks)
print("Average pageviews per day:", average_daily_pageviews)

Total pageview events: 142015
Total clicks: 55732
Average pageviews per day: 20287.85714285714


In [8]:
# Step 2: Analysis of Other Events
event_counts = data['event'].value_counts()
event_distribution = (event_counts / event_counts.sum()) * 100

In [9]:
print("\nEvent counts:\n", event_counts)
print("\nEvent distribution (%):\n", event_distribution)


Event counts:
 event
pageview    142015
click        55732
preview      28531
Name: count, dtype: int64

Event distribution (%):
 event
pageview    62.761294
click       24.629880
preview     12.608826
Name: count, dtype: float64


In [10]:
# Step 3: Geographical Distribution
country_pageviews = data[data['event'] == 'pageview']['country'].value_counts()

In [11]:
print("\nTop countries contributing to pageviews:\n", country_pageviews.head(10))


Top countries contributing to pageviews:
 country
Saudi Arabia            28873
India                   27286
United States           20839
France                   9674
Iraq                     4897
United Kingdom           3845
Pakistan                 3212
Germany                  3141
Turkey                   2462
United Arab Emirates     2335
Name: count, dtype: int64


In [12]:
# Step 4: Click-Through Rate (CTR) Analysis

# Calculate the overall CTR
overall_ctr = total_clicks / total_pageviews if total_pageviews > 0 else 0

# Calculate CTR variation across different links
ctr_per_link = data.groupby('linkid').apply(lambda x: x[x['event'] == 'click'].shape[0] / x[x['event'] == 'pageview'].shape[0] if x[x['event'] == 'pageview'].shape[0] > 0 else 0)

  ctr_per_link = data.groupby('linkid').apply(lambda x: x[x['event'] == 'click'].shape[0] / x[x['event'] == 'pageview'].shape[0] if x[x['event'] == 'pageview'].shape[0] > 0 else 0)


In [13]:
print("\nOverall CTR:", overall_ctr)


Overall CTR: 0.3924374185825441


In [14]:
# Step 5: Correlation Analysis

# Step 1: Filter relevant data for clicks and previews
# Calculate the total clicks and previews for each link
link_events = data[data['event'].isin(['click', 'preview'])]
clicks_per_link = link_events[link_events['event'] == 'click'].groupby('linkid').size()
previews_per_link = link_events[link_events['event'] == 'preview'].groupby('linkid').size()

# Merge the clicks and previews into a single DataFrame
correlation_data = pd.DataFrame({'clicks': clicks_per_link, 'previews': previews_per_link}).fillna(0)

# Step 2: Perform Pearson Correlation (Linear Relationship)
pearson_corr, pearson_p_value = stats.pearsonr(correlation_data['clicks'], correlation_data['previews'])

# Step 3: Perform Chi-square Test (Categorical Relationship)
# Convert the data into a contingency table for Chi-square test
contingency_table = pd.crosstab(correlation_data['clicks'] > 0, correlation_data['previews'] > 0)
chi2, chi2_p_value, dof, expected = stats.chi2_contingency(contingency_table)

# Display the results
print("Pearson Correlation Coefficient:", pearson_corr)
print("Pearson Correlation p-value:", pearson_p_value)
print("Chi-square Statistic:", chi2)
print("Chi-square p-value:", chi2_p_value)

Pearson Correlation Coefficient: 0.9886810823208105
Pearson Correlation p-value: 0.0
Chi-square Statistic: 109.07209677403016
Chi-square p-value: 1.5648907281614155e-25


## Conclusion
1. There is a very strong and statistically significant linear correlation between the number of clicks and previews on a link.
2. The categorical relationship between clicks and previews is also statistically significant, indicating a strong association.