# Data Transformation
**input:** .csv files directly exported from the [CanYouReadIt?](https://canyoureadit.com) project's database tables at **4/7/2022.**
**output:** single .csv file ready for EDA (Exploratory Data Analysis)

*Note:* With each request the user is presented with a random image, chosen from the /img directory. They can decide to skip the image (-1 response value), or rate its readability the image. The rating can be yes (+1 response value) or no (0 response value).

In [1]:
try:
    import pandas as pd
    print(f"Pandas version: {pd.__version__} found!")
except ModuleNotFoundError:
    print("Install Pandas with: conda install pandas")
import numpy as np
from constants import (TIMESTAMP, IMAGES, DESKTOP, MOBILE, SAVEPATH, T)

Pandas version: 1.4.1 found!


In [2]:
desktop_df = pd.read_csv(DESKTOP,
                 sep=',',
                 encoding='utf-8',
                 parse_dates = ["create_dt"])
desktop_df['mobile_user']=False
desktop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35598 entries, 0 to 35597
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           35598 non-null  int64         
 1   image_id     35598 non-null  int64         
 2   response     35598 non-null  int64         
 3   create_dt    35598 non-null  datetime64[ns]
 4   session_id   35598 non-null  object        
 5   mobile_user  35598 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(3), object(1)
memory usage: 1.4+ MB


In [3]:
mobile_df = pd.read_csv(MOBILE,
                 sep=',',
                 encoding='utf-8',
                 parse_dates = ["create_dt"])
mobile_df['mobile_user']=True
mobile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15566 entries, 0 to 15565
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           15566 non-null  int64         
 1   image_id     15566 non-null  int64         
 2   response     15566 non-null  int64         
 3   create_dt    15566 non-null  datetime64[ns]
 4   session_id   15566 non-null  object        
 5   mobile_user  15566 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(3), object(1)
memory usage: 623.4+ KB


In [4]:
df = pd.concat([desktop_df, mobile_df], axis=0)
df = df.drop(columns=['id']).sort_values(by=['create_dt'])
df

Unnamed: 0,image_id,response,create_dt,session_id,mobile_user
0,6,1,2021-04-20 20:23:08,c9e86836-ad35-4e16-a4f7-8b273b496570,False
1,4,1,2021-04-21 11:33:17,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
2,9,1,2021-04-21 11:33:19,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
3,5,1,2021-04-21 12:42:57,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
4,11,1,2021-04-21 12:43:07,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
...,...,...,...,...,...
35593,341,1,2022-04-07 16:12:55,87488d1c-6943-4a29-9999-1344e416bef8,False
35594,412,1,2022-04-07 16:13:06,87488d1c-6943-4a29-9999-1344e416bef8,False
35595,475,1,2022-04-07 16:13:10,87488d1c-6943-4a29-9999-1344e416bef8,False
35596,226,1,2022-04-07 16:13:49,ae352032-0334-4b74-8d9f-f77231c1bf5b,False


In [5]:
images_df = pd.read_csv(IMAGES,
                 sep=',',
                 encoding='utf-8')
images_df['is_mobile'] = images_df['is_mobile'].astype(bool)
images_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                548 non-null    int64 
 1   url               349 non-null    object
 2   pending_approval  548 non-null    int64 
 3   is_mobile         548 non-null    bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 13.5+ KB


In [6]:
print(len(images_df[images_df['is_mobile'] == True]), 'mobile images out of', len(images_df))

185 mobile images out of 548


In [7]:
images_dict = pd.Series(images_df.is_mobile.values,index=images_df.id).to_dict()

In [8]:
df['mobile_image'] = df['image_id']
df['mobile_image'] = df['mobile_image'].map(images_dict)

In [9]:
df[(df['mobile_user'] == True) & (df['mobile_image'] == False)]

Unnamed: 0,image_id,response,create_dt,session_id,mobile_user,mobile_image
0,11,-1,2021-04-21 19:45:05,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,True,False
1,8,1,2021-04-22 08:06:01,66924c76-e246-4fd4-9342-e5ab2b83df6e,True,False
2,9,1,2021-04-22 10:04:21,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,True,False
3,11,0,2021-04-22 10:06:49,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,True,False
4,10,1,2021-04-23 12:32:05,54db6e88-9735-43c3-9575-25146e96ce31,True,False
...,...,...,...,...,...,...
13486,11,0,2022-03-02 02:56:35,9475cf86-fdae-4673-a80d-fb38453bccb2,True,False
13514,11,-1,2022-03-02 18:35:46,8f4733ef-740c-43e8-ae76-a1f09a94b31e,True,False
13629,11,1,2022-03-04 17:21:44,d0f36f32-111b-4076-8942-c67755fe1679,True,False
13915,11,0,2022-03-09 18:32:23,02c46bcc-4fd7-4c08-810c-f30367ab121b,True,False


In [10]:
df[(df['mobile_user'] == False) & (df['mobile_image']== True)]

Unnamed: 0,image_id,response,create_dt,session_id,mobile_user,mobile_image
71,32,1,2021-04-26 13:30:23,cc681fa6-f357-4b9c-a268-a986ac9960f1,False,True
161,93,1,2021-04-28 15:34:32,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,False,True


# Data cleansing

In [11]:
df.drop(df[(df['mobile_user'] == False) & (df['mobile_image']== True)].index, inplace=True)
errormessage = '''We expect to have zero mobile images for desktop users.
    If you see this message, please check the data and re-run the notebook.'''
assert len(df[(df['mobile_user'] == False) & (df['mobile_image']== True)])==0, errormessage

# Session ID factorization

In [12]:
df['session_id'] = pd.factorize(df['session_id'])[0]
print('Number of unique sessions:',len(df['session_id'].unique()))

Number of unique sessions: 46680


# Convert **Response** column to human readable format
With each request the user is presented with a random image, chosen from the /img directory. They can decide to skip the image (-1 response value), or rate its readability the image. The rating can be yes (+1 response value) or no (0 response value).

In [13]:
df.loc[df['response'] < -1, 'response'] = -1
errormessage = '''We expect to have answers of either -1, 0 or 1.
    If you see this message, please check the data.'''
assert len(df['response'].unique()) < 4, errormessage

In [14]:
df['response_yes'] = np.where(df['response'] == 1, True, False)
df['response_no'] = np.where(df['response'] == 0, True, False)
df['response_skip'] = np.where(df['response'] == -1, True, False)

In [15]:
df.drop(columns=['response'], inplace=True)
df

Unnamed: 0,image_id,create_dt,session_id,mobile_user,mobile_image,response_yes,response_no,response_skip
0,6,2021-04-20 20:23:08,0,False,False,True,False,False
1,4,2021-04-21 11:33:17,1,False,False,True,False,False
2,9,2021-04-21 11:33:19,1,False,False,True,False,False
3,5,2021-04-21 12:42:57,1,False,False,True,False,False
4,11,2021-04-21 12:43:07,1,False,False,True,False,False
...,...,...,...,...,...,...,...,...
35593,341,2022-04-07 16:12:55,46678,False,False,True,False,False
35594,412,2022-04-07 16:13:06,46678,False,False,True,False,False
35595,475,2022-04-07 16:13:10,46678,False,False,True,False,False
35596,226,2022-04-07 16:13:49,46679,False,False,True,False,False


# Save the results to a single .csv in data

In [16]:
df.to_csv(SAVEPATH, sep=',')
print(f"{T.G}Results saved to: {SAVEPATH}{T.E}")

[92mResults saved to: data/canyoureadit.csv[0m
