# Data Transformation
**input:** .csv files directly exported from the [CanYouReadIt?](https://canyoureadit.com) project's database tables on **4/14/2022**.
**output:** single .csv file ready for EDA (Exploratory Data Analysis)

*Note:* With each request, the user is presented with a random image chosen from the /img directory. They can either skip the image (-1 response value) or rate its readability. The rating can be yes (+1 response value) or no (0 response value).

In [1]:
# pandas is used by every cell below, so a missing installation has to stop the
# notebook right here. Printing the hint and carrying on would only postpone the
# failure to a less helpful NameError on `pd` in the next cell.
try:
    import pandas as pd
except ModuleNotFoundError:
    print("Install Pandas with: conda install pandas")
    raise
else:
    print(f"Pandas version: {pd.__version__} found!")
import numpy as np
# Project-local settings: input/output paths and the `T` helper used when printing
# the final status message.
from constants import (TIMESTAMP, IMAGES, DESKTOP, MOBILE, SAVEPATH, T)

Pandas version: 1.4.1 found!


In [2]:
# Load the desktop responses export, parsing `create_dt` as a timestamp, and tag
# every row as coming from a desktop (non-mobile) user.
desktop_df = pd.read_csv(
    DESKTOP,
    sep=',',
    encoding='utf-8',
    parse_dates=["create_dt"],
).assign(mobile_user=False)
desktop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37453 entries, 0 to 37452
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           37453 non-null  int64         
 1   image_id     37453 non-null  int64         
 2   response     37453 non-null  int64         
 3   create_dt    37453 non-null  datetime64[ns]
 4   session_id   37453 non-null  object        
 5   mobile_user  37453 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(3), object(1)
memory usage: 1.5+ MB


In [3]:
# Load the mobile responses export, parsing `create_dt` as a timestamp, and tag
# every row as coming from a mobile user.
mobile_df = pd.read_csv(
    MOBILE,
    sep=',',
    encoding='utf-8',
    parse_dates=["create_dt"],
).assign(mobile_user=True)
mobile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16245 entries, 0 to 16244
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           16245 non-null  int64         
 1   image_id     16245 non-null  int64         
 2   response     16245 non-null  int64         
 3   create_dt    16245 non-null  datetime64[ns]
 4   session_id   16245 non-null  object        
 5   mobile_user  16245 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(3), object(1)
memory usage: 650.6+ KB


In [4]:
# Stack desktop and mobile responses into one frame, discard the per-table `id`
# column and order the rows chronologically.
# NOTE: both source frames keep their own 0-based index labels, so the index of
# `df` is not unique after this step.
df = (
    pd.concat([desktop_df, mobile_df], axis=0)
    .drop(columns=['id'])
    .sort_values(by=['create_dt'])
)
df

Unnamed: 0,image_id,response,create_dt,session_id,mobile_user
0,6,1,2021-04-20 20:23:08,c9e86836-ad35-4e16-a4f7-8b273b496570,False
1,4,1,2021-04-21 11:33:17,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
2,9,1,2021-04-21 11:33:19,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
3,5,1,2021-04-21 12:42:57,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
4,11,1,2021-04-21 12:43:07,cc681fa6-f357-4b9c-a268-a986ac9960f1,False
...,...,...,...,...,...
16240,385,-1,2022-04-16 09:41:14,c943343f-6e09-40ff-a49c-49908dd40d85,True
16241,355,-1,2022-04-16 10:06:46,011dbd52-2840-4acb-80eb-7cca289e1da4,True
16242,39,0,2022-04-16 10:18:40,cf307e26-6bd2-4644-8ab9-2983493b0022,True
16243,145,0,2022-04-16 10:24:25,ff2ee2f6-8d16-47c1-b625-63ca5242f1b1,True


In [5]:
# Load the image table and store the `is_mobile` flag as a real boolean column.
images_df = pd.read_csv(IMAGES, sep=',', encoding='utf-8').astype({'is_mobile': bool})
images_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                548 non-null    int64 
 1   url               349 non-null    object
 2   pending_approval  548 non-null    int64 
 3   is_mobile         548 non-null    bool  
dtypes: bool(1), int64(2), object(1)
memory usage: 13.5+ KB


In [6]:
# How many of the images are flagged as mobile images?
print(int(images_df['is_mobile'].eq(True).sum()), 'mobile images out of', len(images_df))

185 mobile images out of 548


In [7]:
# Lookup table: image id -> whether the image is a mobile image.
images_dict = images_df.set_index('id')['is_mobile'].to_dict()

In [8]:
# Flag each response with whether the rated image is a mobile image, looked up
# by image id (ids missing from `images_dict` become NaN).
df['mobile_image'] = df['image_id'].map(images_dict)

In [9]:
# Inspect responses from mobile users that were given for non-mobile images.
df.loc[df['mobile_user'].eq(True) & df['mobile_image'].eq(False)]

Unnamed: 0,image_id,response,create_dt,session_id,mobile_user,mobile_image
0,11,-1,2021-04-21 19:45:05,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,True,False
1,8,1,2021-04-22 08:06:01,66924c76-e246-4fd4-9342-e5ab2b83df6e,True,False
2,9,1,2021-04-22 10:04:21,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,True,False
3,11,0,2021-04-22 10:06:49,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,True,False
4,10,1,2021-04-23 12:32:05,54db6e88-9735-43c3-9575-25146e96ce31,True,False
...,...,...,...,...,...,...
15882,272,-1,2022-04-13 09:12:29,fd8e726a-26a8-413b-8a6d-aaf5854d53e0,True,False
15942,11,1,2022-04-13 23:12:09,30fff8bf-79b4-4cdf-b1fc-924012a8d5aa,True,False
16068,545,0,2022-04-14 21:56:47,935fef3a-735f-4bc7-869c-067a44c88645,True,False
16119,545,-1,2022-04-15 07:44:44,ce5c1b7b-3b99-4d68-a79c-03ad72cba119,True,False


In [10]:
# Inspect responses from desktop users that were given for mobile images.
df.loc[df['mobile_user'].eq(False) & df['mobile_image'].eq(True)]

Unnamed: 0,image_id,response,create_dt,session_id,mobile_user,mobile_image
71,32,1,2021-04-26 13:30:23,cc681fa6-f357-4b9c-a268-a986ac9960f1,False,True
161,93,1,2021-04-28 15:34:32,9ae1dc88-c633-4ed7-9a3e-9e55ac11d554,False,True
35918,107,-1,2022-04-08 13:51:51,d58dba04-e67c-4e49-b4e7-bf5f8d93f94c,False,True
35919,324,-1,2022-04-08 13:52:36,d58dba04-e67c-4e49-b4e7-bf5f8d93f94c,False,True
35920,114,-1,2022-04-08 13:52:40,d58dba04-e67c-4e49-b4e7-bf5f8d93f94c,False,True


# Data cleansing

In [11]:
# Desktop users are not expected to be served mobile images, so such rows are
# treated as invalid and removed.
#
# The rows are filtered with a boolean mask rather than `df.drop(<index labels>)`:
# `df` was built by concatenating the desktop and mobile frames, each carrying its
# own 0-based index, so index labels are NOT unique. Dropping by label would also
# delete valid mobile rows that merely share a label with an offending desktop row.
desktop_saw_mobile_image = (df['mobile_user'] == False) & (df['mobile_image'] == True)
# `.copy()` gives later cells an independent frame to add columns to.
df = df.loc[~desktop_saw_mobile_image].copy()
errormessage = '''We expect to have zero mobile images for desktop users.
    If you see this message, please check the data and re-run the notebook.'''
assert len(df[(df['mobile_user'] == False) & (df['mobile_image']== True)])==0, errormessage

# Session ID factorization

In [12]:
# Replace each session identifier with a compact integer code; equal identifiers
# receive the same code.
df['session_id'] = df['session_id'].factorize()[0]
print('Number of unique sessions:', df['session_id'].nunique())

Number of unique sessions: 48807


# Convert **Response** column to human readable format
With each request, the user is presented with a random image chosen from the /img directory. They can either skip the image (-1 response value) or rate its readability. The rating can be yes (+1 response value) or no (0 response value).

In [14]:
# Every response must be one of the three documented codes:
#   1 = readable ("yes"), 0 = not readable ("no"), -1 = image skipped.
# The actual values are checked instead of only counting distinct values, because
# a count-based check would also accept unexpected codes (e.g. 2, 5 and 7).
#
# Disabled by the original author; presumably it coerces stray values below -1
# to "skip" -- confirm before re-enabling:
# df.loc[df['response'] < -1, 'response'] = -1
errormessage = '''We expect to have answers of either -1, 0 or 1.
    If you see this message, please check the data.'''
is_known_response = df['response'].isin([-1, 0, 1])
assert is_known_response.all(), (
    f"{errormessage}\n"
    f"Unexpected values: {df.loc[~is_known_response, 'response'].unique().tolist()}"
)

In [15]:
# Expand the numeric response code into three boolean indicator columns:
# 1 -> yes, 0 -> no, -1 -> skipped.
df = df.assign(
    response_yes=df['response'].eq(1),
    response_no=df['response'].eq(0),
    response_skip=df['response'].eq(-1),
)

In [16]:
# The numeric code is now represented by the three indicator columns, so the
# original `response` column is removed.
df = df.drop(columns=['response'])
df

Unnamed: 0,image_id,create_dt,session_id,mobile_user,mobile_image,response_yes,response_no,response_skip
0,6,2021-04-20 20:23:08,0,False,False,True,False,False
1,4,2021-04-21 11:33:17,1,False,False,True,False,False
2,9,2021-04-21 11:33:19,1,False,False,True,False,False
3,5,2021-04-21 12:42:57,1,False,False,True,False,False
4,11,2021-04-21 12:43:07,1,False,False,True,False,False
...,...,...,...,...,...,...,...,...
16240,385,2022-04-16 09:41:14,48802,True,True,False,False,True
16241,355,2022-04-16 10:06:46,48803,True,True,False,False,True
16242,39,2022-04-16 10:18:40,48804,True,True,False,True,False
16243,145,2022-04-16 10:24:25,48805,True,True,False,True,False


# Save the results to a single .csv in data

In [17]:
# Write the transformed frame (index included) to the configured output path and
# print a status line wrapped in the `T.G` / `T.E` markers.
df.to_csv(path_or_buf=SAVEPATH, sep=',')
print(f"{T.G}Results saved to: {SAVEPATH}{T.E}")

[92mResults saved to: data/canyoureadit.csv[0m
