# Accenture Content Data Analysis

## Import necessary libraries and datasets

In [85]:
# Import libraries
import pandas as pd

In [86]:
# Load the three raw CSV exports (content, reactions, reaction types) into DataFrames
content_data, reaction_data, reactionType_data = (
    pd.read_csv(csv_name)
    for csv_name in ("Content.csv", "Reactions.csv", "ReactionTypes.csv")
)

## Data cleaning

### 1- Content dataset

In [87]:
# Dataset dimensions as a (rows, columns) tuple
content_data.shape

(1000, 6)

In [88]:
# Preview the first 10 rows of the content dataset
content_data.head(10)

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Category,URL
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...
5,5,cf1e8c1a-23eb-4426-9f58-002fb1b53e91,4607d7b0-3313-49b8-9f73-5b8227fc5b67,GIF,cooking,
6,6,3f8590c7-6ab2-4973-805a-90cdec355f05,ae600af5-c1f0-4b1f-adb0-1b4c246373e4,video,dogs,https://socialbuzz.cdn.com/content/storage/3f8...
7,7,e5490118-90d5-4572-ab1c-1fbc87b8d9ca,583f2bde-886d-4cf3-a5c4-7cb60cd25df3,video,technology,https://socialbuzz.cdn.com/content/storage/e54...
8,8,0bedca96-fb76-4287-a83c-17330ed39cce,2bd9c167-e06c-47c1-a978-3403d6724606,photo,soccer,https://socialbuzz.cdn.com/content/storage/0be...
9,9,b18cb63f-4c8e-44ee-a47f-541e95191d11,13f0db8a-152a-496f-a6e8-1ed6a90b8788,photo,public speaking,https://socialbuzz.cdn.com/content/storage/b18...


In [89]:
# Column names, non-null counts and dtypes of the content dataset
content_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   Content ID  1000 non-null   object
 2   User ID     1000 non-null   object
 3   Type        1000 non-null   object
 4   Category    1000 non-null   object
 5   URL         801 non-null    object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


In [90]:
# Count the missing values in each column
content_data.isna().sum()

Unnamed: 0      0
Content ID      0
User ID         0
Type            0
Category        0
URL           199
dtype: int64

Only the "URL" column has missing values. Since that variable is not relevant for our analysis, we'll drop it along with the "Unnamed: 0" and "User ID" columns, which are not needed either.

In [91]:
# Remove the columns the analysis does not use: the leftover CSV index,
# the URL (the only column with missing values) and the User ID
content_data = content_data.drop(["Unnamed: 0", "URL", "User ID"], axis="columns")

In [92]:
# Count the rows that exactly duplicate an earlier row
content_data.duplicated().sum()

0

There are no duplicate rows in the dataset.

In [93]:
# 'Type' is ambiguous once the datasets are combined (Reactions also has a
# 'Type' column), so give it an explicit name
content_data = content_data.rename(columns={'Type': 'Content Type'})

In [94]:
# Inspect the unique values of "Content Type" to spot inconsistent spellings
content_data['Content Type'].unique()

array(['photo', 'video', 'GIF', 'audio'], dtype=object)

In [95]:
# Title-case the content types (e.g. 'photo' -> 'Photo') for presentation.
# NOTE(review): str.title() also turns the acronym 'GIF' into 'Gif' — confirm
# that this is the intended label.
content_data['Content Type'] = content_data['Content Type'].str.title()

In [96]:
# Inspect the unique values of "Category" to spot inconsistent spellings
content_data['Category'].unique()

array(['Studying', 'healthy eating', 'technology', 'food', 'cooking',
       'dogs', 'soccer', 'public speaking', 'science', 'tennis', 'travel',
       'fitness', 'education', 'studying', 'veganism', 'Animals',
       'animals', 'culture', '"culture"', 'Fitness', '"studying"',
       'Veganism', '"animals"', 'Travel', '"soccer"', 'Education',
       '"dogs"', 'Technology', 'Soccer', '"tennis"', 'Culture', '"food"',
       'Food', '"technology"', 'Healthy Eating', '"cooking"', 'Science',
       '"public speaking"', '"veganism"', 'Public Speaking', '"science"'],
      dtype=object)

The unique values are not uniform: the same category appears with inconsistent casing and stray quotation marks (e.g. 'culture', 'Culture' and '"culture"').

In [97]:
# Normalise the category labels: strip the stray double quotes, then
# title-case so every spelling variant collapses to a single label
content_data['Category'] = (
    content_data['Category']
    .replace('"', '', regex=True)
    .str.title()
)

In [98]:
# Re-inspect "Category" to confirm the cleanup left one spelling per category
content_data['Category'].unique()

array(['Studying', 'Healthy Eating', 'Technology', 'Food', 'Cooking',
       'Dogs', 'Soccer', 'Public Speaking', 'Science', 'Tennis', 'Travel',
       'Fitness', 'Education', 'Veganism', 'Animals', 'Culture'],
      dtype=object)

Job done here

### 2- Reaction dataset

In [99]:
# Dataset dimensions as a (rows, columns) tuple
reaction_data.shape

(25553, 5)

In [100]:
# Preview the first rows of the reactions dataset
reaction_data.head()

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Datetime
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50
2,2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,2021-06-17 12:22:51
3,3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,2021-04-18 05:13:58
4,4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,2021-01-06 19:13:01


In [101]:
# Column names, non-null counts and dtypes of the reactions dataset
reaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25553 entries, 0 to 25552
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25553 non-null  int64 
 1   Content ID  25553 non-null  object
 2   User ID     22534 non-null  object
 3   Type        24573 non-null  object
 4   Datetime    25553 non-null  object
dtypes: int64(1), object(4)
memory usage: 998.3+ KB


In [102]:
# Count the missing values in each column
reaction_data.isna().sum()

Unnamed: 0       0
Content ID       0
User ID       3019
Type           980
Datetime         0
dtype: int64

In [103]:
# Keep only the rows in which every column is populated.
# NOTE(review): this also discards reactions whose only missing field is
# "User ID", a column that is dropped as irrelevant in the next cell.
# Restricting the check to the reaction type (subset=['Type']) would retain
# those rows — confirm which behaviour the analysis needs.
reaction_data = reaction_data.dropna(axis="index", how="any")

In [104]:
# Remove the leftover CSV index and the User ID, neither of which is used later
reaction_data = reaction_data.drop(['Unnamed: 0', 'User ID'], axis="columns")

In [105]:
# Parse the timestamp strings into a proper datetime column
reaction_data = reaction_data.assign(
    Datetime=pd.to_datetime(reaction_data['Datetime'])
)

In [106]:
# Give 'Type' an explicit name so it cannot be confused with the content type
reaction_data = reaction_data.rename(columns={'Type': 'Reaction Type'})

In [107]:
# Inspect the unique values of "Reaction Type"
reaction_data['Reaction Type'].unique()

array(['disgust', 'dislike', 'scared', 'interested', 'peeking', 'cherish',
       'hate', 'indifferent', 'super love', 'worried', 'like', 'heart',
       'want', 'intrigued', 'love', 'adore'], dtype=object)

In [108]:
# Title-case the reaction types (e.g. 'super love' -> 'Super Love'); the
# ReactionTypes lookup is title-cased the same way before the merge on this column
reaction_data['Reaction Type'] = reaction_data['Reaction Type'].str.title()

In [109]:
# Data integrity check: per-column count of missing values after cleaning
print(f"Number of missing values: {reaction_data.isna().sum()}")

Number of missing values: Content ID       0
Reaction Type    0
Datetime         0
dtype: int64


In [110]:
# Data integrity check: rows that exactly duplicate an earlier row
print(f"Number of duplicates: {reaction_data.duplicated().sum()}")

Number of duplicates: 0


In [111]:
# Confirm the remaining columns, non-null counts and dtypes after cleaning
reaction_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22534 entries, 1 to 25552
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Content ID     22534 non-null  object        
 1   Reaction Type  22534 non-null  object        
 2   Datetime       22534 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 704.2+ KB


### 3- ReactionType dataset

In [112]:
# Dataset dimensions as a (rows, columns) tuple
reactionType_data.shape

(16, 4)

In [113]:
# Preview the first rows of the reaction-type lookup table
reactionType_data.head()

Unnamed: 0.1,Unnamed: 0,Type,Sentiment,Score
0,0,heart,positive,60
1,1,want,positive,70
2,2,disgust,negative,0
3,3,hate,negative,5
4,4,interested,positive,30


In [114]:
# Column names, non-null counts and dtypes of the reaction-type lookup table
reactionType_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  16 non-null     int64 
 1   Type        16 non-null     object
 2   Sentiment   16 non-null     object
 3   Score       16 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 640.0+ bytes


In [115]:
# Remove the leftover CSV index column
reactionType_data = reactionType_data.drop("Unnamed: 0", axis="columns")

In [116]:
# Use the same key name as the reactions dataset ('Reaction Type') so the two
# tables can be merged on it
reactionType_data = reactionType_data.rename(columns={'Type': 'Reaction Type'})

In [117]:
# Inspect the unique values of "Reaction Type" in the lookup table
reactionType_data['Reaction Type'].unique()

array(['heart', 'want', 'disgust', 'hate', 'interested', 'indifferent',
       'love', 'super love', 'cherish', 'adore', 'like', 'dislike',
       'intrigued', 'peeking', 'scared', 'worried'], dtype=object)

In [118]:
# Inspect the unique values of "Sentiment"
reactionType_data['Sentiment'].unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [119]:
# Title-case both text columns; 'Reaction Type' must use the same casing as the
# reactions dataset because it is the merge key
for text_column in ['Reaction Type', 'Sentiment']:
    reactionType_data[text_column] = reactionType_data[text_column].str.title()

In [120]:
# Data integrity check: per-column count of missing values after cleaning
print(f"Number of missing values: {reactionType_data.isna().sum()}")

Number of missing values: Reaction Type    0
Sentiment        0
Score            0
dtype: int64


In [121]:
# Data integrity check: rows that exactly duplicate an earlier row
print(f"Number of duplicates: {reactionType_data.duplicated().sum()}")

Number of duplicates: 0


In [122]:
# Confirm the remaining columns, non-null counts and dtypes after cleaning
reactionType_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Reaction Type  16 non-null     object
 1   Sentiment      16 non-null     object
 2   Score          16 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 512.0+ bytes


Data cleaned

### Merge the datasets

In [123]:
# Enrich every reaction with its sentiment/score and with the details of the
# content it refers to. how='left' keeps every reaction row. Both lookups are
# many-to-one, so validate='many_to_one' makes pandas raise a MergeError if a
# key is duplicated on the lookup side instead of silently multiplying
# reaction rows.
reaction_data = (
    reaction_data
    # Reaction Types lookup -> 'Sentiment' and 'Score'
    .merge(reactionType_data, on='Reaction Type', how='left', validate='many_to_one')
    # Content lookup -> 'Content Type' and 'Category'
    .merge(content_data, on='Content ID', how='left', validate='many_to_one')
)

# Display the updated DataFrame
reaction_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22534 entries, 0 to 22533
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Content ID     22534 non-null  object        
 1   Reaction Type  22534 non-null  object        
 2   Datetime       22534 non-null  datetime64[ns]
 3   Sentiment      22534 non-null  object        
 4   Score          22534 non-null  int64         
 5   Content Type   22534 non-null  object        
 6   Category       22534 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 1.4+ MB


In [None]:
# Preview the first rows of the merged DataFrame
reaction_data.head()

Unnamed: 0,Content ID,Reaction Type,Datetime,Sentiment,Score,Content Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,Disgust,2020-11-07 09:43:50,Negative,0,Photo,Studying
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,Dislike,2021-06-17 12:22:51,Negative,10,Photo,Studying
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,Scared,2021-04-18 05:13:58,Negative,15,Photo,Studying
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,Disgust,2021-01-06 19:13:01,Negative,0,Photo,Studying
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,Interested,2020-08-23 12:25:58,Positive,30,Photo,Studying


In [126]:
# Save the cleaned, merged data.
# index=False keeps the positional index out of the file; written by default,
# it is read back as an extra 'Unnamed: 0' column like the one this notebook
# has to drop from each of its input files.
reaction_data.to_csv('./clean_data.csv', index=False)