### Social Buzz
#### An analysis of Social Buzz's content categories, highlighting the top 5 categories with the largest aggregate popularity

##### Tasks

- Clean the data by:
  - removing rows that have missing values,
  - changing the data type of some values within a column, and
  - removing columns which are not relevant to this task.
Think about how each column might be relevant to the business question you’re investigating. If you can’t think of why a column may be useful, it may not be worth including it.

In [55]:
import pandas as pd
import matplotlib.pyplot as plt

In [56]:
# Read the three raw tables from CSV files located in the working directory.
reactions_df = pd.read_csv('Reactions.csv')
content_df = pd.read_csv('Content.csv')
reactionType_df = pd.read_csv('ReactionTypes.csv')

In [58]:
# Summarise the raw reactions table: column dtypes and non-null counts.
reactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25553 entries, 0 to 25552
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25553 non-null  int64 
 1   Content ID  25553 non-null  object
 2   User ID     22534 non-null  object
 3   Type        24573 non-null  object
 4   Datetime    25553 non-null  object
dtypes: int64(1), object(4)
memory usage: 998.3+ KB


In [59]:
# Preview the first two rows of the raw reactions table.
reactions_df.head(2)

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Datetime
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50


In [60]:
# Number of missing values in each column of the raw reactions table.
reactions_df.isna().sum()

Unnamed: 0       0
Content ID       0
User ID       3019
Type           980
Datetime         0
dtype: int64

In [61]:
# Drop every row that has a missing value in any column.
# NOTE(review): this also removes rows that are missing only 'User ID', a
# column that is discarded right after this step - confirm that losing those
# reactions is intended (dropna(subset=['Type']) would keep them).
reactions_new = reactions_df.dropna()

In [62]:
# Keep only the two columns the analysis needs. Selecting by name (instead of
# dropping 'Unnamed: 0', 'User ID' and 'Datetime') makes this cell safe to
# re-run: a second drop() would raise KeyError because the columns are gone.
reactions_new = reactions_new[['Content ID', 'Type']]

In [63]:
# Confirm that no missing values remain in the cleaned reactions table.
reactions_new.isna().sum()

Content ID    0
Type          0
dtype: int64

In [64]:
# Preview the first two rows of the cleaned reactions table.
reactions_new.head(2)

Unnamed: 0,Content ID,Type
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike


In [65]:
# Number of missing values in each column of the raw content table.
content_df.isna().sum()

Unnamed: 0      0
Content ID      0
User ID         0
Type            0
Category        0
URL           199
dtype: int64

In [66]:
# Preview the first two rows of the raw content table.
content_df.head(2)

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Category,URL
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...


In [67]:
# Build the cleaned content table without the 'Unnamed: 0', 'User ID' and
# 'URL' columns; the raw frame is left untouched.
content_new = content_df.drop(['Unnamed: 0', 'User ID', 'URL'], axis=1)

In [68]:
# Confirm that no missing values remain in the cleaned content table.
content_new.isna().sum()

Content ID    0
Type          0
Category      0
dtype: int64

In [69]:
# Preview the first two rows of the cleaned content table.
content_new.head(2)

Unnamed: 0,Content ID,Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating


In [70]:
# Number of missing values in each column of the raw reaction-types table.
reactionType_df.isna().sum()

Unnamed: 0    0
Type          0
Sentiment     0
Score         0
dtype: int64

In [71]:
# Build the cleaned reaction-types table without the 'Unnamed: 0' and
# 'Sentiment' columns; the raw frame is left untouched.
reactionType_new = reactionType_df.drop(['Unnamed: 0', 'Sentiment'], axis=1)

In [72]:
# Preview the first two rows of the cleaned reaction-types table.
reactionType_new.head(2)

Unnamed: 0,Type,Score
0,heart,60
1,want,70


In [73]:
# Rename the content table's 'Type' column to 'ContentType' so it cannot be
# confused with the reactions table's 'Type' column; rebinding the name keeps
# the step free of in-place mutation.
content_new = content_new.rename(columns={'Type': 'ContentType'})

In [74]:
# List the distinct content types.
content_new['ContentType'].unique()

array(['photo', 'video', 'GIF', 'audio'], dtype=object)

In [75]:
# List the distinct category labels to spot inconsistent spellings
# (stray quotes, mixed capitalisation).
content_new['Category'].unique()

array(['Studying', 'healthy eating', 'technology', 'food', 'cooking',
       'dogs', 'soccer', 'public speaking', 'science', 'tennis', 'travel',
       'fitness', 'education', 'studying', 'veganism', 'Animals',
       'animals', 'culture', '"culture"', 'Fitness', '"studying"',
       'Veganism', '"animals"', 'Travel', '"soccer"', 'Education',
       '"dogs"', 'Technology', 'Soccer', '"tennis"', 'Culture', '"food"',
       'Food', '"technology"', 'Healthy Eating', '"cooking"', 'Science',
       '"public speaking"', '"veganism"', 'Public Speaking', '"science"'],
      dtype=object)

In [76]:
# Strip stray double quotes from category labels (e.g. '"culture"' -> 'culture').
# regex=False forces a literal replacement, so the result does not depend on
# the pandas version's default for `regex`.
content_new['Category'] = content_new['Category'].str.replace('"', '', regex=False)

In [77]:
# Lower-case every category label so that variants differing only in
# capitalisation (e.g. 'Studying' / 'studying') become the same value.
content_new = content_new.assign(Category=content_new['Category'].str.lower())

In [78]:
# List the distinct reaction types that occur in the cleaned reactions table.
reactions_new['Type'].unique()

array(['disgust', 'dislike', 'scared', 'interested', 'peeking', 'cherish',
       'hate', 'indifferent', 'super love', 'worried', 'like', 'heart',
       'want', 'intrigued', 'love', 'adore'], dtype=object)

In [79]:
# List the distinct reaction types in the reaction-types lookup table; these
# are the values the reactions table's 'Type' column is joined against later.
reactionType_new['Type'].unique()

array(['heart', 'want', 'disgust', 'hate', 'interested', 'indifferent',
       'love', 'super love', 'cherish', 'adore', 'like', 'dislike',
       'intrigued', 'peeking', 'scared', 'worried'], dtype=object)

In [80]:
# Optional export of the three cleaned tables (currently disabled).
# Uncomment the lines below to write them to CSV without the index column.
# reactions_new.to_csv('cleaned_reactions.csv', index=False)
# reactionType_new.to_csv('cleaned_reactionTypes.csv', index=False)
# content_new.to_csv('cleaned_content.csv', index=False)

In [82]:
# Column labels of the three cleaned tables, shown together before merging.
tuple(table.columns for table in (reactions_new, content_new, reactionType_new))

(Index(['Content ID', 'Type'], dtype='object'),
 Index(['Content ID', 'ContentType', 'Category'], dtype='object'),
 Index(['Type', 'Score'], dtype='object'))

In [84]:
# Attach the content table's columns to every reaction via 'Content ID'.
# how='left' keeps every reaction row. validate='many_to_one' makes pandas
# raise MergeError if a 'Content ID' appears more than once in content_new,
# which would otherwise silently duplicate reaction rows.
rec_con = reactions_new.merge(
    content_new, how='left', on='Content ID', validate='many_to_one'
)

rec_con.head()

Unnamed: 0,Content ID,Type,ContentType,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,photo,studying
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,photo,studying
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,photo,studying


In [85]:
# Look up the numeric 'Score' of each reaction via its 'Type'.
# validate='many_to_one' makes pandas raise MergeError if a reaction type is
# listed more than once in reactionType_new, which would otherwise duplicate
# rows and inflate any score totals computed from this table.
full_merged = rec_con.merge(
    reactionType_new, how='left', on='Type', validate='many_to_one'
)

full_merged

Unnamed: 0,Content ID,Type,ContentType,Category,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,photo,studying,10
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,photo,studying,15
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying,0
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,photo,studying,30
...,...,...,...,...,...
22529,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,audio,technology,12
22530,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,audio,technology,10
22531,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,audio,technology,45
22532,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,audio,technology,12


In [87]:
# Rename the reaction 'Type' column to 'ReactionType' to mirror 'ContentType';
# rebinding the name keeps the step free of in-place mutation.
full_merged = full_merged.rename(columns={'Type': 'ReactionType'})

In [88]:
# Preview the merged table after the column rename.
full_merged.head()

Unnamed: 0,Content ID,ReactionType,ContentType,Category,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,photo,studying,10
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,photo,studying,15
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying,0
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,photo,studying,30


In [89]:
# Optional export of the merged table (currently disabled).
# Uncomment the line below to write it to CSV without the index column.
# full_merged.to_csv('merged_reactions.csv',index=False)

In [90]:
# Display the final merged table (pandas truncates the rendering of long frames).
full_merged

Unnamed: 0,Content ID,ReactionType,ContentType,Category,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,photo,studying,10
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,photo,studying,15
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,photo,studying,0
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,photo,studying,30
...,...,...,...,...,...
22529,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,audio,technology,12
22530,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,audio,technology,10
22531,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,audio,technology,45
22532,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,audio,technology,12
