# Forage Accenture Data Analytics Task 2: Data Cleaning and Modelling

## A. Importing the necessary modules

In [92]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## B. Reading the files

In [94]:
# Load the three raw tables in one pass; the order of the file names matches
# the order of the names being bound on the left-hand side.
content, reaction, reaction_type = (
    pd.read_csv(csv_name)
    for csv_name in ('Content.csv', 'Reactions.csv', 'ReactionTypes.csv')
)

In [95]:
# DataFrame.info() prints its report and returns None, so the three calls are
# issued as separate statements; wrapping them in a tuple makes the cell echo a
# meaningless (None, None, None) as its result.
content.info()
reaction.info()
reaction_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   Content ID  1000 non-null   object
 2   User ID     1000 non-null   object
 3   Type        1000 non-null   object
 4   Category    1000 non-null   object
 5   URL         801 non-null    object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25553 entries, 0 to 25552
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25553 non-null  int64 
 1   Content ID  25553 non-null  object
 2   User ID     22534 non-null  object
 3   Type        24573 non-null  object
 4   Datetime    25553 non-null  object
dtypes: int64(1), object(4)
memory usage: 998.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (

(None, None, None)

In [96]:
# Preview the first rows of the raw content table.
content.head()

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Category,URL
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [97]:
# Preview the first rows of the raw reactions table.
reaction.head()

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Datetime
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50
2,2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,2021-06-17 12:22:51
3,3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,2021-04-18 05:13:58
4,4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,2021-01-06 19:13:01


In [98]:
# Preview the first rows of the reaction-type lookup table (type, sentiment, score).
reaction_type.head()

Unnamed: 0.1,Unnamed: 0,Type,Sentiment,Score
0,0,heart,positive,60
1,1,want,positive,70
2,2,disgust,negative,0
3,3,hate,negative,5
4,4,interested,positive,30


## C. Dropping the 'Unnamed: 0' column from each of the datasets

In [100]:
# 'Unnamed: 0' is the row index that was written into the CSV files; it carries
# no information, so remove it from all three tables.
content = content.drop(columns='Unnamed: 0')
reaction = reaction.drop(columns='Unnamed: 0')
reaction_type = reaction_type.drop(columns='Unnamed: 0')

## D. Dropping the 'User ID' column from the content and reaction datasets (and the 'URL' column from content) because we only need data related to reactions and content

In [102]:
# The analysis is keyed on content and reactions only, so the user identifier
# is removed from both tables that carry it.
content = content.drop(columns=['User ID'])
reaction = reaction.drop(columns=['User ID'])

In [103]:
# The content URL is not used in the analysis, so drop it as well.
content = content.drop(columns=['URL'])

In [104]:
# Rename the ambiguous 'Type' column in each table: it becomes 'content_type'
# in content and 'reaction_type' in both reaction tables, giving the latter two
# a shared, explicit column name.
# rename() is used in its copy-returning form (no inplace=True), so the cell
# simply rebinds the names, like every other cleaning step in this notebook.

content = content.rename(columns={'Type': 'content_type'})
reaction = reaction.rename(columns={'Type': 'reaction_type'})
reaction_type = reaction_type.rename(columns={'Type': 'reaction_type'})

## E. Removing rows that contain null values or blank spaces from all datasets

In [106]:
# dropna() only recognises real nulls, so empty or whitespace-only strings are
# first converted to NaN; that way "blank" cells are removed together with
# genuine missing values. Non-string columns are not affected by the regex.
blank_pattern = r'^\s*$'

content = content.replace(blank_pattern, np.nan, regex=True).dropna(how='any')
reaction = reaction.replace(blank_pattern, np.nan, regex=True).dropna(how='any')
reaction_type = reaction_type.replace(blank_pattern, np.nan, regex=True).dropna(how='any')

In [107]:
# Check the remaining columns and non-null counts of content after the drops.
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Content ID    1000 non-null   object
 1   content_type  1000 non-null   object
 2   Category      1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


In [108]:
# Check the remaining columns and non-null counts of reaction after the drops.
reaction.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24573 entries, 1 to 25552
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Content ID     24573 non-null  object
 1   reaction_type  24573 non-null  object
 2   Datetime       24573 non-null  object
dtypes: object(3)
memory usage: 767.9+ KB


In [109]:
# Check the remaining columns and non-null counts of reaction_type after the drops.
reaction_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reaction_type  16 non-null     object
 1   Sentiment      16 non-null     object
 2   Score          16 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 516.0+ bytes


## F. Merging datasets using the inner join

In [111]:
# Attach sentiment and score to every reaction by matching on the reaction name.
reaction_merged = reaction.merge(reaction_type, how='inner', on='reaction_type')

In [128]:
# Attach content type and category to every scored reaction via the content key.
content_reaction_merged = content.merge(reaction_merged, how='inner', on='Content ID')

In [130]:
# Inspect row count, columns and non-null counts of the fully merged table.
content_reaction_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24573 entries, 0 to 24572
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Content ID     24573 non-null  object
 1   content_type   24573 non-null  object
 2   Category       24573 non-null  object
 3   reaction_type  24573 non-null  object
 4   Datetime       24573 non-null  object
 5   Sentiment      24573 non-null  object
 6   Score          24573 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.3+ MB


In [132]:
# Work on an independent copy so the merged table itself stays untouched.
df = content_reaction_merged.copy(deep=True)

In [134]:
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,Content ID,content_type,Category,reaction_type,Datetime,Sentiment,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying,disgust,2020-11-07 09:43:50,negative,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying,disgust,2021-01-06 19:13:01,negative,0
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying,disgust,2021-04-09 02:46:20,negative,0
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying,dislike,2021-06-17 12:22:51,negative,10
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,Studying,dislike,2020-11-09 02:49:59,negative,10


In [136]:
# Number of reactions per content type.
df['content_type'].value_counts()

content_type
photo    6589
video    6245
GIF      6079
audio    5660
Name: count, dtype: int64

In [138]:
# Number of rows per reaction type.
df['reaction_type'].value_counts()

reaction_type
heart          1622
scared         1572
peeking        1559
hate           1552
interested     1549
dislike        1548
adore          1548
want           1539
love           1534
disgust        1526
like           1520
super love     1519
indifferent    1512
cherish        1501
worried        1497
intrigued      1475
Name: count, dtype: int64

In [142]:
# Number of rows per category label, before the labels are normalised.
df['Category'].value_counts()

Category
animals              1765
healthy eating       1711
technology           1667
science              1662
cooking              1640
travel               1618
food                 1606
culture              1586
education            1397
soccer               1334
tennis               1328
studying             1303
dogs                 1283
fitness              1257
veganism             1200
public speaking      1157
Fitness               138
Science               116
Animals                92
Food                   91
Soccer                 65
"soccer"               58
"dogs"                 55
"culture"              49
Studying               45
Culture                41
"animals"              40
"veganism"             37
Education              36
Public Speaking        32
Travel                 29
"public speaking"      28
"technology"           28
"cooking"              24
"science"              18
"studying"             15
Veganism               11
Healthy Eating          6
Tec

In [154]:
import re

def lowercase_without_quotes(text):
    """Normalise a category label to lowercase text without wrapping quotes.

    Surrounding whitespace is trimmed, one leading and one trailing double
    quote are removed, and the remainder is lowercased, so that variants such
    as 'Soccer', '"soccer"' and ' soccer ' all collapse to 'soccer'.

    Parameters
    ----------
    text : str or any
        The raw label. Non-string values (for example NaN or None) are
        returned unchanged instead of raising a TypeError.

    Returns
    -------
    str or any
        The cleaned label, or the original object when it is not a string.
    """
    # Missing values reach this function as float NaN / None; leave them as-is
    # so the caller decides how to treat them.
    if not isinstance(text, str):
        return text
    # Trim first so that a quote hidden behind padding is still anchored at
    # the start / end of the string for the regex below.
    unquoted = re.sub(r'^"|"$', '', text.strip())
    # Trim again in case there was whitespace inside the quotes.
    return unquoted.strip().lower()

# Normalise every category label element-wise with the cleaner defined above.
df['Category'] = df['Category'].map(lowercase_without_quotes)


In [156]:
# Re-count the categories after normalisation to check that the quoted and
# capitalised variants have been folded into their lowercase labels.
df['Category'].value_counts()

Category
animals            1897
science            1796
healthy eating     1717
food               1699
technology         1698
culture            1676
cooking            1664
travel             1647
soccer             1457
education          1433
fitness            1395
studying           1363
dogs               1338
tennis             1328
veganism           1248
public speaking    1217
Name: count, dtype: int64

In [159]:
# Re-inspect the working table's columns and non-null counts after the category clean-up.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24573 entries, 0 to 24572
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Content ID     24573 non-null  object
 1   content_type   24573 non-null  object
 2   Category       24573 non-null  object
 3   reaction_type  24573 non-null  object
 4   Datetime       24573 non-null  object
 5   Sentiment      24573 non-null  object
 6   Score          24573 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.3+ MB


In [161]:
# Number of reactions per sentiment label.
df['Sentiment'].value_counts()

Sentiment
positive    13807
negative     7695
neutral      3071
Name: count, dtype: int64

In [163]:
# Number of reactions per score value.
df['Score'].value_counts()

Score
70    3040
60    1622
15    1572
35    1559
5     1552
30    1549
10    1548
72    1548
65    1534
0     1526
50    1520
75    1519
20    1512
12    1497
45    1475
Name: count, dtype: int64

In [165]:
# Persist the cleaned table as CSV; the DataFrame index is not written.
df.to_csv(path_or_buf='Cleaned Dataset.csv', index=False)

In [167]:
# Persist the same cleaned table as an Excel workbook; the index is not written.
df.to_excel(excel_writer='Cleaned Dataset.xlsx', index=False)