# DATA CLEANING GOAL 1 ANALYZE AND CLEANING

In [2]:
import pandas as pd
import numpy as np

Load Data set

In [3]:
#this will be our data sentiment data set
df = pd.read_csv('sentimentdataset.csv')

In [4]:
#show data set
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,15/01/2023 12:30,User123,Twitter,#Nature #Park,15,30,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,15/01/2023 8:45,CommuterX,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,15/01/2023 15:45,FitnessFan,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,15/01/2023 18:20,AdventureX,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,15/01/2023 19:55,ChefCook,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19


In [5]:
#Data Inspection and Exploration
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
727,False
728,False
729,False
730,False


In [6]:
#Check the data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  732 non-null    int64 
 1   Unnamed: 0    732 non-null    int64 
 2   Text          732 non-null    object
 3   Sentiment     732 non-null    object
 4   Timestamp     732 non-null    object
 5   User          732 non-null    object
 6   Platform      732 non-null    object
 7   Hashtags      732 non-null    object
 8   Retweets      732 non-null    int64 
 9   Likes         732 non-null    int64 
 10  Country       732 non-null    object
 11  Year          732 non-null    int64 
 12  Month         732 non-null    int64 
 13  Day           732 non-null    int64 
 14  Hour          732 non-null    int64 
dtypes: int64(8), object(7)
memory usage: 85.9+ KB


In [7]:
# Categorical columns
cat_col = [col for col in df.columns if df[col].dtype == 'object']
print('Categorical columns :',cat_col)
# Numerical columns
num_col = [col for col in df.columns if df[col].dtype != 'object']
print('Numerical columns :',num_col)

Categorical columns : ['Text', 'Sentiment', 'Timestamp', 'User', 'Platform', 'Hashtags', 'Country']
Numerical columns : ['Unnamed: 0.1', 'Unnamed: 0', 'Retweets', 'Likes', 'Year', 'Month', 'Day', 'Hour']


In [None]:
 #Check the total number of Unique Values in the Categorical Columns
df[cat_col].nunique()

Unnamed: 0,0
Text,707
Sentiment,279
Timestamp,683
User,685
Platform,4
Hashtags,697
Country,115


In [None]:
#Column	            Unique Values	Likely Unique or Repetitive?	Notes
#Text	707	          Likely Unique	        Usually individual posts/messages.
#Sentiment	279	    Semi-repetitive	      Could be many sentiment scores (but not one per row).
#Timestamp	683	    Close to Unique	      May vary per post/user. Likely time of creation.
#User	685	          Likely Unique       	Likely user IDs or handles.
#Platform	4	        Repetitive          	e.g., iOS, Android, Web, etc.
#Hashtags	697	      High-cardinality	    Many combinations possible. Often messy.
#Country	115	      Repetitive	          Likely a manageable set of country codes/names.

In [None]:
df['User'].unique()[:50] #For the Text, Let’s first print the 50 unique Text.

array([' User123      ', ' CommuterX    ', ' FitnessFan   ',
       ' AdventureX   ', ' ChefCook     ', ' GratitudeNow ',
       ' RainyDays    ', ' MovieBuff    ', ' DebateTalk   ',
       ' BeachLover   ', ' BloggerX     ', ' WellnessCheck',
       ' UrbanExplorer', ' FitJourney   ', ' TechEnthusiast',
       ' Reflections  ', ' PetAdopter   ', ' GamerX       ',
       ' TechConference', ' WinterBlues  ', ' Bookworm     ',
       ' VRExplorer   ', ' ProductivityPro', ' FitnessWarrior',
       ' CareerMilestone', ' BrunchBuddy  ', ' LanguageLearner',
       ' BookLover    ', ' MentalHealthMatters', ' ArtistInAction',
       ' RoadTripper  ', ' SunsetWatcher', ' CodeEnthusiast',
       ' WorkshopAttendee', ' WinterSports  ', ' FamilyTime   ',
       ' MusicLover   ', ' MindfulMoments', ' DessertExplorer',
       ' GamingEnthusiast', ' GardenPlanner ', ' BirthdayBash ',
       ' ProductivityWin', ' MovieNight   ', ' ArtExplorer  ',
       ' BookwormX    ', ' VRMeetup     ', ' NatureLove

In [None]:
df1 = df.drop(columns=['Text', 'User']) #df.drop(columns=['Text', 'User']) removes those two columns.
df1.shape

(732, 13)

In [None]:
df1.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Sentiment,Timestamp,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Positive,15/01/2023 12:30,Twitter,#Nature #Park,15,30,USA,2023,1,15,12
1,1,1,Negative,15/01/2023 8:45,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8
2,2,2,Positive,15/01/2023 15:45,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15
3,3,3,Positive,15/01/2023 18:20,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18
4,4,4,Neutral,15/01/2023 19:55,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19


# 732 rows and 13 columns

# **Handling Missing Data**

In [None]:
# Checking if there is (null value) in the column.
round((df1.isnull().sum()/df1.shape[0])*100,2)

Unnamed: 0,0
Unnamed: 0.1,0.0
Unnamed: 0,0.0
Sentiment,0.0
Timestamp,0.0
Platform,0.0
Hashtags,0.0
Retweets,0.0
Likes,0.0
Country,0.0
Year,0.0


It means there's no missing values



In [None]:
#Remove non-informative columns.
df2 = df1.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [None]:
df2.columns

Index(['Sentiment', 'Timestamp', 'Platform', 'Hashtags', 'Retweets', 'Likes',
       'Country', 'Year', 'Month', 'Day', 'Hour'],
      dtype='object')

In [None]:
df2.head()

Unnamed: 0,Sentiment,Timestamp,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,Positive,15/01/2023 12:30,Twitter,#Nature #Park,15,30,USA,2023,1,15,12
1,Negative,15/01/2023 8:45,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8
2,Positive,15/01/2023 15:45,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15
3,Positive,15/01/2023 18:20,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18
4,Neutral,15/01/2023 19:55,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# Checking if there is (null value) in the column and we removed the non-informative columns
round((df2.isnull().sum()/df2.shape[0])*100,2)

Unnamed: 0,0
Sentiment,0.0
Timestamp,0.0
Platform,0.0
Hashtags,0.0
Retweets,0.0
Likes,0.0
Country,0.0
Year,0.0
Month,0.0
Day,0.0


In [None]:
df2 = df1.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [None]:
df2.columns

Index(['Sentiment', 'Timestamp', 'Platform', 'Hashtags', 'Retweets', 'Likes',
       'Country', 'Year', 'Month', 'Day', 'Hour'],
      dtype='object')

In [None]:
# As you can see we remove the specific unnamed 0 and 0.1
df2.shape

(732, 11)

In [None]:
# Month should be between 1 and 12
valid_months = df2['Month'].between(1, 12).all()

# Day should be between 1 and 31
valid_days = df2['Day'].between(1, 31).all()

# Hour should be between 0 and 23
valid_hours = df2['Hour'].between(0, 23).all()

# Likes and Retweets should be non-negative
valid_likes = (df2['Likes'] >= 0).all()
valid_retweets = (df2['Retweets'] >= 0).all()

In [None]:
df2.dtypes

Unnamed: 0,0
Sentiment,object
Timestamp,object
Platform,object
Hashtags,object
Retweets,int64
Likes,int64
Country,object
Year,int64
Month,int64
Day,int64
