In [2]:
import pandas as pd




In [7]:
df = pd.read_csv("../data/raw/sentiment_dataset.csv")
df = df.drop(columns=["Unnamed: 0.1", "Unnamed: 0"])


print(df.head())
print(df.info())


                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! ðŸ’ª          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   ChefCook        Instagram    

                                     Hashtags  Retweets  Likes       Country  \
0   #Nature #Park                                  15.0   30.0     USA         
1   #Traffic #Morning                               5.0   10.0     Canada      
2   #Fitnes

In [8]:
#Check for missing values
print(df.isnull().sum())


Text         0
Sentiment    0
Timestamp    0
User         0
Platform     0
Hashtags     0
Retweets     0
Likes        0
Country      0
Year         0
Month        0
Day          0
Hour         0
dtype: int64


In [9]:
#Remove duplicates
df = df.drop_duplicates()


In [None]:
#It removes duplicate rows from your DataFrame.

In [13]:
print("Duplicates after cleaning:", df.duplicated().sum())
print(df.shape)   # rows, columns count
print(df.head())  # first few rows


Duplicates after cleaning: 1
(712, 13)
                                                Text Sentiment  \
0   Enjoying a beautiful day at the park!        ...  Positive   
1   Traffic was terrible this morning.           ...  Negative   
2   Just finished an amazing workout! ðŸ’ª          ...  Positive   
3   Excited about the upcoming weekend getaway!  ...  Positive   
4   Trying out a new recipe for dinner tonight.  ...   Neutral   

            Timestamp        User   Platform  \
0 2023-01-15 12:30:00     User123    Twitter   
1 2023-01-15 08:45:00   CommuterX    Twitter   
2 2023-01-15 15:45:00  FitnessFan  Instagram   
3 2023-01-15 18:20:00  AdventureX   Facebook   
4 2023-01-15 19:55:00    ChefCook  Instagram   

                                     Hashtags  Retweets  Likes    Country  \
0   #Nature #Park                                    15     30        USA   
1   #Traffic #Morning                                 5     10     Canada   
2   #Fitness #Workout                    

In [10]:
#Standardize data types
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df["Retweets"] = df["Retweets"].astype(int)
df["Likes"] = df["Likes"].astype(int)


In [14]:
#confirm the changes
print(df.dtypes)   # check all column data types
print(df.head())   # preview first few rows


Text                 object
Sentiment            object
Timestamp    datetime64[ns]
User                 object
Platform             object
Hashtags             object
Retweets              int32
Likes                 int32
Country              object
Year                  int64
Month                 int64
Day                   int64
Hour                  int64
dtype: object
                                                Text Sentiment  \
0   Enjoying a beautiful day at the park!        ...  Positive   
1   Traffic was terrible this morning.           ...  Negative   
2   Just finished an amazing workout! ðŸ’ª          ...  Positive   
3   Excited about the upcoming weekend getaway!  ...  Positive   
4   Trying out a new recipe for dinner tonight.  ...   Neutral   

            Timestamp        User   Platform  \
0 2023-01-15 12:30:00     User123    Twitter   
1 2023-01-15 08:45:00   CommuterX    Twitter   
2 2023-01-15 15:45:00  FitnessFan  Instagram   
3 2023-01-15 18:20:00  Adventu

In [15]:
#Strip spaces from categorical columns
for col in ["Sentiment", "User", "Platform", "Country"]:
    df[col] = df[col].str.strip()


In [16]:
#it updated the DataFrame.
print(df[["Sentiment", "User", "Platform", "Country"]].head(10))
print(df["Sentiment"].unique())


  Sentiment          User   Platform    Country
0  Positive       User123    Twitter        USA
1  Negative     CommuterX    Twitter     Canada
2  Positive    FitnessFan  Instagram        USA
3  Positive    AdventureX   Facebook         UK
4   Neutral      ChefCook  Instagram  Australia
5  Positive  GratitudeNow    Twitter      India
6  Positive     RainyDays   Facebook     Canada
7  Positive     MovieBuff  Instagram        USA
8  Negative    DebateTalk    Twitter        USA
9   Neutral    BeachLover   Facebook  Australia
['Positive' 'Negative' 'Neutral' 'Anger' 'Fear' 'Sadness' 'Disgust'
 'Happiness' 'Joy' 'Love' 'Amusement' 'Enjoyment' 'Admiration' 'Affection'
 'Awe' 'Disappointed' 'Surprise' 'Acceptance' 'Adoration' 'Anticipation'
 'Bitter' 'Calmness' 'Confusion' 'Excitement' 'Kind' 'Pride' 'Shame'
 'Elation' 'Euphoria' 'Contentment' 'Serenity' 'Gratitude' 'Hope'
 'Empowerment' 'Compassion' 'Tenderness' 'Arousal' 'Enthusiasm'
 'Fulfillment' 'Reverence' 'Despair' 'Grief' 'Loneliness'

In [17]:
print("\nFinal dataset info:")
print(df.info())
print("\nSample data:")
print(df.head())


Final dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 731
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Text       712 non-null    object        
 1   Sentiment  712 non-null    object        
 2   Timestamp  712 non-null    datetime64[ns]
 3   User       712 non-null    object        
 4   Platform   712 non-null    object        
 5   Hashtags   712 non-null    object        
 6   Retweets   712 non-null    int32         
 7   Likes      712 non-null    int32         
 8   Country    712 non-null    object        
 9   Year       712 non-null    int64         
 10  Month      712 non-null    int64         
 11  Day        712 non-null    int64         
 12  Hour       712 non-null    int64         
dtypes: datetime64[ns](1), int32(2), int64(4), object(6)
memory usage: 72.3+ KB
None

Sample data:
                                                Text Sentiment  \
0   Enjoy

In [12]:
#Save the cleaned dataset
df.to_csv("../data/processed/sentiment_dataset_cleaned.csv", index=False)
print("Sentiment dataset cleaned and saved successfully!")


Sentiment dataset cleaned and saved successfully!


In [18]:
df.to_csv("../data/processed/sentiment_dataset_cleaned.csv", index=False)
print("Sentiment dataset cleaned and saved successfully!")


Sentiment dataset cleaned and saved successfully!


In [None]:
# Load dataset â†’ 712 rows, 13 columns.
# Checked for missing values â†’ none found.
# Removed duplicates â†’ cleaned dataset retained 712 rows.
# Standardized data types â†’ converted Timestamp to datetime, Retweets & Likes to int, others to proper types.
# Stripped spaces in categorical columns â†’ fixed inconsistent categories.
# Confirmed unique Sentiment values â†’ wide variety of emotions/labels.
# Final dataset info & sample â†’ clean structure, correct dtypes.
# Saved cleaned dataset â†’ ready for analysis/modeling.