In [1]:
import pandas as pd
import numpy as np

In [11]:
file_path = 'dataset.csv'

In [12]:
df = pd.read_csv(file_path)
print("Dataset Overview")
print(df.info)

Dataset Overview
<bound method DataFrame.info of       comments  likes  shares  \
0          161   3594       9   
1          382  10020     236   
2           97   2197       4   
3         1605  39370     154   
4          492  12679      98   
...        ...    ...     ...   
1995       604  13506     898   
1996        39   2000      14   
1997        63   4470      44   
1998        12    272       1   
1999        33    735       6   

                                                   text  
0     A busy road to ICC Men's #T20WorldCup 2024 for...  
1     Sri Lanka level the series 1-1 after a stellar...  
2     Dimuth Karunaratne believes Sri Lanka did well...  
3     Shakib Al Hasan is back playing Test cricket a...  
4     Brilliant knocks from Dimuth Karunaratne and K...  
...                                                 ...  
1995  Chamari Athapaththu leads a powerful ICC Women...  
1996  Jack Brassell had the ball on a string 💫\n\n#U...  
1997  Jack Brassell swings the b

In [13]:
print("First few rows of dataset:")
print(df.head())

First few rows of dataset:
   comments  likes  shares                                               text
0       161   3594       9  A busy road to ICC Men's #T20WorldCup 2024 for...
1       382  10020     236  Sri Lanka level the series 1-1 after a stellar...
2        97   2197       4  Dimuth Karunaratne believes Sri Lanka did well...
3      1605  39370     154  Shakib Al Hasan is back playing Test cricket a...
4       492  12679      98  Brilliant knocks from Dimuth Karunaratne and K...


In [14]:
# 1. Handling missing values
# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Drop rows with missing values in the 'text' column
df.dropna(subset=['text'], inplace=True)

# Check if missing values were handled
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values in each column:
comments    0
likes       0
shares      0
text        4
dtype: int64

Missing values after handling:
comments    0
likes       0
shares      0
text        0
dtype: int64


In [15]:
# 2. Remove duplicate rows
# Number of rows initially without cleaning
print("\nNumber of rows before removing duplicates:", df.shape[0])

# Remove duplicate rows
df.drop_duplicates(inplace=True)
print("\nNumber of rows after removing duplicates:", df.shape[0])


Number of rows before removing duplicates: 1996

Number of rows after removing duplicates: 1996


In [16]:
# 3. Standardizing text columns (e.g., the 'text' column)
df['text'] = df['text'].str.strip() # Remove extra spaces from the 'text' column
df['text'] = df['text'].str.lower() # Convert the 'text' column to lowercase

In [17]:
# 4. Handling outliers (if any)
threshold = 1000
df = df[df['likes'] <= threshold]
df.drop(columns=['comments', 'shares'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['comments', 'shares'], inplace=True)


In [18]:
# 7. Check data types after cleaning
print("\nData types after cleaning:")
print(df.dtypes)


Data types after cleaning:
likes     int64
text     object
dtype: object


In [19]:
# Save the cleaned data to a new CSV file
df.to_csv('icc_fb_page_cleaned.csv', index=False)
print("\nData cleaning complete. Cleaned data saved as 'icc_fb_page_cleaned.csv'.")


Data cleaning complete. Cleaned data saved as 'icc_fb_page_cleaned.csv'.


In [22]:
print(df.head())

    likes                                               text
6     725  three european teams will compete in an exciti...
29    337  england and new zealand hit with injury concer...
36    748  an historic day for namibia as their women’s c...
37    504  england will be without a key component of the...
53    390  in a historic first, 10 players of the namibia...
