In [3]:
import pandas as pd
import re

In [5]:
# Load the header-less CSV and label its six columns in one expression.
df = pd.read_csv(
    "Data.csv",
    encoding="ISO-8859-1",
    header=None,
).set_axis(["target", "id", "date", "flag", "user", "text"], axis="columns")

# Step 1: Count the missing entries per column and report them
missing_values = df.isna().sum()
print("Missing values:\n", missing_values)

In [15]:
# Preview the first rows of df; as the cell's last expression the result is displayed.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,title,score,id,url,comms_num,created,body,year-month-day,hour-min-sec
1,Monkeypox may be on its way to a school near you,101,wtgjsb,https://thehill.com/opinion/healthcare/3607622...,33,1661026996,,2022-08-21,4:23:16
2,Purpose of Home Disinfection,29,wt923l,https://www.cdc.gov/poxvirus/monkeypox/specifi...,18,1661007372,,2022-08-20,22:56:12
3,Lack of âhigh qualityâ clinical guidelines...,45,wsu8pe,https://thehill.com/changing-america/well-bein...,2,1660957502,,2022-08-20,9:05:02
4,California issues monkeypox quarantine guidelines,135,wsn73d,https://www.kron4.com/monkeypox/california-iss...,31,1660938749,,2022-08-20,3:52:29


In [5]:
# Step 2: Drop every row that has a missing value in any column
df = df.dropna()

# Step 3: Remove duplicates (if any)
df = df.drop_duplicates()

# Step 4: Fix incorrect data types
df = df.astype({
    "target": int,    # Assuming target should be an integer
    "id": "int64",    # Assuming id should be an integer
})

# Step 5: Convert 'date' column to datetime.
# NOTE: the format contains the literal text "PDT"; because errors="coerce" is
# used, any value that does not match the format becomes NaT instead of
# raising, and those rows are NOT removed by the dropna() above.
df["date"] = pd.to_datetime(
    df["date"],
    format="%a %b %d %H:%M:%S PDT %Y",
    errors="coerce",
)

# Step 6: Drop unnecessary columns.
# errors="ignore" keeps this cell re-runnable: on a second execution "flag" is
# already gone and a plain drop() would raise KeyError.
df = df.drop(columns=["flag"], errors="ignore")

In [7]:
def clean_text(text):
    """Normalise a raw text value for analysis.

    Removes URLs (any run of non-whitespace starting with ``http``), drops
    every character that is not an ASCII letter or whitespace, and lower-cases
    what remains. Whitespace is left as-is, so removed tokens may leave
    consecutive spaces behind.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. ``None`` or the float ``NaN`` that
        pandas uses for missing text) are treated as empty text.

    Returns
    -------
    str
        The cleaned, lower-cased text; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on None/NaN; map missing values to empty text
        # so Series.apply(clean_text) cannot crash on them.
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabet characters
    return text.lower()  # Convert to lowercase

In [9]:
# Step 7: Run clean_text over every entry of the 'text' column
cleaned_text = df["text"].apply(clean_text)
df["text"] = cleaned_text

# Step 8: Keep only the rows whose text is non-empty once whitespace is stripped
has_content = df["text"].str.strip().ne("")
df = df[has_content]

# Step 9: Show the head of the cleaned dataset
print("Cleaned DataFrame (first 5 rows):\n", df.head())

Cleaned DataFrame (first 5 rows):
    target          id                date             user  \
0       0  1467810369 2009-04-06 22:19:45  _TheSpecialOne_   
1       0  1467810672 2009-04-06 22:19:49    scotthamilton   
2       0  1467810917 2009-04-06 22:19:53         mattycus   
3       0  1467811184 2009-04-06 22:19:57          ElleCTF   
4       0  1467811193 2009-04-06 22:19:57           Karoli   

                                                text  
0  switchfoot   awww thats a bummer  you shoulda ...  
1  is upset that he cant update his facebook by t...  
2  kenichan i dived many times for the ball manag...  
3    my whole body feels itchy and like its on fire   
4  nationwideclass no its not behaving at all im ...  


In [11]:
# Write df to Cleaned_Data.csv; index=False leaves the row index out of the file.
df.to_csv("Cleaned_Data.csv", index=False)


AttributeError: 'Expr' object has no attribute 'apply'