In [44]:
import pandas as pd
import numpy as np


# Unusual or Irrelevant Columns: **(can be part of noisy data)**
There are three columns named `Unnamed: 7`, `Unnamed: 8`, and `Unnamed: 9` with very few non-null values and likely irrelevant data (see the `data.info()` output below). These columns might be the result of errors in data entry or **extraction**, and typically do not contain useful information.

In [51]:
# Load the merged dataset (path is relative to the notebook's working directory)
# and print a schema summary: column names, non-null counts and dtypes.
# NOTE(review): every column is reported as dtype object, including score and
# num_comments — presumably some non-numeric values are present in the CSV;
# confirm before treating these columns as numbers.
data = pd.read_csv('Final_merged.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24446 entries, 0 to 24445
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         24444 non-null  object
 1   score         24443 non-null  object
 2   subreddit     24443 non-null  object
 3   url           24443 non-null  object
 4   created_utc   24443 non-null  object
 5   num_comments  24443 non-null  object
 6   label         24443 non-null  object
 7   Unnamed: 7    3 non-null      object
 8   Unnamed: 8    3 non-null      object
 9   Unnamed: 9    1 non-null      object
dtypes: object(10)
memory usage: 1.9+ MB


# **Null Values (can be part of missing values)**

In [45]:
# Reload the raw CSV so this cell can be re-run on its own, and remove the three
# nearly empty 'Unnamed' columns in the same expression. The result is assigned
# rather than mutated with inplace=True, so the cell has no hidden side effects.
# NOTE(review): the few rows that carry values in the 'Unnamed' columns are kept
# here — presumably their fields are misaligned; confirm whether those rows
# should be removed as well.
data = pd.read_csv('Final_merged.csv').drop(
    columns=['Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9']
)

# Count missing entries in each remaining column.
null_values = data.isnull().sum()
print("Null values in each column:")
print(null_values)

Null values in each column:
title           2
score           3
subreddit       3
url             3
created_utc     3
num_comments    3
label           3
dtype: int64


In [46]:
# Discard every row that has a missing value in any column. The filtered frame
# is rebound to `data` instead of using inplace=True, so the step reads as an
# explicit transformation of the frame.
data = data.dropna()

# Re-count missing entries per column to confirm that none remain.
null_values = data.isnull().sum()
print("Null values in each column:")
print(null_values)

Null values in each column:
title           0
score           0
subreddit       0
url             0
created_utc     0
num_comments    0
label           0
dtype: int64


In [47]:
# Select every row whose (title, subreddit) pair appears more than once.
# keep=False marks all members of a repeated group, not only the later copies.
duplicate_rows = data.loc[data.duplicated(subset=['title', 'subreddit'], keep=False)]

# Report how many rows take part in a duplicate group.
num_duplicate_rows = len(duplicate_rows)
print("Total number of duplicate rows (based only on 'title' and 'subreddit'):", num_duplicate_rows)

# Show the first flagged rows (at most 5) for a quick visual check.
duplicate_rows_sample = duplicate_rows.head(5)
print("\nSample of duplicate rows (based only on 'title' and 'subreddit'):")
print(duplicate_rows_sample)

Total number of duplicate rows (based only on 'title' and 'subreddit'): 3235

Sample of duplicate rows (based only on 'title' and 'subreddit'):
                                                 title score   subreddit  \
434  Tyrone Scott announces Green Party of England ...    10  GreenParty   
443  Tyrone Scott announces Green Party of England ...     5  GreenParty   
654  Green Party Presidential Candidate Cornel West...    27  GreenParty   
676  Green Party Presidential Candidate Cornel West...    12  GreenParty   
915  Pretty precise description of Israel's style o...    42  GreenParty   

                                                   url    created_utc  \
434  https://bright-green.org/2022/06/23/exclusive-...  6/27/22 13:38   
443  https://bright-green.org/2022/06/23/exclusive-...  6/23/22 16:52   
654  https://youtube.com/watch?v=WHLFTQyX9NA&featur...   7/23/23 1:05   
676  https://youtube.com/watch?v=daN0sSIsZGI&featur...  7/14/23 17:18   
915               https://i.redd.i

# **Duplicate values (can be part of noisy data)**

In [48]:
# Collapse rows that repeat the same (title, subreddit) pair, keeping the first
# occurrence in file order and discarding the later ones.
# NOTE(review): in the sample printed above, rows 654 and 676 share a title but
# point to different URLs — confirm that such rows should count as duplicates.
data = data.drop_duplicates(
    subset=['title', 'subreddit'],
    keep='first',
)

In [49]:
# Re-run the duplicate check on the de-duplicated frame. keep=False flags every
# member of a repeated (title, subreddit) group — the same counting rule as the
# check made before de-duplication — so the two totals are directly comparable.
duplicate_rows = data[data.duplicated(subset=['title', 'subreddit'], keep=False)]

# Number of rows still involved in a duplicate group (0 once de-duplication has run).
num_duplicate_rows = duplicate_rows.shape[0]
print("Total number of duplicate rows (based only on 'title' and 'subreddit'):", num_duplicate_rows)

Total number of duplicate rows (based only on 'title' and 'subreddit'): 0
