In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('reddit_preprocessing.csv')
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [4]:
print("No.of Columns :- ", df.shape[0])
print("No.of Rows :-", df.shape[1])

No.of Columns :-  36793
No.of Rows :- 2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36793 entries, 0 to 36792
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  36662 non-null  object
 1   category       36793 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 575.0+ KB


Conclusion :
- There are some Null values present in our dataset.

### 1. HANDLING 'NULL' VALUES

In [6]:
df.isnull().sum()

clean_comment    131
category           0
dtype: int64

In [7]:
df[df['clean_comment'].isna()]

Unnamed: 0,clean_comment,category
287,,0
821,,0
922,,0
934,,0
1169,,0
...,...,...
36358,,0
36563,,0
36599,,0
36731,,0


In [8]:
df[df['clean_comment'].isna()]['category'].value_counts()

category
 0    128
-1      2
 1      1
Name: count, dtype: int64

CONCLUSION :
- The majority of rows with missing 'clean_comment' (128 out of 131) belong to category '0'.
- There are very few rows with missing 'clean_comment' in categories -1 (2 rows) and 1 (1 row).

In [9]:
df.dropna(inplace=True) 

- As the null values are not too much as compare the actual data so we can drop it.


### 2. HANDLING DUPLCATE VALUES :-

In [10]:
df.duplicated().sum()

np.int64(419)

In [12]:
df[df.duplicated()]

Unnamed: 0,clean_comment,category
1376,jpg,0
1397,good,1
1430,real bhagoda,1
1561,think,0
2340,cringe,0
...,...,...
36751,nice try,1
36768,vote,0
36770,would,0
36776,nice try,1


In [11]:
df[df.duplicated()]['category'].value_counts()

category
 0    267
 1    104
-1     48
Name: count, dtype: int64

In [13]:
df.drop_duplicates(inplace=True)

In [14]:
df.duplicated().sum()

np.int64(0)

In [19]:
df[(df['clean_comment'].str.strip() == '')]

Unnamed: 0,clean_comment,category


In [20]:
df = df[~(df['clean_comment'].str.strip() == '')]

In [21]:
df['clean_comment'] = df['clean_comment'].str.lower()

In [23]:
df['clean_comment'].head()

0    family mormon never tried explain still stare ...
1    buddhism much lot compatible christianity espe...
2    seriously say thing first get complex explain ...
3    learned want teach different focus goal not wr...
4    benefit may want read living buddha living chr...
Name: clean_comment, dtype: object

In [24]:
df[df['clean_comment'].apply(lambda x: x.endswith(' ') or x.startswith(' '))]

Unnamed: 0,clean_comment,category


In [25]:
# Remove trailing and leading whitespaces from the 'clean_comment' column
df['clean_comment'] = df['clean_comment'].str.strip()

# Verify the transformation by checking for any remaining trailing whitespaces
df['clean_comment'].apply(lambda x: x.endswith(' ') or x.startswith(' ')).sum()

np.int64(0)

In [26]:
# Identify comments containing URLs
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
comments_with_urls = df[df['clean_comment'].str.contains(url_pattern, regex=True)]

# Display the comments containing URLs
comments_with_urls.head()


Unnamed: 0,clean_comment,category


In [27]:
# Identify comments containing new line characters
comments_with_newline = df[df['clean_comment'].str.contains('\n')]

# Display the comments containing new line characters
comments_with_newline.head()


Unnamed: 0,clean_comment,category


In [28]:
# Remove new line characters from the 'clean_comment' column
df['clean_comment'] = df['clean_comment'].str.replace('\n', ' ', regex=True)

# Verify the transformation by checking for any remaining new lines
comments_with_newline_remaining = df[df['clean_comment'].str.contains('\n')]
comments_with_newline_remaining


Unnamed: 0,clean_comment,category
