# Installations

In [None]:
!pip install nltk spacy emoji

In [None]:
!python -m spacy download en_core_web_sm

# Commit to GitHub

In [None]:
!pwd                # shows your current folder
!git status         # check uncommitted changes
!git add .
!git commit -m "Data text Preprocessing and cleaning"
!git push origin main

# Import Statements

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re
from nltk import ngrams
from textblob import TextBlob
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

from langdetect import detect
import langdetect

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tharu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Initial Preprocessing Steps

In [3]:
# Load dataset
df_posts = pd.read_csv("Final_Posts_Data.csv")
df_posts.head()

  df_posts = pd.read_csv("Final_Posts_Data.csv")


Unnamed: 0,type,source,keyword,id,created_utc,author,subreddit,content,score,num_comments,parent_post
0,post,new,,1otaemb,1762771000.0,Cookiehere6969,srilanka,Is this a Scam or good investment? Haritha Lan...,2.0,1.0,
1,post,new,,1otaam5,1762770000.0,oshan789,srilanka,Villa units for sale in Unawatuna Sri Lanka ! ...,3.0,0.0,
2,post,new,,1ot9w1v,1762769000.0,mgssjjsks,srilanka,Whats your hot take on Sri Lanka as the title ...,3.0,8.0,
3,post,new,,1ot9kwe,1762768000.0,Critical_Rise_exe,srilanka,Is the rs.11 deals real in Daraz?,1.0,3.0,
4,post,new,,1ot9h2f,1762767000.0,No-Leave8971,srilanka,Need advice from the experts üôè [](https://www....,2.0,0.0,


In [4]:
# Dataset overview
print("Dataset shape:", df_posts.shape)

Dataset shape: (70861, 11)


In [5]:
# Check for duplicates based on a specific column
duplicates_name = df_posts.duplicated(subset=['content'])
print("\nDuplicates based on 'content':")
print(df_posts[duplicates_name])


Duplicates based on 'content':
Empty DataFrame
Columns: [type, source, keyword, id, created_utc, author, subreddit, content, score, num_comments, parent_post]
Index: []


In [6]:
# Missing Values
missing_counts = df_posts.isnull().sum()
print("Missing values per column:\n", missing_counts)

Missing values per column:
 type                0
source              0
keyword         53250
id                  0
created_utc         0
author           2981
subreddit           0
content             1
score               8
num_comments    52021
parent_post     18848
dtype: int64


In [7]:
# Missing Values
missing_counts = df_posts.isnull().sum()
print("Missing values per column:\n", missing_counts) 

Missing values per column:
 type                0
source              0
keyword         53250
id                  0
created_utc         0
author           2981
subreddit           0
content             1
score               8
num_comments    52021
parent_post     18848
dtype: int64


In [8]:
# Fill 'keyword' missing values
df_posts['keyword'] = df_posts['keyword'].fillna('no keyword')

# Fill 'author' missing values
df_posts['author'] = df_posts['author'].fillna('no author')

# Fill 'score' missing values with the median
median_score = df_posts['score'].median()
df_posts['score'] = df_posts['score'].fillna(median_score)

# Fill 'num_comments' missing values with 0 only where source == 'comments'
mask = df_posts['source'] == 'comments'
df_posts.loc[mask, 'num_comments'] = df_posts.loc[mask, 'num_comments'].fillna(0)

# Fill 'parent_post' missing values
df_posts['parent_post'] = df_posts['parent_post'].fillna('no post')

# Drop rows where 'content' is missing (only 1 row)
df_posts = df_posts.dropna(subset=['content'])
# reset the index
df_posts.reset_index(drop=True, inplace=True)

missing_counts = df_posts.isnull().sum()
print("Missing values per column:\n", missing_counts) 

print("\n Dataset shape:", df_posts.shape)

df_posts.head()

Missing values per column:
 type                0
source              0
keyword             0
id                  0
created_utc         0
author              0
subreddit           0
content             0
score               0
num_comments    52020
parent_post         0
dtype: int64

 Dataset shape: (70860, 11)


Unnamed: 0,type,source,keyword,id,created_utc,author,subreddit,content,score,num_comments,parent_post
0,post,new,no keyword,1otaemb,1762771000.0,Cookiehere6969,srilanka,Is this a Scam or good investment? Haritha Lan...,2.0,1.0,no post
1,post,new,no keyword,1otaam5,1762770000.0,oshan789,srilanka,Villa units for sale in Unawatuna Sri Lanka ! ...,3.0,0.0,no post
2,post,new,no keyword,1ot9w1v,1762769000.0,mgssjjsks,srilanka,Whats your hot take on Sri Lanka as the title ...,3.0,8.0,no post
3,post,new,no keyword,1ot9kwe,1762768000.0,Critical_Rise_exe,srilanka,Is the rs.11 deals real in Daraz?,1.0,3.0,no post
4,post,new,no keyword,1ot9h2f,1762767000.0,No-Leave8971,srilanka,Need advice from the experts üôè [](https://www....,2.0,0.0,no post


In [9]:
# converting date column into a readable format
df_posts['created_date'] = pd.to_datetime(df_posts['created_utc'], unit='s').dt.date
df_posts['created_time'] = pd.to_datetime(df_posts['created_utc'], unit='s').dt.time

# Drop the original 'created_utc' column
df_posts.drop(columns=['created_utc'], inplace=True)
df_posts


Unnamed: 0,type,source,keyword,id,author,subreddit,content,score,num_comments,parent_post,created_date,created_time
0,post,new,no keyword,1otaemb,Cookiehere6969,srilanka,Is this a Scam or good investment? Haritha Lan...,2.0,1.0,no post,2025-11-10,10:33:16
1,post,new,no keyword,1otaam5,oshan789,srilanka,Villa units for sale in Unawatuna Sri Lanka ! ...,3.0,0.0,no post,2025-11-10,10:26:02
2,post,new,no keyword,1ot9w1v,mgssjjsks,srilanka,Whats your hot take on Sri Lanka as the title ...,3.0,8.0,no post,2025-11-10,10:00:29
3,post,new,no keyword,1ot9kwe,Critical_Rise_exe,srilanka,Is the rs.11 deals real in Daraz?,1.0,3.0,no post,2025-11-10,09:40:57
4,post,new,no keyword,1ot9h2f,No-Leave8971,srilanka,Need advice from the experts üôè [](https://www....,2.0,0.0,no post,2025-11-10,09:33:57
...,...,...,...,...,...,...,...,...,...,...,...,...
70855,img_post,url,no keyword,1ozebo7,Dhanagg,srilanka,NEWSWIRE\n\nSri Lanka flags\noutside Rawalpind...,3.0,,no post,2025-11-17,11:55:19
70856,img_post,url,no keyword,1ozdn89,Unreal_realist-7381,srilanka,DOT STUDIOS PRESENTS\nPASAN DOMINIC HASALAKA T...,3.0,,no post,2025-11-17,11:16:00
70857,img_post,url,no keyword,1ozdi55,wiknew1,srilanka,sarasavi fi Q\n\nTHE BOOKSHOP\nOL LIST SARASAV...,3.0,,no post,2025-11-17,11:07:48
70858,img_post,url,no keyword,1ozbed2,smllcheeseburger,srilanka,Lamborghini Urus Twin turbo V8 2025\n\nPosted ...,3.0,,no post,2025-11-17,08:54:42


### Drop non-english data (sinhala and tamil)

In [11]:
def detect_language(text):
    try:
        return detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return 'unknown'

# Create a new column for language
df_posts['language'] = df_posts['content'].apply(detect_language)

<bound method Series.count of 0        en
1        en
2        en
3        en
4        en
         ..
70855    id
70856    en
70857    en
70858    en
70859    et
Name: language, Length: 70860, dtype: object>

In [18]:
# Keep only English posts
df_posts = df_posts[df_posts['language'] == 'en'].copy()

# Drop the language column
df_posts.drop(columns=['language'], inplace=True)
print(df_posts.shape)

df_posts.head()

(69773, 12)


Unnamed: 0,type,source,keyword,id,author,subreddit,content,score,num_comments,parent_post,created_date,created_time
0,post,new,no keyword,1otaemb,Cookiehere6969,srilanka,Is this a Scam or good investment? Haritha Lan...,2.0,1.0,no post,2025-11-10,10:33:16
1,post,new,no keyword,1otaam5,oshan789,srilanka,Villa units for sale in Unawatuna Sri Lanka ! ...,3.0,0.0,no post,2025-11-10,10:26:02
2,post,new,no keyword,1ot9w1v,mgssjjsks,srilanka,Whats your hot take on Sri Lanka as the title ...,3.0,8.0,no post,2025-11-10,10:00:29
3,post,new,no keyword,1ot9kwe,Critical_Rise_exe,srilanka,Is the rs.11 deals real in Daraz?,1.0,3.0,no post,2025-11-10,09:40:57
4,post,new,no keyword,1ot9h2f,No-Leave8971,srilanka,Need advice from the experts üôè [](https://www....,2.0,0.0,no post,2025-11-10,09:33:57


### **1. Language Filtering**

* [‚úîÔ∏è] Detect language of each post.
* [‚úîÔ∏è] Remove posts in Sinhala, Tamil, or any non-English languages.

---

### **2. Text Normalization**

* [ ] Convert all text to lowercase.
* [ ] Strip leading and trailing whitespaces.
* [ ] Normalize unicode characters (optional, e.g., accented letters).

---

### **3. Removing Irrelevant Content**

* [ ] Remove URLs (e.g., `http://`, `https://`, `www.`).
* [ ] Remove emojis and emoticons.
* [ ] Remove platform-specific metadata (e.g., `reddit`, `u/username`, `r/subreddit`, `comments`).
* [ ] Remove escape sequences (e.g., `\n`, `\t`).

---

### **4. Handling Special Characters**

* [ ] Remove punctuation marks (`!`, `.`, `,`, `?`, etc.).
* [ ] Remove other non-alphanumeric characters (`@`, `#`, `%`, `^`, etc.).
* [ ] Optionally remove numbers (if not needed for analysis).

---

### **5. Tokenization and Text Structuring**

* [ ] Tokenize text into words (if needed for further analysis).
* [ ] Optionally remove stopwords (`the`, `is`, `and`, etc.).
* [ ] Optionally lemmatize or stem words.

---

### **6. Filtering by Length**

* [ ] Remove posts with fewer than 25 words.
* [ ] Remove posts with more than 1000 words.

---

### **7. Corpus Analysis**

* [ ] Calculate the total number of words in the cleaned dataset.
* [ ] Calculate the number of unique words in the cleaned dataset.

---

‚úÖ **Optional / Advanced Steps**

* [ ] Handle duplicate posts if any.
* [ ] Correct common typos or spelling errors.
* [ ] Normalize spacing between words.

---

If you want, I can also make a **ready-to-use Python pipeline** that implements **all these steps in one go** for your `content` column and reports corpus size and unique words automatically.

Do you want me to do that?


In [None]:
# Install required packages if not already installed
# !pip install nltk spacy emoji
# !python -m spacy download en_core_web_sm

import pandas as pd
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

# Load English model for lemmatization
nlp = spacy.load("en_core_web_sm")

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define stopwords
stop_words = set(stopwords.words('english'))

# Function to replace emojis with textual description
def emoji_to_text(text):
    return emoji.demojize(text, delimiters=(" ", " "))

# Function to clean text
def clean_text(text):
    # 1. Convert emojis to text
    text = emoji_to_text(text)
    
    # 2. Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # 3. Remove Reddit platform metadata (u/username, r/subreddit)
    text = re.sub(r'u\/\w+', '', text)
    text = re.sub(r'r\/\w+', '', text)
    
    # 4. Remove escape sequences and extra whitespace
    text = text.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
    text = re.sub(r'\s+', ' ', text)
    
    # 5. Remove punctuation (except within words like can't, won't)
    text = re.sub(r'[^\w\s\']', '', text)
    
    # 6. Lowercase
    text = text.lower()
    
    # 7. Tokenization
    tokens = word_tokenize(text)
    
    # 8. Remove stopwords
    tokens = [t for t in tokens if t not in stop_words]
    
    # 9. Lemmatization
    doc = nlp(" ".join(tokens))
    lemmatized = [token.lemma_ for token in doc]
    
    # 10. Join back into string
    cleaned_text = " ".join(lemmatized)
    
    return cleaned_text

# Apply cleaning function
df_posts['content_cleaned'] = df_posts['content'].astype(str).apply(clean_text)

# 11. Filter posts by word count
min_words = 25
max_words = 1000

def word_count_filter(text):
    count = len(text.split())
    return min_words <= count <= max_words

df_posts = df_posts[df_posts['content_cleaned'].apply(word_count_filter)]

# 12. Report corpus statistics
all_text = " ".join(df_posts['content_cleaned'])
all_words = all_text.split()
total_words = len(all_words)
unique_words = len(set(all_words))

print(f"Total words in cleaned corpus: {total_words}")
print(f"Unique words in cleaned corpus: {unique_words}")
