In [1]:
! pip install pandas langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     -------------------- ----------------- 524.3/981.5 kB 1.8 MB/s eta 0:00:01
     -------------------------------------- 981.5/981.5 kB 1.9 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (pyproject.toml): started
  Building wheel for langdetect (pyproject.toml): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993363 sha256=f7add39e18f7fa

In [2]:
import pandas as pd
import re
from langdetect import detect, DetectorFactory

In [3]:
# Pin langdetect's seed so that repeated runs give consistent detections
LANGDETECT_SEED = 0
DetectorFactory.seed = LANGDETECT_SEED


In [5]:
# Location of the raw comments CSV on disk
file_path = r"C:\Users\remya\OneDrive\Desktop\final_project\final_comments.csv"  # Update this if needed

# Parse the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [11]:
# Name of the DataFrame column that holds the comment text
comment_column = 'text'  # Update if the column name is different

In [12]:
# Function to check if a comment is English
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The comment to classify. Values that are not strings (None,
            NaN, numbers) and blank strings are reported as not English.

    Returns:
        bool: True only if ``detect(text)`` returns ``"en"``; False when the
        input is unusable or detection raises an error.
    """
    # Nothing to classify: report "not English" without calling detect().
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == "en"
    except Exception:
        # Best-effort: any detection failure counts as "not English".
        # Catching Exception instead of using a bare except lets
        # KeyboardInterrupt and SystemExit propagate, so a long-running
        # .apply() over many comments can still be interrupted.
        return False

In [13]:
# Function to detect non-English words in Latin script (Hindi typed in English)
# Short tokens treated as Hindi written in Latin script. Kept as a frozenset so
# it is built once and membership checks are constant-time.
_HINDI_LATIN_WORDS = frozenset(
    ["h", "ki", "m", "hn", "p", "k", "n", "bhi", "ka", "ha", "to", "ke", "se"]
)


def contains_non_english_latin(text, threshold=0.3, hindi_words=_HINDI_LATIN_WORDS):
    """Return True when a large share of the words in ``text`` are marker words.

    The text is lower-cased and split on whitespace; each resulting token is
    compared verbatim against ``hindi_words`` (punctuation is not stripped, so
    "ki," does not match "ki").

    Args:
        text: The comment to inspect (a string).
        threshold: Fraction of tokens that must be exceeded. The comparison is
            strict: exactly ``threshold`` of the tokens is NOT enough.
        hindi_words: Collection of lower-case marker tokens to look for.

    Returns:
        bool: True if the number of marker tokens is strictly greater than
        ``threshold * number_of_tokens``. Empty or whitespace-only text
        returns False.
    """
    words = text.lower().split()
    non_english_count = sum(1 for word in words if word in hindi_words)
    # Strictly greater than: 3 marker words out of 10 (exactly 30%) is kept.
    return non_english_count > len(words) * threshold


In [14]:
# Function to clean comments
def clean_comments(df, comment_col, max_length=300):
    """Filter a comments DataFrame down to short, plain English comments.

    Rows are removed when the comment:
      1. is NaN,
      2. is ``max_length`` characters or longer,
      3. contains ``@`` or ``#``,
      4. is flagged by ``contains_non_english_latin``,
      5. is not accepted by ``is_english``.

    Each check looks only at the row's own text, so the checks are applied
    cheapest-first and the language detection pass runs last, on the fewest
    rows.

    Args:
        df: DataFrame holding the comments. It is not modified.
        comment_col: Name of the column that contains the comment text.
        max_length: Comments must be strictly shorter than this many
            characters to be kept.

    Returns:
        A new DataFrame with the same columns, containing only the rows that
        pass every check. Original index labels are preserved.
    """

    def _keep(frame, mask):
        # Coerce to a real boolean mask and select rows explicitly. When an
        # earlier filter leaves zero rows, .apply() returns an empty
        # object-dtype Series, which plain frame[mask] does not treat as a row
        # mask (it selects zero columns and the next step raises KeyError).
        return frame.loc[mask.astype(bool)]

    # Drop rows where the comment is NaN
    df = df.dropna(subset=[comment_col])

    # Remove very long comments
    df = _keep(df, df[comment_col].str.len() < max_length)

    # Remove comments containing special characters like # and @
    has_tag = df[comment_col].str.contains(r"[@#]", regex=True, na=False)
    df = _keep(df, ~has_tag.astype(bool))

    # Remove comments containing Hindi written in Latin script
    looks_hindi = df[comment_col].apply(contains_non_english_latin)
    df = _keep(df, ~looks_hindi.astype(bool))

    # Keep only English comments (slowest check, so it runs last)
    df = _keep(df, df[comment_col].apply(is_english))

    return df

In [15]:
# Run the cleaning pipeline over the loaded comments
filtered_df = clean_comments(df=df, comment_col=comment_column)


In [17]:
# Destination for the cleaned comments
filtered_file_path = r"C:\Users\remya\OneDrive\Desktop\final_project\preprocessed_comments.csv"

# Write the filtered rows as CSV, leaving out the DataFrame index column
filtered_df.to_csv(path_or_buf=filtered_file_path, index=False)

In [18]:
# Report the output path of the cleaned CSV
print(f"Filtered comments saved to: {filtered_file_path}")

Filtered comments saved to: C:\Users\remya\OneDrive\Desktop\final_project\preprocessed_comments.csv
