# Data Cleaning

In [None]:
import json
import re
from functools import lru_cache

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

# Fetch the NLTK resources this cell asks for, in the same order as before.
# Only 'stopwords' and 'wordnet' back code visible here (stop-word filtering
# and lemmatisation); nothing in view tokenises with 'punkt'.
for _resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_resource)

# Shared instances, built once and reused by the helper functions below.
lemmatizer = WordNetLemmatizer()
# NOTE(review): `spell` is not referenced anywhere in the visible code —
# confirm whether another cell relies on it.
spell = SpellChecker()

def clean_text(text):
    """Drop URLs and replace every run of non-word characters with a space.

    A URL is taken to be ``http`` followed by everything up to the next
    whitespace character. Leading and trailing separators are not stripped;
    they collapse to a single space like any other run.

    Args:
        text: The raw string to clean. Falsy values (``None``, ``''``) are
            accepted.

    Returns:
        The cleaned string, or ``''`` when ``text`` is falsy.
    """
    if text:
        without_urls = re.sub(r'http\S+', '', text)
        return re.sub(r'\W+', ' ', without_urls)
    return ''

def preprocess_text(text):
    """Lower-case the text, then delete digits and punctuation.

    Word characters (letters and underscore) and whitespace survive;
    everything else is removed without inserting a replacement.

    Args:
        text: The string to normalise. Falsy values are accepted.

    Returns:
        The normalised string, or ``''`` when ``text`` is falsy.
    """
    if text:
        result = text.lower()
        # Applied in order: digit runs first, then any remaining character
        # that is neither a word character nor whitespace.
        for pattern in (r'\d+', r'[^\w\s]'):
            result = re.sub(pattern, '', result)
        return result
    return ''

def tokenize_text(text):
    """Split the text into tokens on runs of whitespace.

    Args:
        text: The string to tokenise. Falsy values are accepted.

    Returns:
        A list of whitespace-separated tokens; ``[]`` when ``text`` is falsy.
    """
    return text.split() if text else []

@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built once.

    The list is identical for every call, so it is cached instead of being
    rebuilt for each sample that passes through ``remove_stopwords``.
    """
    return frozenset(stopwords.words('english'))


def remove_stopwords(words_list):
    """Filter English stop words out of a token list.

    The comparison is case-insensitive: each token is lower-cased before it
    is looked up, but kept tokens are returned with their original casing.

    Args:
        words_list: Iterable of string tokens. Falsy values are accepted.

    Returns:
        A new list of the tokens that are not English stop words, in their
        original order; ``[]`` when ``words_list`` is falsy.
    """
    if not words_list:
        return []
    stop_words = _english_stop_words()
    return [word for word in words_list if word.lower() not in stop_words]

def remove_short_words(words_list):
    """Keep only the tokens that are at least two characters long.

    Args:
        words_list: Iterable of string tokens. Falsy values are accepted.

    Returns:
        A new list with empty and single-character tokens removed, order
        preserved; ``[]`` when ``words_list`` is falsy.
    """
    kept = []
    # A falsy argument (None, empty list) is treated as "nothing to filter".
    for token in words_list or ():
        if len(token) > 1:
            kept.append(token)
    return kept

def lemmatize_words(words_list):
    """Lemmatise each non-empty token with the module-level ``lemmatizer``.

    Args:
        words_list: Iterable of string tokens. Falsy values are accepted.

    Returns:
        A new list holding ``lemmatizer.lemmatize(token)`` for every truthy
        token, in order; falsy tokens are skipped. ``[]`` when
        ``words_list`` is falsy.
    """
    if not words_list:
        return []
    lemmas = []
    for token in words_list:
        if token:
            lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def process_samples(input_file, output_file):
    """Clean, normalise and de-duplicate the ``'body'`` text of JSON samples.

    Loads a JSON list of sample dicts from ``input_file`` and runs each
    sample's ``'body'`` through the pipeline: ``clean_text`` ->
    ``preprocess_text`` -> ``tokenize_text`` -> ``remove_stopwords`` ->
    ``lemmatize_words`` -> ``remove_short_words``. The remaining tokens are
    joined with single spaces and stored back into ``sample['body']``.

    A sample is dropped when its processed body is empty or is identical to
    the processed body of a sample that was already kept (first occurrence
    wins). The kept samples are written to ``output_file`` as JSON with a
    4-space indent, and the number of kept samples is printed.

    Args:
        input_file: Path to the JSON file containing a list of dicts.
        output_file: Path the processed list is written to; an existing file
            is overwritten.

    Returns:
        None.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which is what open() uses when `encoding` is omitted.
    with open(input_file, 'r', encoding='utf-8') as file:
        samples = json.load(file)

    processed_samples = []
    unique_samples = set()

    for sample in samples:
        body_text = sample.get('body', '')
        body_text = clean_text(body_text)
        body_text = preprocess_text(body_text)
        words_list = tokenize_text(body_text)
        words_list = remove_stopwords(words_list)
        words_list = lemmatize_words(words_list)
        # Short-token removal runs after lemmatisation so that lemmas which
        # shrink to a single character are discarded as well.
        words_list = remove_short_words(words_list)
        processed_text = ' '.join(words_list)

        # Nothing left after cleaning: the sample carries no usable text.
        if not processed_text.strip():
            continue

        # Exact duplicate of an already-kept processed body.
        if processed_text in unique_samples:
            continue

        unique_samples.add(processed_text)
        sample['body'] = processed_text
        processed_samples.append(sample)

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(processed_samples, file, indent=4)

    print(f"Sample size after preprocessing: {len(processed_samples)}")

# Input and output locations; these look like Colab-mounted Google Drive
# paths (presumably the Drive is mounted before this cell runs — confirm).
# NOTE(review): "orignal" is the spelling used in the literal below; it
# presumably matches the real file name, so verify before correcting it.
input_file_path = '/content/drive/My Drive/deepfake/Deepfake_orignal.json'
output_file_path = '/content/drive/My Drive/deepfake/Processed_deepfake.json'

# Run the cleaning pipeline: writes the processed JSON to the output path
# and prints the number of samples that were kept.
process_samples(input_file_path, output_file_path)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Sample size after preprocessing: 17720
