In [1]:
import pandas as pd
pd.options.display.max_rows = 10
import numpy as np 
# pd.set_option('display.max_colwidth', None)
import os

In [2]:
import re
from spellchecker import SpellChecker
import spacy
from contractions import contractions_dict
from textblob import TextBlob
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Loadng English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

# Spell checker
spell = SpellChecker()

In [3]:
contraction_mapping = contractions_dict


def lower_dict(d):
    return {k.lower(): v.lower() for k, v in d.items()}

contraction_mapping = lower_dict(contractions_dict)

In [4]:
# now zipped in Summarization_and_preprocessig_dfs.zip 

df = pd.read_csv('s_and_h_preprocess_1.csv')
df.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1, inplace=True)
# Save the original index in a new column
df['True_Index'] = df.index
df = df.reindex(columns=['True_Index','File_Index', 'episode_number', 'title', 'download_url',
       'publication_date', 'transcription_file', 'transcript',
       'generated_transcripts', 'cleaned_generated_text'])

# # Reset the index if you still need to, without dropping the current index
# df = df.reset_index(drop=True)

In [5]:
def expand_contractions(text, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match, 
                        contraction_mapping.get(match.lower(), match)) # Default to match if not found
        # Ensure the first character's case is preserved
        expanded_contraction = first_char + expanded_contraction[1:] if expanded_contraction else match
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text

spell_check_cache = {}

def correct_spellings_cached(text):
    words = text.split()
    corrected_words = []
    for word in words:
        if word not in spell_check_cache:
            corrected_word = spell.correction(word) if spell.correction(word) else word
            spell_check_cache[word] = corrected_word
        else:
            corrected_word = spell_check_cache[word]
        corrected_words.append(corrected_word)
    return ' '.join(corrected_words)

def correct_spellings(text):
    words = text.split()
    unique_words = set(words)
    # Perform spell correction only for unique words to enhance performance
    corrected_mapping = {word: spell.correction(word) if spell.correction(word) else word for word in unique_words}
    # Apply corrections to the entire text
    corrected_text = ' '.join(corrected_mapping.get(word, word) for word in words)
    return corrected_text

def remove_fillers(text):
    fillers = ['um', 'uh', 'you know', 'so', 'like']
    pattern = r'\b(?:' + '|'.join(fillers) + r')\b'
    return re.sub(pattern, '', text)

def normalize_whitespace_punctuation(text):
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    # Fix common punctuation spacing issues
    text = re.sub(r'\s([?.!",](?:\s|$))', r'\1', text)
    return text.strip()

def preprocess_text(text, contractions_dict):
    # Lowercase conversion
    text = text.lower()
    # Remove fillers
    text = remove_fillers(text)
    # Expand Contractions
    text = expand_contractions(text, contractions_dict)
    # Normalize whitespace and punctuation
    text = normalize_whitespace_punctuation(text)
    # Correct spellings
    text = correct_spellings_cached(text)
    # Return preprocessed text
    return text
    


In [None]:
x = "I'm Dr. John Smith's groundbroking research on CRISPR gene editing at Harvard University wasn't that interesting"
y = "FROM W B EASY CHICAGO IT'S THIS AMERICAN LIFE I'M NANCY UPDIKE FILLING IN FOR IRA GLASS TI DA SHOW IS A RERUN A GOOD ON AND I'M GIN A START WITH THIS STORY THAT I WANT A SHARE IT'S LITTLE PERSONAL I WAS AT MAC THE MAKE UP STORE NOT THE COMPUTERS STORE AND I WAS BUYING FOUNDATION WHICH I ALMOST NEVER WEAR IT'S THE MAKE UP YOU PUT ALL OVER YOUR FACE TO GIVE YOURSELF THE PRETEND PERFECT SKIN AND I ASKED THE SALESMAN FOR HELP FINDING THE RIGHT COLOR AND HE LOOKED AT ME AND HE SAID ALMOST LIKE YOU WAS THINKING OUT LOUD HE SAID YOUR NECK IT'S SO MUCH MORE YELLOW THAN YOUR FACE AND THEN HE TURNED AWAY TO START LOOKING FOR THE IMPOSSIBLE COLOR THAT WOULD SOLVE THIS PROBLEM OF THE YELLOW WRIHT NEXT TO THE SO MUCH MORE YELLOW AND IF YOU'RE THINKING ALL THIS WAS JUST A SALE'S TECHNIQUE TO INVENT A PROBLEM AND THEN OFFERD TO FIX IT WITH MORE PRODUCTS I WISH THAT THAT HAD BEEN THE CASE BUT THIS WAS NOT AN UPSELL THIS WAS A CREE TO CUR THE MAN REALLY JUST SEEMED TO BE EXPRESSING HIS FRUSTRATION AT THIS STUMPER OF MY MISMATCHED FACE AND NECK"



# sample_file = '/Users/parthmalik/Desktop/GITHUB/Automated_podcast_sumarization__and_highlight_generation/BEN_GENERATED_TRANS/39.txt'

# with open(sample_file, 'r') as file:
#     sample = file.read()


# # print(preprocess_text(x, contractions_dict=contraction_mapping))
# print(preprocess_text(sample, contractions_dict=contraction_mapping))

print('\n')

In [6]:
output_dir = 'PREPROCESSING_2'
os.makedirs(output_dir, exist_ok=True)

# INCLUSIVE

# # Ben index 39 to 186  -> 0 to 147

# # TAL index 187 to 344 -> 148 to 305

# # Huberman index 345 to 543 -> 306  to  502


In [7]:

def process_and_save(df, start_index, end_index, contractions_dict, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    # Loop through the specified range of the DataFrame
    for i, (index, row) in enumerate(df.iloc[start_index:end_index].iterrows(), start=start_index):
        # Print which 'True_Index' and DataFrame's index is currently being processed
        print(f"Processing True_Index: {row['True_Index']} at DataFrame Index: {row['File_Index']}, Title: {row['title']},...")

        # Preprocess the text
        preprocessed_text = preprocess_text(row['generated_transcripts'], contractions_dict)
        
        # Define the output filename using 'True_Index'
        filename = f"{row['True_Index']}.txt"
        file_path = os.path.join(output_dir, filename)
        
        # Write the preprocessed text to a file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(preprocessed_text)
        
        # Print confirmation after processing
        print(f"Processed and saved: {filename} (True_Index: {row['True_Index']}, Title: {row['title']}, DataFrame Index: {'File_Index'})")





In [9]:
df[['True_Index', 'title']].iloc[306:]

Unnamed: 0,True_Index,title
306,306,How Placebo Effects Work to Change Our Biology...
307,307,"AMA #16: Sleep, Vertigo, TBI, OCD, Tips for Tr..."
308,308,Dr. Becky Kennedy: Protocols for Excellent Par...
309,309,Dr. Mark D'Esposito: How to Optimize Cognitive...
310,310,How to Improve Oral Health & Its Critical Role...
...,...,...
498,498,"How to Defeat Jetlag, Shift Work & Sleeplessness"
499,499,"Using Science to Optimize Sleep, Learning & Me..."
500,500,Be More Alert When Awake
501,501,How Your Brain Works & Changes


In [10]:
start_index = 306
end_index = 503 # excusive i.e will process tile ending index - 1 


process_and_save(df, start_index, end_index, contractions_dict, output_dir)

Processing True_Index: 306 at DataFrame Index: 345, Title: How Placebo Effects Work to Change Our Biology & Psychology,...
Processed and saved: 306.txt (True_Index: 306, Title: How Placebo Effects Work to Change Our Biology & Psychology, DataFrame Index: File_Index)
Processing True_Index: 307 at DataFrame Index: 346, Title: AMA #16: Sleep, Vertigo, TBI, OCD, Tips for Travelers, Gut-Brain Axis & More,...
Processed and saved: 307.txt (True_Index: 307, Title: AMA #16: Sleep, Vertigo, TBI, OCD, Tips for Travelers, Gut-Brain Axis & More, DataFrame Index: File_Index)
Processing True_Index: 308 at DataFrame Index: 347, Title: Dr. Becky Kennedy: Protocols for Excellent Parenting & Improving Relationships of All Kinds,...
Processed and saved: 308.txt (True_Index: 308, Title: Dr. Becky Kennedy: Protocols for Excellent Parenting & Improving Relationships of All Kinds, DataFrame Index: File_Index)
Processing True_Index: 309 at DataFrame Index: 348, Title: Dr. Mark D'Esposito: How to Optimize Cogni