# In [None]:
import unicodedata
import pandas as pd
import re

class TweetDataLoader:
    """Load tab-separated tweet files and clean their ``Text`` column.

    The loader keeps running totals of what ``preprocess_text`` strips
    (links, punctuation, words containing digits, ...) so they can be
    reported with ``print_total_counts`` once a whole DataFrame has been
    processed.
    """

    # Compiled once at class creation instead of on every call.
    _BRACKETED_RE = re.compile(r'\([^)]*\)')
    _LINK_RE = re.compile(r'https?://\S+')
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    _WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')
    # Whitespace is excluded so that spaces are neither counted nor removed
    # as "non-English" characters.
    _NON_ENGLISH_RE = re.compile(r'[^A-Za-z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, data_folder):
        """Remember the data folder and reset every running counter to zero.

        Args:
            data_folder: Directory that ``load_data`` prefixes to file names.
        """
        self.data_folder = data_folder
        self.total_link_count = 0
        self.total_word_with_num_count = 0
        self.total_whitespace_removed_count = 0
        self.total_punctuation_removed_count = 0
        self.total_non_english_alphabet_removed_count = 0

    # Alias for the original (misspelled) initialiser name, kept so that
    # ``loader.init(folder)`` calls on an existing instance keep working.
    init = __init__

    def load_data(self, file_name):
        """Read ``file_name`` from the data folder as a tab-separated file.

        If pandas raises a ``ParserError``, the file is re-read line by line
        with ``skip_problematic_lines`` and malformed rows are dropped.

        Args:
            file_name: Name of the file inside ``self.data_folder``.

        Returns:
            A DataFrame with the file contents.
        """
        file_path = f"{self.data_folder}/{file_name}"
        try:
            return pd.read_csv(file_path, sep='\t')
        except pd.errors.ParserError as e:
            print(f"Error parsing file {file_path}: {e}")
            print("Attempting to skip problematic lines and continue loading...")
            return self.skip_problematic_lines(file_path)

    def skip_problematic_lines(self, file_path):
        """Load a tab-separated file, dropping rows with a wrong field count.

        The first line is taken as the header. Every later line is kept only
        if it has exactly as many tab-separated fields as the header; other
        lines are reported on stdout and skipped.

        Args:
            file_path: Path of the UTF-8 encoded file to read.

        Returns:
            A DataFrame whose columns are the header fields. All values are
            strings because no type inference is performed here.
        """
        rows = []
        with open(file_path, 'r', encoding='utf-8') as file:
            # Only the line terminator is stripped: a full strip() would also
            # eat trailing tabs and make rows with an empty last field look
            # one field short.
            header = file.readline().rstrip('\r\n').split('\t')
            expected = len(header)
            for line in file:
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) == expected:
                    rows.append(fields)
                else:
                    print(f"Skipping line with unexpected number of fields: {line.strip()}")
        return pd.DataFrame(rows, columns=header)

    def preprocess_data(self, df):
        """Clean the ``Text`` column in place and report counts and nulls.

        Args:
            df: DataFrame with ``Text``, ``Sentence_id`` and ``class_label``
                columns.

        Returns:
            The same DataFrame object, with ``Text`` replaced by its cleaned
            version.

        Raises:
            KeyError: If one of the three columns is missing.
        """
        df['Text'] = df['Text'].apply(self.preprocess_text)

        self.print_total_counts()

        for column in ('Text', 'Sentence_id', 'class_label'):
            null_count = df[column].isnull().sum()
            print(f"Null values in '{column}' column after preprocessing: {null_count}")

        return df

    def preprocess_text(self, text):
        """Normalise one tweet and add what was stripped to the running totals.

        Steps, in order: drop parenthesised content, lowercase, remove
        http(s) links, remove punctuation, remove words containing a digit,
        remove remaining characters that are not ASCII letters or whitespace,
        then collapse whitespace runs to single spaces.

        Args:
            text: The raw tweet. Non-string values (for example NaN from a
                missing cell) are returned unchanged and not counted.

        Returns:
            The cleaned text, or ``text`` itself when it is not a string.
        """
        if not isinstance(text, str):
            # Leave missing values alone so the null report in
            # preprocess_data still sees them.
            return text

        text = self._BRACKETED_RE.sub('', text)
        text = text.lower()
        text, link_count = self._LINK_RE.subn('', text)
        text, punctuation_removed_count = self._PUNCTUATION_RE.subn('', text)
        text, word_with_num_count = self._WORD_WITH_DIGIT_RE.subn('', text)

        # Collect a sample before removal purely for the diagnostic print.
        non_english_sample = self._NON_ENGLISH_RE.findall(text)[:50]
        text, non_english_alphabet_removed_count = self._NON_ENGLISH_RE.subn('', text)
        if non_english_sample:
            print("First 50 non-English alphabetic characters removed:", non_english_sample)

        # Done last so gaps left by the removals above are collapsed too.
        # The count is the number of whitespace runs rewritten, which
        # includes runs that were already a single space.
        text, whitespace_removed_count = self._WHITESPACE_RE.subn(' ', text.strip())

        self.total_link_count += link_count
        self.total_word_with_num_count += word_with_num_count
        self.total_whitespace_removed_count += whitespace_removed_count
        self.total_punctuation_removed_count += punctuation_removed_count
        self.total_non_english_alphabet_removed_count += non_english_alphabet_removed_count

        return text.strip()

    def print_total_counts(self):
        """Print every running total accumulated by ``preprocess_text``."""
        print("Total counts:")
        print(f"Link count: {self.total_link_count}")
        print(f"Word with number count: {self.total_word_with_num_count}")
        print(f"Whitespace removed count: {self.total_whitespace_removed_count}")
        print(f"Punctuation removed count: {self.total_punctuation_removed_count}")
        print(f"Other Languages alphabet detect: {self.total_non_english_alphabet_removed_count}")
