# In [None]:
import pandas as pd
import re
import emoji
import os
import csv

class TweetDataLoader:
    """Load tab-separated tweet files, clean a text column, and export CSVs.

    The instance keeps running totals of how many substitutions each cleaning
    step performed. The totals accumulate over every text processed by the
    same instance and are never reset, so counts reported after processing
    several files are cumulative.
    """

    # Compiled once for the class instead of on every call.
    # NOTE: several ranges are very broad (e.g. U+24C2-U+1F251 and
    # U+10000-U+10FFFF), so this also strips non-emoji characters that fall
    # inside them, such as CJK ideographs.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"  # zero-width joiner
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # variation selector-16
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )

    def __init__(self, data_folder):
        """Remember the base folder and zero all cleaning counters.

        Args:
            data_folder: Directory that ``load_data`` joins file names onto.
        """
        self.data_folder = data_folder
        self.total_link_count = 0
        self.total_word_with_num_count = 0
        self.total_whitespace_removed_count = 0
        self.total_punctuation_removed_count = 0
        self.total_emoji_removed_count = 0

    def load_data(self, file_name, columns):
        """Read a TSV file located under ``data_folder`` into a DataFrame.

        The first line of the file is skipped and ``columns`` are used as the
        column names. If pandas cannot parse the file, the file is re-read
        line by line with ``skip_problematic_lines`` (in that case every
        value is a string).

        Args:
            file_name: Path of the file relative to ``data_folder``.
            columns: Column names to assign.

        Returns:
            The loaded DataFrame, or ``None`` when the file does not exist.
        """
        file_path = os.path.join(self.data_folder, file_name)
        try:
            return pd.read_csv(file_path, sep='\t', names=columns, skiprows=1)
        except pd.errors.ParserError as e:
            print(f"Error parsing file {file_path}: {e}")
            print("Attempting to skip problematic lines and continue loading...")
            return self.skip_problematic_lines(file_path, columns)
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            return None

    def skip_problematic_lines(self, file_path, columns):
        """Build a DataFrame from the rows that have exactly ``len(columns)`` fields.

        The first line is treated as the header and dropped, as is any later
        line identical to it. Rows with a different number of tab-separated
        fields are reported on stdout and skipped.

        Args:
            file_path: Full path of the UTF-8 encoded TSV file.
            columns: Column names; their count defines a valid row.

        Returns:
            A DataFrame of string values with ``columns`` as its columns.
        """
        expected = len(columns)
        rows = []
        with open(file_path, 'r', encoding='utf-8') as file:
            header = file.readline().strip()  # first line is the header
            for line in file:
                stripped = line.strip()
                if stripped == header:
                    continue  # repeated header row
                # Strip only the line terminator: a full strip() would also
                # eat leading/trailing tabs and so drop empty edge fields.
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) == expected:
                    rows.append(fields)
                else:
                    print(f"Skipping line with unexpected number of fields: {stripped}")
        return pd.DataFrame(rows, columns=columns)

    def preprocess_data(self, df, text_column):
        """Clean ``text_column`` and drop the ``tweet_url`` column if present.

        The text column of the DataFrame passed in is overwritten in place.

        Args:
            df: DataFrame to clean, or ``None``.
            text_column: Name of the column holding the raw text.

        Returns:
            ``None`` when ``df`` is ``None``; otherwise a tuple of the cleaned
            DataFrame and the cumulative counts from ``get_total_counts``.
        """
        if df is None:
            return None
        df[text_column] = df[text_column].apply(self.preprocess_text)
        if 'tweet_url' in df.columns:
            df = df.drop(columns=['tweet_url'])
        return df, self.get_total_counts()

    def preprocess_text(self, text):
        """Clean one piece of text and add to the running counters.

        Steps, in order: drop parenthesised spans, URLs, emojis, punctuation,
        tokens containing a digit, then collapse whitespace runs to a single
        space. Letter case is preserved.

        Args:
            text: Raw text. Missing values (``None``/NaN) yield ``""``; other
                non-string values are converted with ``str``.

        Returns:
            The cleaned text with no leading or trailing whitespace.
        """
        if not isinstance(text, str):
            # Empty cells come back from pandas as NaN/None; re.sub would
            # raise TypeError on them.
            if pd.isna(text):
                return ""
            text = str(text)

        text = re.sub(r'\([^)]*\)', '', text)
        text, link_count = re.subn(r'https?:\/\/\S+', '', text)
        # Emojis must go before punctuation: `[^\w\s]` also matches emoji
        # characters, which would otherwise be counted as punctuation and
        # leave the emoji counter at zero.
        text, emoji_removed_count = self.remove_emojis(text)
        text, punctuation_removed_count = re.subn(r'[^\w\s]', '', text)
        text, word_with_num_count = re.subn(r'\w*\d\w*', '', text)
        # Counts every whitespace run that was normalised, single spaces included.
        text, whitespace_removed_count = re.subn(r'\s+', ' ', text.strip())

        self.total_link_count += link_count
        self.total_word_with_num_count += word_with_num_count
        self.total_whitespace_removed_count += whitespace_removed_count
        self.total_punctuation_removed_count += punctuation_removed_count
        self.total_emoji_removed_count += emoji_removed_count

        return text.strip()

    def remove_emojis(self, text):
        """Strip every run of characters matched by the emoji pattern.

        Args:
            text: String to clean.

        Returns:
            A tuple ``(cleaned_text, runs_removed)`` where ``runs_removed`` is
            the number of contiguous matched runs (not individual characters).
        """
        return self._EMOJI_PATTERN.subn('', text)

    def get_total_counts(self):
        """Return the cumulative counters as a one-row DataFrame."""
        return pd.DataFrame({
            "Link count": [self.total_link_count],
            "Word remove with number count": [self.total_word_with_num_count],
            "Whitespace removed count": [self.total_whitespace_removed_count],
            "Punctuation removed count": [self.total_punctuation_removed_count],
            "Emoji removed count": [self.total_emoji_removed_count],
        })

    def preprocess_and_save(data_loader, file_name, columns, text_column):
        """Load one TSV file, clean it, and write the result as CSV.

        The output path is ``file_name`` with ``.tsv`` replaced by
        ``_preprocessed.csv``; it is opened as given, i.e. relative to the
        current working directory rather than to ``data_folder``. The header
        row written is always ``tweet_id, tweet_text, class_label``
        regardless of ``columns``. Nothing is written when the input file
        cannot be found.

        Args:
            data_loader: The ``TweetDataLoader`` instance to use (takes the
                place of ``self``).
            file_name: Input path relative to the loader's ``data_folder``.
            columns: Column names for the input file.
            text_column: Name of the column to clean.
        """
        df = data_loader.load_data(file_name, columns)
        if df is None:
            return

        df_preprocessed, total_counts_df = data_loader.preprocess_data(df, text_column)

        output_file_name = file_name.replace('.tsv', '_preprocessed.csv')
        with open(output_file_name, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['tweet_id', 'tweet_text', 'class_label'])  # predefined column names
            # Plain tuples avoid building a Series per row as iterrows does.
            writer.writerows(df_preprocessed.itertuples(index=False, name=None))
        print(f"Preprocessed file saved: {output_file_name}")

        print("Total Counts:")
        print(total_counts_df)
                


# Driver: clean every .tsv file found in each language sub-folder.
data_folder = "C:\\Users\\ASUS\\Downloads\\Data minning project\\Data_Minning_Project\\Preprocessing"

loader = TweetDataLoader(data_folder)

# Per-language input folders and the column layout of their files.
english_folder = os.path.join(data_folder, "english")
english_columns = ['Sentence_id', 'Text', 'class_label']

spanish_folder = os.path.join(data_folder, "spanish")
spanish_columns = ['tweet_id', 'tweet_url', 'tweet_text', 'class_label']

dutch_folder = os.path.join(data_folder, "dutch")
dutch_columns = ['tweet_id', 'tweet_url', 'tweet_text', 'class_label']

arabic_folder = os.path.join(data_folder, "arabic")
arabic_columns = ['tweet_id', 'tweet_url', 'tweet_text', 'class_label']

# (sub-folder name, folder path, column names, text column), processed in order.
_language_jobs = (
    ("english", english_folder, english_columns, 'Text'),
    ("spanish", spanish_folder, spanish_columns, 'tweet_text'),
    ("dutch", dutch_folder, dutch_columns, 'tweet_text'),
    ("arabic", arabic_folder, arabic_columns, 'tweet_text'),
)

for _subdir, _folder, _columns, _text_column in _language_jobs:
    for file_name in os.listdir(_folder):
        if not file_name.endswith(".tsv"):
            continue
        # File names are passed relative to data_folder, e.g. "english/x.tsv".
        TweetDataLoader.preprocess_and_save(
            loader, os.path.join(_subdir, file_name), _columns, _text_column
        )
