# Basic CSV Cleaning

In [1]:
import os
import pandas as pd

In [None]:
# Dataset folders, given relative to the current working directory.
conv_train = r"../datasets/Conversational Training/"
health_monitor = r"../datasets/Health Monitoring/"
mh_predict = r"../datasets/Mental Health Prediction/"

# os.listdir() returns entries in arbitrary order, so list the folder once and
# sort it: "first" and "second" file are then stable across runs and platforms.
conv_train_files = sorted(os.listdir(conv_train))
loc_1 = os.path.join(conv_train, conv_train_files[0])
loc_2 = os.path.join(conv_train, conv_train_files[1])

print(loc_1, loc_2)

In [None]:
import re, csv

# Matches any run of characters other than ASCII letters, digits, whitespace
# and the punctuation marks . , ! ? ; : ' " -
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s.,!?;:\'"-]+')


def remove_special_characters(text):
    """Return *text* with every disallowed character removed.

    Only ASCII letters, digits, whitespace and the punctuation marks
    ``. , ! ? ; : ' " -`` are kept; everything else (including underscores
    and non-ASCII letters) is deleted without a replacement.
    """
    return _DISALLOWED_CHARS.sub('', text)

# Strip disallowed characters from every cell of the CSV at `loc_2` (set in an
# earlier cell) and write the result to a new CSV in the working directory.
input_file = loc_2
output_file = "output.csv"

# Both handles are opened with newline='': the csv module does its own newline
# handling, and without it on the reader, line breaks embedded in quoted
# fields are not interpreted correctly.
with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
     open(output_file, 'w', newline='', encoding='utf-8') as outfile:

    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for row in reader:
        # Clean cell by cell so the column layout of each row is preserved.
        writer.writerow([remove_special_characters(cell) for cell in row])

print(f"Cleaned data written to '{output_file}'")

# CSV cleaning (grouping, sorting, and comment labelling)

In [44]:
import pandas as pd

# Labelled dataset to regroup. The first line of the file is consumed as a
# header and the columns are renamed to "text" and "label".
file_loc_x = r"C:\Users\parvs\VSC Codes\Python-root\_Projects_Personal\mindEase_v2\scripts\training\Intent Recognition\Mediatory Saves\processed_dataset.csv"
df = pd.read_csv(file_loc_x, header=0, names=["text", "label"])

# Frequency table: one row per distinct label with its number of occurrences.
label_counts = (
    df['label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='total_amount')
)

# Labels ordered from least to most frequent.
sorted_labels = label_counts.sort_values(by='total_amount', ascending=True)['label']

# Emit each label's rows as one contiguous run, followed by a single empty
# row that separates it from the next label's run.
modified_data = []
for label in sorted_labels:
    rows_for_label = df.loc[df['label'] == label].values.tolist()
    modified_data += rows_for_label + [[None, None]]

modified_df = pd.DataFrame(modified_data, columns=["text", "label"])
modified_df.to_csv("your_modified_dataset.csv", index=False)

## Count Labels in the Dataset

In [None]:
import pandas as pd
from pprint import pprint

# Grouped dataset whose label distribution is reported below.
file_loc_x = r"C:\Users\parvs\VSC Codes\Python-root\_Projects_Personal\mindEase_v2\scripts\training\Intent Recognition\Mediatory Saves\your_modified_dataset.csv"
df = pd.read_csv(file_loc_x, header=0, names=["text", "label"])

# One row per distinct label with the number of rows carrying it
# (value_counts ignores rows whose label is missing).
label_counts = (
    df['label'].value_counts().rename_axis('label').reset_index(name='total_amount')
)

# Lift the display row limit so the full table is printed, not a truncated
# preview. This changes the pandas option for the rest of the session.
pd.set_option('display.max_rows', None)

# Rarest labels first.
sorted_label_counts = label_counts.sort_values(by='total_amount', ascending=True)

pprint(sorted_label_counts)

## Dataset Balancing

In [None]:
import pandas as pd
from pprint import pprint

# Load the CSV into a Pandas DataFrame
file_loc_x = r"C:\Users\parvs\VSC Codes\Python-root\_Projects_Personal\mindEase_v2\scripts\training\Intent Recognition\Mediatory Saves\your_modified_dataset.csv"
df = pd.read_csv(file_loc_x, header=0, names=["text", "label"])

# Clean the labels: remove preceding/trailing spaces and convert to lowercase
df['label'] = df['label'].str.strip().str.lower()

# Count the total occurrences of each label
label_counts = df['label'].value_counts().reset_index()
label_counts.columns = ['label', 'total_amount']

# Maximum number of rows kept per label in the balanced dataset.
max_per_label = 20

# Collect one frame per label and concatenate once at the end, instead of
# growing a DataFrame inside the loop.
balanced_parts = []
for label, count in zip(label_counts['label'], label_counts['total_amount']):
    label_rows = df[df['label'] == label]
    if count > max_per_label:
        # Over-represented label: down-sample with a fixed seed so the
        # selection is reproducible.
        label_rows = label_rows.sample(n=max_per_label, random_state=42)
    elif count < max_per_label:
        # Under-represented label: report how many more examples it needs.
        print(f"{label}: {max_per_label - count}")
    # Labels with exactly `max_per_label` rows are kept unchanged; previously
    # they matched neither branch and were dropped from the output.
    balanced_parts.append(label_rows)

if balanced_parts:
    processed_df = pd.concat(balanced_parts, ignore_index=True)
else:
    # No labelled rows at all: still write a file with just the header.
    processed_df = pd.DataFrame(columns=["text", "label"])

# Save the modified DataFrame to a new CSV file
processed_df.to_csv(r"C:\Users\parvs\VSC Codes\Python-root\_Projects_Personal\mindEase_v2\datasets\Intent Training\balanced_dataset.csv", index=False)

print("Dataset processed and saved to 'balanced_dataset.csv'")

## Remove all rows whose text starts with "#" (comment rows)

In [30]:
import pandas as pd

# The filtered data is written back to the same file it was read from.
dataset_path = r"C:\Users\parvs\VSC Codes\Python-root\_Projects_Personal\mindEase_v2\scripts\training\Intent Recognition\Mediatory Saves\processed_dataset.csv"

df = pd.read_csv(dataset_path)

# True for rows whose text begins with "#". Rows with missing text are
# treated as non-matching (na=False) and are therefore kept.
is_comment_row = df["text"].str.startswith("#", na=False)
df = df.loc[~is_comment_row]

df.to_csv(dataset_path, index=False)