# **Creating a unified dataset**

## **Import necessary modules**

In [1]:
# Reading, reshaping, and writing the tabular CSV sources
import pandas as pd  

# Writing the unified dataset to JSON and reading it back
import json  

# Splitting the raw text files on their ---Title--- / ---Data--- markers
import re  

In [1]:
# Read the raw conversation export into a DataFrame
csv_path = "mental_health_conversation2.csv"  # adjust if the file lives elsewhere
df = pd.read_csv(csv_path)

# Keep only the question-title / answer-text pair; a missing column raises KeyError
selected_columns = df.loc[:, ["questionTitle", "answerText"]]

# Write the trimmed table to a separate CSV, without the index column
new_csv_path = "mental_health_conversation.csv"
selected_columns.to_csv(path_or_buf=new_csv_path, index=False)

print(f"New CSV file saved as '{new_csv_path}' with selected columns!")


New CSV file saved as 'mental_health_conversation.csv' with selected columns!


In [2]:
# Read the raw FAQ export into a DataFrame
csv_path = "Mental_Health_FAQ.csv"  # adjust if the file lives elsewhere
df = pd.read_csv(csv_path)

# Keep only the Questions / Answers pair; a missing column raises KeyError
selected_columns = df.loc[:, ["Questions", "Answers"]]

# Write the trimmed table to a separate CSV, without the index column
new_csv_path = "Mental_Health_FAQs.csv"
selected_columns.to_csv(path_or_buf=new_csv_path, index=False)

print(f"New CSV file saved as '{new_csv_path}' with selected columns!")


New CSV file saved as 'Mental_Health_FAQs.csv' with selected columns!


In [3]:
# Read the training CSV
csv_path = "train.csv"  # adjust if the file lives elsewhere
df = pd.read_csv(csv_path)

# Map the Context/Response headers onto the Questions/Answers names.
# rename() skips labels that are absent by default, so re-running this cell
# against the already-rewritten file leaves the headers as they are.
df = df.rename({"Context": "Questions", "Response": "Answers"}, axis="columns")

# Write back to the same path (the original headers on disk are replaced)
df.to_csv(path_or_buf=csv_path, index=False)

print(f"Column names updated successfully in '{csv_path}'!")


Column names updated successfully in 'train.csv'!


In [4]:
# Read the trimmed conversation CSV
csv_path = "mental_health_conversation.csv"  # adjust if the file lives elsewhere
df = pd.read_csv(csv_path)

# Map the questionTitle/answerText headers onto the Questions/Answers names.
# rename() skips labels that are absent by default, so re-running this cell
# against the already-rewritten file leaves the headers as they are.
df = df.rename({"questionTitle": "Questions", "answerText": "Answers"}, axis="columns")

# Write back to the same path (the original headers on disk are replaced)
df.to_csv(path_or_buf=csv_path, index=False)

print(f"Column names updated successfully in '{csv_path}'!")


Column names updated successfully in 'mental_health_conversation.csv'!


## **Storing all the data in a unified dataset**

In [1]:
# Parse marker-delimited text into Title/Content records
def process_text_file(text):
    """Split marker-delimited text into a list of Title/Content dicts.

    Each record begins at a ``---Title---`` marker; inside a record a single
    ``---Data---`` marker separates the title from its content. Text that
    precedes the first title marker is discarded, and any record that does not
    split into exactly two parts on ``---Data---`` is skipped.

    Args:
        text: Full contents of a text source as one string.

    Returns:
        A list of ``{"Title": ..., "Content": ...}`` dicts in order of
        appearance, with surrounding whitespace stripped from both fields.
    """
    # Chunk 0 is whatever sits before the first title marker, so set it aside.
    _preamble, *chunks = re.split(r"---Title---", text)

    records = []
    for chunk in chunks:
        pieces = re.split(r"---Data---", chunk)
        if len(pieces) != 2:
            # Zero or several data markers: not a clean title/content pair.
            continue
        title, content = (piece.strip() for piece in pieces)
        records.append({"Title": title, "Content": content})

    return records

# Function to process CSV file
def process_csv_file(csv_path):
    """Load a CSV and return its rows as a list of dicts.

    The ``Questions`` and ``Answers`` columns are renamed to ``Title`` and
    ``Content``; any other columns keep their names.

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) of the CSV to load.

    Returns:
        A list with one dict per row, keyed by column name. Cells that pandas
        reads as missing are returned as ``None``.
    """
    df = pd.read_csv(csv_path)
    records = df.rename(columns={"Questions": "Title", "Answers": "Content"}).to_dict(orient="records")

    # pandas represents an empty cell as float NaN, which json.dump writes as
    # the bare token NaN -- not valid JSON. None serialises as null instead.
    return [
        {key: (None if pd.isna(value) else value) for key, value in record.items()}
        for record in records
    ]

# Read one UTF-8 text source and parse it into Title/Content records
def _load_text_entries(path):
    """Return process_text_file's records for the UTF-8 text file at `path`."""
    with open(path, "r", encoding="utf-8") as text_file:
        return process_text_file(text_file.read())

# Load and process text data (raw file contents are released once parsed,
# rather than staying in kernel memory as extra globals)
text_entries1 = _load_text_entries("NCERT_class11.txt")
text_entries2 = _load_text_entries("NCERT_class12.txt")
text_entries3 = _load_text_entries("NOBA_book1_content.txt")
text_entries4 = _load_text_entries("psychology_today_articles.txt")

# Load and process CSV data
csv_entries1 = process_csv_file("mental_health_conversation.csv")
csv_entries2 = process_csv_file("Mental_Health_FAQs.csv")
csv_entries3 = process_csv_file("train.csv")

# Merge all seven sources: text sources first, then CSV sources, each in file order
unified_data = [
    *text_entries1,
    *text_entries2,
    *text_entries3,
    *text_entries4,
    *csv_entries1,
    *csv_entries2,
    *csv_entries3,
]

# Save as JSON (ensure_ascii=False keeps non-ASCII characters readable in the file)
with open("unified_dataset.json", "w", encoding="utf-8") as json_file:
    json.dump(unified_data, json_file, ensure_ascii=False, indent=4)

print("Unified dataset created successfully!")

Unified dataset created successfully!


In [6]:
# Read the unified dataset back from disk (a JSON array is expected)
with open("unified_dataset.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Handle the unexpected shape first; otherwise report the size and preview 10 records
if not isinstance(data, list):
    print("Warning: JSON data is not a list. Showing full content:")
    print(data)
else:
    print(f"Total items: {len(data)}")
    print("\nFirst 10 items:")
    for i, entry in enumerate(data[:10], start=1):
        print(f"\nItem {i}:")
        print(entry)

Total items: 5700

First 10 items:

Item 1:
{'Title': 'WHAT IS PSYCHOLOGY?', 'Content': 'Any knowledge discipline is hard to define. Firstly, because it evolves continuously. Secondly, because the range of phenomena it studies cannot be captured by any one definition. This is even more true of psychology. Long time back, students like yourself were told that the term psychology is derived from two Greek words psyche meaning soul and logos meaning science or study of a subject. Thus, psychology was a study of the soul or mind. But since then it has moved away considerably from this focus and established itself as a scientific discipline which deals with processes underlying human experience and behaviour. The range of phenomena it studies, some of which we mentioned above, are spread over several levels, viz. individual, dyadic (two person) group, and organisational. They also have biological as well as social bases. Naturally, therefore, the methods required to study them also vary gre