<a href="https://colab.research.google.com/github/RiyaBhaskar12/Personality-Detection/blob/master/PersonalityDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# prompt: Write a program for preprocessing of goemotions dataset and mbti dataset

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below: the stop-word list and WordNet
# (needed by WordNetLemmatizer).
for _nltk_package in ('stopwords', 'wordnet'):
    nltk.download(_nltk_package)

def _text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it once.

    The pair is cached on the function object so that applying
    ``preprocess_text`` to every row of a DataFrame does not reload the
    stop-word list and construct a new lemmatizer for each row.
    """
    cached = getattr(_text_resources, '_cached', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _text_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise a piece of raw text for downstream modelling.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, split on whitespace, remove English stop words, lemmatize
    each remaining token, and join the tokens with single spaces.

    Args:
        text: The raw text. ``None`` and NaN (what pandas yields for a
            missing CSV cell) are treated as empty; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Lowercase, then strip special characters and numbers.
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # split() with no arguments already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no separate cleanup is needed.
    tokens = text.split()
    if not tokens:
        return ''

    stop_words, lemmatizer = _text_resources()

    # Stop-word removal followed by lemmatization of the surviving tokens.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

def preprocess_goemotions(filepath):
    """Load the GoEmotions CSV at ``filepath`` and clean its ``text`` column.

    Args:
        filepath: Location of the CSV file (anything ``pandas.read_csv``
            accepts). The file must contain a ``text`` column.

    Returns:
        The loaded DataFrame with its ``text`` column replaced by the
        output of ``preprocess_text``, or ``None`` if the file does not
        exist (an error message is printed in that case).
    """
    try:
        # Read from the path the caller supplied rather than a fixed
        # location, so the error message below reports the path that was
        # actually tried.
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None

    df['text'] = df['text'].apply(preprocess_text)
    return df

def preprocess_mbti(filepath):
    """Load the MBTI CSV at ``filepath`` and clean its ``posts`` column.

    Args:
        filepath: Location of the CSV file (anything ``pandas.read_csv``
            accepts). The file must contain a ``posts`` column.

    Returns:
        The loaded DataFrame with its ``posts`` column replaced by the
        output of ``preprocess_text``, or ``None`` if the file does not
        exist (an error message is printed in that case).
    """
    try:
        # Read from the path the caller supplied rather than a fixed
        # location, so the error message below reports the path that was
        # actually tried.
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None

    df['posts'] = df['posts'].apply(preprocess_text)
    return df


# Example usage:
# The arguments are the locations of the two dataset CSV files; adjust them
# if your copies live somewhere else.
goemotions_processed = preprocess_goemotions('/content/go_emotions_dataset.csv')
mbti_processed = preprocess_mbti('/content/mbti_1.csv')


# Each loader returns None when its file is missing, so only report success
# for the datasets that were actually loaded and processed.
if goemotions_processed is not None:
    print("GoEmotions Dataset Preprocessing Complete.")
    # You can save the processed dataframe to a new CSV file
    # goemotions_processed.to_csv('processed_goemotions_data.csv', index=False)

if mbti_processed is not None:
    print("MBTI Dataset Preprocessing Complete.")
    # You can save the processed dataframe to a new CSV file
    # mbti_processed.to_csv('processed_mbti_data.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


GoEmotions Dataset Preprocessing Complete.
MBTI Dataset Preprocessing Complete.
