In [37]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

In [38]:
# NLTK resources fetched for this notebook, downloaded in this order.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')

download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]

# Keep the cell's displayed value: the result of the final download.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [39]:
# Mount Google Drive at /content/drive; the dataset paths used when loading
# the CSVs live under "/content/drive/My Drive/...".
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, which is why this import is kept next to its single use — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
# Load the two source datasets (paths live under the /content/drive mount point)
file1_path = "/content/drive/My Drive/EECS 487/Project/Combined Data.csv"
file2_path = "/content/drive/My Drive/EECS 487/Project/sentiment-mental-health.csv"

# Standardizing column names: df1's 'Unnamed: 0' column becomes 'unique_id'
df1 = pd.read_csv(file1_path).rename(columns={'Unnamed: 0': 'unique_id'})
df2 = pd.read_csv(file2_path)

# Assign fresh ids to df2. Starting at len(df1) is only collision-free when
# df1's ids are exactly 0..len(df1)-1, so also step past the largest id that
# df1 actually contains. When df1's ids are 0..len(df1)-1 this yields the same
# numbering as before.
first_new_id = len(df1)
if (
    'unique_id' in df1.columns
    and pd.api.types.is_numeric_dtype(df1['unique_id'])
    and df1['unique_id'].notna().any()
):
    first_new_id = max(first_new_id, int(df1['unique_id'].max()) + 1)

# assign() returns a new frame instead of mutating the freshly loaded df2
df2 = df2.assign(unique_id=range(first_new_id, first_new_id + len(df2)))

# Concatenating datasets
# NOTE(review): rows are not de-duplicated. If the two files overlap, the same
# statement can land on both sides of the later train/test split — confirm the
# sources are disjoint.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the unified dataset
unified_file_path = "unified_dataset.csv"
combined_df.to_csv(unified_file_path, index=False)

In [41]:
# Strips every character outside ASCII letters/digits from a token.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')

# Lazily filled cache so the stop-word list is read and the lemmatizer is
# built once, not once per row when the function is used with .apply().
_preprocess_cache = {}


def _get_stop_words_and_lemmatizer():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _preprocess_cache:
        _preprocess_cache['stop_words'] = frozenset(stopwords.words('english'))
        _preprocess_cache['lemmatizer'] = WordNetLemmatizer()
    return _preprocess_cache['stop_words'], _preprocess_cache['lemmatizer']


def preprocess_text(text):
    """Normalize one raw statement into a space-separated string of lemmas.

    Steps: lowercase, split into sentences and then words, keep only tokens
    for which ``str.isalnum()`` is true, strip any remaining non-ASCII
    characters from each kept token (so "café" becomes "caf"), drop English
    stop words, and lemmatize what is left.

    Args:
        text: The raw statement. Anything that is not a ``str`` (for example
            the float NaN pandas uses for missing values) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when the input is not a string or no
        token survives the filtering.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_stop_words_and_lemmatizer()

    tokens = []
    for sentence in sent_tokenize(text.lower()):
        for word in word_tokenize(sentence):
            if not word.isalnum():
                continue
            word = _NON_ALNUM_RE.sub('', word)
            # A token made only of non-ASCII alphanumerics is now empty;
            # keeping it would leave doubled spaces in the joined output.
            if word and word not in stop_words:
                tokens.append(lemmatizer.lemmatize(word))

    return ' '.join(tokens)

# Apply preprocessing: replace the raw 'statement' column with 'cleaned_statement'.
# The guard keeps the cell re-runnable: once 'statement' has been dropped, a
# second execution is a no-op instead of raising KeyError.
if 'statement' in combined_df.columns:
    combined_df = (
        combined_df
        .assign(cleaned_statement=combined_df['statement'].apply(preprocess_text))
        .drop(columns=['statement'])
    )
elif 'cleaned_statement' not in combined_df.columns:
    # Neither the input nor the output column exists: fail loudly here.
    raise KeyError("combined_df has neither a 'statement' nor a 'cleaned_statement' column")

# Display the first few rows of the cleaned dataset
combined_df.head()

Unnamed: 0,unique_id,status,cleaned_statement
0,0,Anxiety,oh gosh
1,1,Anxiety,trouble sleeping confused mind restless heart ...
2,2,Anxiety,wrong back dear forward doubt stay restless re...
3,3,Anxiety,shifted focus something else still worried
4,4,Anxiety,restless restless month boy mean


In [42]:
# One seed for every stochastic step in this cell.
RANDOM_STATE = 42

# Stratified 80/10/10 split on 'status': hold out 20%, then halve the hold-out
# into validation and test sets.
train_df, temp_df = train_test_split(
    combined_df, test_size=0.2, random_state=RANDOM_STATE, stratify=combined_df['status']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=RANDOM_STATE, stratify=temp_df['status']
)

# Every row must land in exactly one of the three splits.
assert len(train_df) + len(val_df) + len(test_df) == len(combined_df)

train_df.to_csv("train_dataset.csv", index=False)
val_df.to_csv("val_dataset.csv", index=False)
test_df.to_csv("test_dataset.csv", index=False)

# Implementing K-Fold Cross-Validation (k=5).
# The folds are stratified on 'status' so they keep the same class balance as
# the stratified train/val/test split above; plain KFold ignores the labels.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
folds = []
for train_index, val_index in kf.split(train_df, train_df['status']):
    train_fold = train_df.iloc[train_index]
    val_fold = train_df.iloc[val_index]
    folds.append((train_fold, val_fold))

In [43]:
# Persist the cleaned dataset to CSV, leaving the DataFrame index out of the file
preprocessed_file_path = "preprocessed_dataset.csv"
combined_df.to_csv(path_or_buf=preprocessed_file_path, index=False)

# Preview the top five rows of the frame that was just written
combined_df.head(n=5)

Unnamed: 0,unique_id,status,cleaned_statement
0,0,Anxiety,oh gosh
1,1,Anxiety,trouble sleeping confused mind restless heart ...
2,2,Anxiety,wrong back dear forward doubt stay restless re...
3,3,Anxiety,shifted focus something else still worried
4,4,Anxiety,restless restless month boy mean
