In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [2]:
dataset_path = 'womens_journal_entries.csv'  
balanced_final_dataset = pd.read_csv(dataset_path)

In [9]:
print("Dataset Preview:")
print(balanced_final_dataset.head())

Dataset Preview:
                                          statements status
0  It's Monday. I feel like I'm going to have a g...  happy
1  I'm so grateful to have such a wonderful mothe...  happy
2  I am not sure if I should tell you this, but I...    sad
3  I can't believe I've been on this diet for 2 w...    sad
4  I took a shower today for the first time in 3 ...  happy


In [10]:
# Check for missing values in the dataset
print("\nMissing Values:")
print(balanced_final_dataset.isnull().sum())


Missing Values:
statements    0
status        0
dtype: int64


In [11]:
# Get basic information about the dataset (e.g., data types, number of entries)
print("\nDataset Information:")
balanced_final_dataset.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5002 entries, 0 to 5001
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   statements  5002 non-null   object
 1   status      5002 non-null   object
dtypes: object(2)
memory usage: 78.3+ KB


In [12]:
print("\nLabel Distribution:")
print(balanced_final_dataset['status'].value_counts())


Label Distribution:
status
sad      2591
happy    2411
Name: count, dtype: int64


In [13]:
# Drop any rows with missing values (if any exist)
balanced_final_dataset.dropna(inplace=True)

# Basic text cleaning function (converts to lowercase, removes special characters)
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove special characters
    return text

# Apply the cleaning function to the text column
balanced_final_dataset['statements'] = balanced_final_dataset['statements'].apply(clean_text)

# Verify the cleaning process
print("\nCleaned Text Sample:")
print(balanced_final_dataset['statements'].head())



Cleaned Text Sample:
0    its monday i feel like im going to have a good...
1    im so grateful to have such a wonderful mother...
2    i am not sure if i should tell you this but i ...
3    i cant believe ive been on this diet for 2 wee...
4    i took a shower today for the first time in 3 ...
Name: statements, dtype: object


In [30]:
from sklearn.model_selection import train_test_split

# Define the features (X) and labels (y)
X = balanced_final_dataset['statements']
y = balanced_final_dataset['status']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the training and testing sets
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 4001 samples
Test set size: 1001 samples


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data using TF-IDF (Term Frequency - Inverse Document Frequency)
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.85, stop_words='english')

# Fit the vectorizer on the training data and transform both the train and test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Check the dimensions of the TF-IDF vectors
print(f"TF-IDF training data shape: {X_train_tfidf.shape}")
print(f"TF-IDF test data shape: {X_test_tfidf.shape}")


TF-IDF training data shape: (4001, 3586)
TF-IDF test data shape: (1001, 3586)


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=50)
rf_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
rf_pred = rf_model.predict(X_test_tfidf)

# Calculate the accuracy of the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")


Random Forest Accuracy: 0.78


In [35]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model
lr_model = LogisticRegression(random_state=60, max_iter=200)
lr_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
lr_pred = lr_model.predict(X_test_tfidf)

# Calculate the accuracy of the Logistic Regression model
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")


Logistic Regression Accuracy: 0.80


In [23]:
from sklearn.tree import DecisionTreeClassifier

# Train the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=60)
dt_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
dt_pred = dt_model.predict(X_test_tfidf)

# Calculate the accuracy of the Decision Tree model
dt_accuracy = accuracy_score(y_test, dt_pred)
print(f"Decision Tree Accuracy: {dt_accuracy:.2f}")


Decision Tree Accuracy: 0.71
