In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import joblib

In [3]:
# Fetch the NLTK resources used later in this notebook:
#   punkt_tab - tokenizer data (presumably what word_tokenize needs; confirm for your NLTK version)
#   wordnet   - data behind WordNetLemmatizer
#   stopwords - word lists read via stopwords.words('english')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to C:\Users\SHREYA
[nltk_data]     MISHRA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SHREYA
[nltk_data]     MISHRA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\SHREYA
[nltk_data]     MISHRA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Shared lemmatizer instance, used by preprocess_text.
lemmatizer = WordNetLemmatizer()
# English stopwords held in a set so membership checks are constant-time;
# used by clean_text.
stop_words = set(stopwords.words('english'))

In [5]:
# Load the dataset: one text column plus five personality-trait score columns.
# NOTE(review): some sampled rows display mojibake (e.g. "Iâ€™m"), so the file's
# text encoding may need checking - confirm against the source data.
df = pd.read_csv("data/local_ocean_synthetic.csv")

In [6]:
# (rows, columns) of the frame as loaded.
df.shape

(1160, 6)

In [7]:
# Peek at 10 random rows; no random_state is passed, so the rows differ on every run.
df.sample(10)

Unnamed: 0,Text,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism
381,Impeccability is the source of wonder.,5.1,3.2,4.1,4.7,0.4
911,I'm impressed by the author's research and ana...,4.3,4.0,3.9,4.1,3.3
281,"Those who dare, author history.",3.8,4.3,5.0,3.0,0.1
572,"Philosophy: Iâ€™m not weird, Iâ€™m a limited editi...",5.3,1.6,3.7,5.1,1.1
861,This blog post is a goldmine of information.,4.3,3.7,4.0,4.1,3.3
683,"Adulthood is just saying 'But after this week,...",4.0,2.3,3.8,2.7,4.0
420,â€˜Just one more episodeâ€™ - famous last words. ðŸ“ºâŒ›,4.1,1.5,4.4,3.4,3.7
925,This post has inspired me to take action.,4.1,4.3,3.6,4.2,3.1
1089,I'm enchanted by the taste of this gourmet cho...,4.5,3.9,3.7,4.2,2.8
976,I'm grateful for the insights shared in this a...,4.1,4.2,3.8,4.4,3.5


In [8]:
# Column dtypes and non-null counts, to spot missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1160 entries, 0 to 1159
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Text               1160 non-null   object 
 1   Openness           1160 non-null   float64
 2   Conscientiousness  1160 non-null   float64
 3   Extraversion       1160 non-null   float64
 4   Agreeableness      1159 non-null   float64
 5   Neuroticism        1159 non-null   float64
dtypes: float64(5), object(1)
memory usage: 54.5+ KB


In [9]:
# Summary statistics of the numeric columns (the five trait scores).
df.describe()

Unnamed: 0,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism
count,1160.0,1160.0,1160.0,1159.0,1159.0
mean,3.962672,3.341724,3.486897,3.700173,2.815272
std,0.7586,0.993143,0.990037,0.802662,1.219452
min,1.3,0.7,-1.0,1.0,-1.2
25%,3.6,2.6,3.2,3.2,2.2
50%,4.1,3.5,3.7,3.9,2.9
75%,4.5,4.1,4.0,4.3,3.5
max,5.7,5.4,5.3,5.3,6.7


# Handling Missing Values

In [10]:
# Number of missing values in each column.
df.isnull().sum()

Text                 0
Openness             0
Conscientiousness    0
Extraversion         0
Agreeableness        1
Neuroticism          1
dtype: int64

In [11]:
# Drop every row that contains at least one missing value (1160 -> 1159 rows here).
# Rebinding `df` means the raw frame is only recoverable by re-running read_csv.
df = df.dropna(axis=0)

In [12]:
# Re-check non-null counts after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1159 entries, 0 to 1158
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Text               1159 non-null   object 
 1   Openness           1159 non-null   float64
 2   Conscientiousness  1159 non-null   float64
 3   Extraversion       1159 non-null   float64
 4   Agreeableness      1159 non-null   float64
 5   Neuroticism        1159 non-null   float64
dtypes: float64(5), object(1)
memory usage: 63.4+ KB


# Text Cleaning

In [13]:
#function to clean text
def clean_text(text):
    """Return ``text`` lowercased, without punctuation and without stopwords.

    Every character that is neither a word character nor whitespace is
    deleted, the result is split on whitespace, and words found in the
    notebook-level ``stop_words`` set are discarded. The surviving words are
    joined back together with single spaces.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    kept_words = filter(lambda word: word not in stop_words, without_punctuation.split())
    return " ".join(kept_words)

In [14]:
# Add a cleaned version of every text by running clean_text element-wise.
df['cleaned_text'] = df['Text'].map(clean_text)

In [15]:
# Compare raw and cleaned text side by side on 5 random rows (unseeded, so they vary per run).
df[['Text', 'cleaned_text']].sample(5)

Unnamed: 0,Text,cleaned_text
702,Low battery is the adult version of needing to...,low battery adult version needing pee
126,Creative hobbies keep the mind sharp.,creative hobbies keep mind sharp
365,Fearless minds shape the unknown.,fearless minds shape unknown
1007,I can't get enough of this artisanal ice cream...,cant get enough artisanal ice cream flavors un...
1026,This tablet has become an essential tool for p...,tablet become essential tool productivity simp...


# Tokenization and Lemmatization
## Tokenization
- Tokenization is the process of splitting the text into individual words or tokens.

## Lemmatization
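- Lemmatization reduces words to their base (dictionary) form. For example, given the right part-of-speech tag, "running" becomes "run" and "better" becomes "good". The code below calls the lemmatizer without a part-of-speech tag, so in practice it mainly reduces plural nouns (e.g. "cultures" becomes "culture") and leaves verb forms such as "exploring" unchanged.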

In [16]:
# Function for tokenization and lemmatization
def preprocess_text(text):
    """Tokenize ``text`` and return its lemmatized, stopword-free tokens.

    Pipeline: ``word_tokenize`` -> keep purely alphabetic tokens (lowercased)
    -> drop English stopwords -> ``lemmatizer.lemmatize`` on each survivor.

    Relies on the notebook-level ``stop_words`` set and ``lemmatizer``
    instance being defined before the function is called.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order.
    """
    # Tokenization
    tokens = word_tokenize(text)
    # Keep only purely alphabetic tokens (drops punctuation, numbers and
    # pieces such as "n't") and convert them to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stopwords. Membership is tested against the prebuilt `stop_words`
    # set rather than calling stopwords.words('english') for every token,
    # which rebuilt the whole word list and scanned it linearly each time.
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization (no POS tag is passed, so the lemmatizer's default applies)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

# NOTE(review): pickle stores a plain function by reference (module + name), not
# its code, so loading this file presumably requires a `preprocess_text` with the
# same module path to already exist in the loading process - confirm how it is consumed.
joblib.dump(preprocess_text, 'preprocess_text.pkl')

['preprocess_text.pkl']

In [17]:
# Tokenize and lemmatize the raw Text column; each cell becomes a list of tokens.
df['tokens'] = df['Text'].map(preprocess_text)

In [18]:
# Inspect the first rows, now including the cleaned_text and tokens columns.
df.head()

Unnamed: 0,Text,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,cleaned_text,tokens
0,I love exploring new cultures through cuisine ...,4.7,3.1,3.5,3.9,2.1,love exploring new cultures cuisine travel,"[love, exploring, new, culture, cuisine, travel]"
1,My workspace is always organized; I can't focu...,2.9,4.8,2.1,3.2,2.4,workspace always organized cant focus messy en...,"[workspace, always, organized, ca, focus, mess..."
2,Large social gatherings make me feel energized...,3.1,2.9,4.6,3.5,1.7,large social gatherings make feel energized ex...,"[large, social, gathering, make, feel, energiz..."
3,I often worry about things not going as planned.,3.0,3.9,2.0,3.4,4.5,often worry things going planned,"[often, worry, thing, going, planned]"
4,Having a daily routine is comforting and helps...,2.3,4.6,1.7,3.7,2.2,daily routine comforting helps productive,"[daily, routine, comforting, help, productive]"


# Normalization
- Normalising the five trait scores to the range 0-1 using MinMaxScaler, so that every trait is on the same scale.

In [19]:
# Min-max scaler that maps each fitted column linearly onto [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

In [20]:
# Rescale the five trait scores in place; each column is fitted and scaled independently.
# NOTE(review): the scaler is fitted on all rows, before the train/test split, and no
# cell visible here saves it - predictions cannot be mapped back to the original
# scale unless it is persisted somewhere else. Confirm.
trait_columns = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']
df[trait_columns] = scaler.fit_transform(df[trait_columns])

In [21]:
# Confirm that every scaled trait column now has min 0 and max 1.
df.describe()

Unnamed: 0,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism
count,1159.0,1159.0,1159.0,1159.0,1159.0
mean,0.605087,0.561985,0.712177,0.627947,0.508262
std,0.172469,0.211379,0.157213,0.186666,0.154361
min,0.0,0.0,0.0,0.0,0.0
25%,0.522727,0.404255,0.666667,0.511628,0.43038
50%,0.636364,0.595745,0.746032,0.674419,0.518987
75%,0.727273,0.723404,0.793651,0.767442,0.594937
max,1.0,1.0,1.0,1.0,1.0


### All the values are now between 0-1

In [22]:
# Save the frame with the min-max scaled trait scores (plus the cleaned_text and
# tokens columns) without the row index. The tokens column holds Python lists, so
# it is written as their string representation and will not read back as lists.
df.to_csv("data/standard_data.csv", index=False)

# Vectorization

In [23]:
# TF-IDF vectorizer limited to a 650-term vocabulary (max_features=650).
tfidf = TfidfVectorizer(max_features=650)

In [24]:
# TfidfVectorizer takes one string per document, so each row's token list is
# re-joined with single spaces before fitting.
# NOTE(review): the vectorizer is fitted on every row, before the train/test split,
# so the vocabulary and IDF weights are learned from rows that later land in the test set.
documents = df['tokens'].map(' '.join)
tfidf_features = tfidf.fit_transform(documents)

# Dense frame of TF-IDF weights, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf.get_feature_names_out())

# Append the feature columns side by side. `df` gets a fresh 0..n-1 index so it lines up
# positionally with `tfidf_df`, which already has a default range index.
# Re-running this cell rebinds `df` again and appends a duplicate set of feature columns.
df = pd.concat([df.reset_index(drop=True), tfidf_df], axis=1)


In [25]:
# Inspect the first rows with the TF-IDF feature columns appended.
df.head()

Unnamed: 0,Text,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,cleaned_text,tokens,absolutely,action,...,wonder,word,work,world,worry,worrying,would,wrong,year,yoga
0,I love exploring new cultures through cuisine ...,0.772727,0.510638,0.714286,0.674419,0.417722,love exploring new cultures cuisine travel,"[love, exploring, new, culture, cuisine, travel]",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,My workspace is always organized; I can't focu...,0.363636,0.87234,0.492063,0.511628,0.455696,workspace always organized cant focus messy en...,"[workspace, always, organized, ca, focus, mess...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Large social gatherings make me feel energized...,0.409091,0.468085,0.888889,0.581395,0.367089,large social gatherings make feel energized ex...,"[large, social, gathering, make, feel, energiz...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,I often worry about things not going as planned.,0.386364,0.680851,0.47619,0.55814,0.721519,often worry things going planned,"[often, worry, thing, going, planned]",0.0,0.0,...,0.0,0.0,0.0,0.0,0.535677,0.0,0.0,0.0,0.0,0.0
4,Having a daily routine is comforting and helps...,0.227273,0.829787,0.428571,0.627907,0.43038,daily routine comforting helps productive,"[daily, routine, comforting, help, productive]",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
import pickle
# Save the fitted vectorizer to a pickle file so the same vocabulary and IDF
# weights can be reloaded later. Only unpickle this file from a trusted source.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Train-Test Split

In [28]:
# Targets: the five scaled trait scores.
target_columns = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']
# Text-derived helper columns that are not model inputs.
text_columns = ['Text', 'cleaned_text', 'tokens']

y = df[target_columns]
# Features: everything that remains, i.e. the TF-IDF term columns.
X = df.drop(columns=text_columns + target_columns)

In [29]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Report the feature and target shapes of each split.
for label, features, targets in (
    ("Training set:", X_train, y_train),
    ("Test set:", X_test, y_test),
):
    print(label, features.shape, targets.shape)

Training set: (927, 650) (927, 5)
Test set: (232, 650) (232, 5)


In [31]:
# Write each split to its own CSV (row index omitted), training files first.
split_outputs = {
    'data/X_train.csv': X_train,
    'data/y_train.csv': y_train,
    'data/X_test.csv': X_test,
    'data/y_test.csv': y_test,
}
for csv_path, frame in split_outputs.items():
    frame.to_csv(csv_path, index=False)

# Also keep the full modelling frame (text columns, targets and TF-IDF features).
df.to_csv('data/model_data.csv', index=False)