# Data Preprocessing

In [11]:
import pandas as pd
from clean import clean_text
import re
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist



In [12]:
data_frame = pd.DataFrame( {'thought': [], 'original_label': []})

## CVS

In [13]:
# reframing_dataset
neg_thought = []

# read the dataset and take just the negative thought and the label
data = pd.read_csv("/Users/sylvainestebe/Code/nlp_project/cognitive_distortion_project/data/reframing_dataset.csv")
data_frame["thought"] = data["thought"]
data_frame["original_label"] = data["thinking_traps_addressed"]
data_frame = data_frame.drop_duplicates()
data_frame

Unnamed: 0,thought,original_label
0,Someone I trusted stole something valuable of ...,emotional reasoning
2,She doesn't respect me.,overgeneralizing
4,My friend is ignoring his recently-deceased wife.,disqualifying the positive
5,My friend is ignoring his recently-deceased wife.,mind reading
6,he took me for granted!,"labeling,catastrophizing"
...,...,...
592,I'm so exhausted,negative feeling or emotion
594,ill never be able to find a good job,"fortune telling,catastrophizing"
596,I am being disrespected,"labeling,disqualifying the positive,negative f..."
598,maybe she doesn't like me anymore.,mind reading


## JSON

In [14]:
# read the dataset and rename the colonne
with open('/Users/sylvainestebe/Code/nlp_project/cognitive_distortion_project/data/thinking_traps.jsonl') as f:
    df = pd.read_json(f, lines=True)
    df = df.rename(columns={"completion": "original_label","prompt": "thought"})
# replace  arrow with splace
for i in range(len(df)):

    df["thought"][i] = df["thought"][i].replace("->", "")

# remove the duplicates
df = df.drop_duplicates()
print(len(df))
df = df.drop(df[df['original_label'] == ' Not distorted'].index)
print(len(df))
df

1077
1000


Unnamed: 0,thought,original_label
0,I'm always late,Overgeneralization
1,I'm late for the meeting. Everyone will look d...,Mind reading
2,I'm late for the meeting. Everyone will look d...,Fortune telling
3,I'm late for the meeting. This shows what a je...,Labeling
4,I'm late for the meeting. I'll make a fool of ...,Labeling
...,...,...
1072,Not having made rent and having to borrow mone...,Labeling
1073,I’m a loser I hate myself and I’m ready to die,Labeling
1074,People around me do not need my presence I'm w...,Emotional reasoning
1075,I realize that I still don’t have anyone who I...,Overgeneralizing


## MERGE

In [15]:
corpus_data_xp = data_frame.append(df,ignore_index=True)
corpus_data = corpus_data_xp.drop_duplicates(subset="thought")
corpus_data = corpus_data.reset_index(drop=True)
corpus_data.to_csv("/Users/sylvainestebe/Code/nlp_project/cognitive_distortion_project/data/corpus_disto.csv")

  corpus_data_xp = data_frame.append(df,ignore_index=True)


In [16]:
explore = (corpus_data.original_label.value_counts()
                   .to_frame()
                   .reset_index()
                   .rename(columns={'index': 'original_label', 'original_label': 'count'}))
explore.to_html('/Users/sylvainestebe/Code/nlp_project/cognitive_distortion_project/export/exploration_label.html', index=False)

# Clean


In [17]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Summary: turns text input string into list of cleaned word tokens
    
    Arguments:
        text: str of text
    
    Returns:
        lemmatized_words: str, lemmatized words from original text after cleaning
    """
    
    # remove numbers
    clean_text = re.sub(r'[0-9]+', '', text)
    
    # remove punctuation
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    
    # convert everything to lowercase
    clean_text = clean_text.lower()
    
    # tokenize
    wt = WhitespaceTokenizer()
    words = wt.tokenize(clean_text)
    
    # remove stop words
    cleaned_words = []
    for w in words:
        if w not in stop_words:
            cleaned_words.append(w)
            
    # lemmatize words
    wnl = WordNetLemmatizer()
    wnl_lemmatized_tokens = []
    for token in cleaned_words:
        wnl_lemmatized_tokens.append(wnl.lemmatize(token))
    
    lemmatized_words = ' '.join(wnl_lemmatized_tokens)
    
    return lemmatized_words

In [24]:
corpus_data['cleaned_thought'] = corpus_data['thought'].apply(lambda x: clean_text(x))
corpus_data.to_csv("/Users/sylvainestebe/Code/nlp_project/cognitive_distortion_project/data/corpus_disto.csv")

## Frequence

In [19]:
sample_text = ' '.join(list(corpus_data['thought'].values))

clean_sample_text = ' '.join(list(corpus_data['cleaned_thought'].values))

# tokenize
wt = WhitespaceTokenizer()
clean_tokens = wt.tokenize(clean_sample_text)

freq_dist = FreqDist(clean_tokens)
print(freq_dist)
freq_dist.most_common(500)


<FreqDist with 1691 samples and 6907 outcomes>


[('im', 184),
 ('like', 88),
 ('friend', 87),
 ('never', 81),
 ('get', 80),
 ('feel', 70),
 ('job', 68),
 ('work', 66),
 ('dont', 64),
 ('time', 62),
 ('cant', 51),
 ('didnt', 50),
 ('want', 48),
 ('doesnt', 45),
 ('good', 43),
 ('made', 41),
 ('enough', 40),
 ('people', 40),
 ('bad', 39),
 ('going', 39),
 ('something', 38),
 ('got', 38),
 ('much', 36),
 ('must', 36),
 ('thought', 35),
 ('anything', 34),
 ('know', 34),
 ('go', 34),
 ('person', 32),
 ('think', 30),
 ('one', 30),
 ('make', 29),
 ('done', 26),
 ('failure', 25),
 ('ill', 25),
 ('felt', 25),
 ('angry', 24),
 ('better', 24),
 ('love', 23),
 ('hate', 23),
 ('really', 23),
 ('bos', 23),
 ('sister', 23),
 ('thing', 23),
 ('need', 22),
 ('anymore', 22),
 ('could', 22),
 ('husband', 22),
 ('able', 21),
 ('mother', 21),
 ('care', 21),
 ('day', 21),
 ('always', 21),
 ('week', 21),
 ('life', 20),
 ('home', 20),
 ('someone', 19),
 ('everything', 19),
 ('right', 19),
 ('last', 19),
 ('party', 19),
 ('everyone', 18),
 ('talk', 18),
 ('

In [20]:
word_counts = freq_dist.items()

# convert the tuple pairs into a dictionary
word_freq_map = {word: count for word, count in word_counts}

In [21]:
# convert the dictionary to a Pandas DataFrame
df = pd.DataFrame.from_dict(word_freq_map, orient='index', columns=['frequency'])

# reset the index to make the word column
df = df.reset_index()

# sort the DataFrame by frequency in descending order
df = df.sort_values('frequency', ascending=False)

# rename the index column to 'word'
df.columns = ['word', 'frequency']
df.drop(df[df['word'] == 'im'].index, inplace=True)

# print the DataFrame
print(df)


          word  frequency
88        like         88
12      friend         87
48       never         81
73         get         80
63        feel         70
...        ...        ...
1068    canned          1
1069       air          1
1070      loud          1
1071     noise          1
1690  maintain          1

[1690 rows x 2 columns]


In [22]:
import plotly.express as px
df["frequency"]
fig = px.line(df, x="word", y="frequency", title='Frequence per word')
fig.show()