In [1]:
import pandas as pd
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer, download
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from IPython.display import display

In [2]:
# Download stopwords if not already downloaded
download(['stopwords'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ultracode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Step 1: Load the data
essay = pd.read_csv("./Dataset/Essay_data.csv")
essay.head()

Unnamed: 0,I/E,N/S,T/F,J/P,Essay
0,I,S,T,J,My first 4 months at the EDSA have been filled...
1,I,N,F,J,I joined the academy being at a crossroads of ...
2,E,N,F,J,so far my experience has been positive and i c...
3,I,N,F,J,I have been very fortunate to have the opportu...
4,I,N,T,J,Looking back to when one got to the academy an...


In [4]:
essay.shape

(94, 5)

In [5]:
# Step 2: Clean the data
essay.dropna()

Unnamed: 0,I/E,N/S,T/F,J/P,Essay
0,I,S,T,J,My first 4 months at the EDSA have been filled...
1,I,N,F,J,I joined the academy being at a crossroads of ...
2,E,N,F,J,so far my experience has been positive and i c...
3,I,N,F,J,I have been very fortunate to have the opportu...
4,I,N,T,J,Looking back to when one got to the academy an...
...,...,...,...,...,...
89,I,S,F,J,My experience at the Explore Data Science Acad...
90,I,N,T,P,"A new city, new people and a completely new en..."
91,I,N,T,J,My experience at the academy has been one of t...
92,I,S,F,J,After spending a year at home while it was jus...


In [6]:
essay.reset_index()

Unnamed: 0,index,I/E,N/S,T/F,J/P,Essay
0,0,I,S,T,J,My first 4 months at the EDSA have been filled...
1,1,I,N,F,J,I joined the academy being at a crossroads of ...
2,2,E,N,F,J,so far my experience has been positive and i c...
3,3,I,N,F,J,I have been very fortunate to have the opportu...
4,4,I,N,T,J,Looking back to when one got to the academy an...
...,...,...,...,...,...,...
89,89,I,S,F,J,My experience at the Explore Data Science Acad...
90,90,I,N,T,P,"A new city, new people and a completely new en..."
91,91,I,N,T,J,My experience at the academy has been one of t...
92,92,I,S,F,J,After spending a year at home while it was jus...


In [47]:
stop_words = set(stopwords.words('english'))
translator = str.maketrans(punctuation, ' ' * len(punctuation))

# Step 3: Define helper functions for text processing
def remove_stop_words(tokens):    
    return " ".join([token for token in tokens.split() if token not in stop_words])

def remove_punctuation(post):
    #return "".join([word if word not in punctuation else " " for word in post]).lower()
    # Step 2: Remove punctuation and replace it with a single whitespace
    return post.translate(translator).lower()

In [8]:
clean_stop_words = remove_stop_words("I’m a part-time student @explore-software.".split())
display(clean_stop_words)
without_punct = remove_punctuation(clean_stop_words)
display(without_punct)
cleaned_text = without_punct
cleaned_text

'I’m part-time student @explore-software.'

'i’m part time student  explore software '

'i’m part time student  explore software '

In [9]:
display([gram for gram in ngrams(cleaned_text.split(),2)])

[('i’m', 'part'),
 ('part', 'time'),
 ('time', 'student'),
 ('student', 'explore'),
 ('explore', 'software')]

In [10]:
# import string
# from nltk.corpus import stopwords
# from nltk.util import ngrams
# import nltk

# # Download stopwords if not already downloaded
# nltk.download('stopwords')

# Given sentence
# sentence = "I’m a part-time student @explore-software."

# # Step 3: Split sentence into words
# words = sentence.split()

# # Step 4: Remove stopwords
# stop_words = set(stopwords.words('english'))
# filtered_words = [word for word in words if word not in stop_words]
# display(filtered_words)

# # Step 2: Remove punctuation and replace it with a single whitespace
# # str.maketrans(from, to): This method creates a translation table that maps each 
# # character in the from string to the corresponding character in the to string.
# # string.punctuation: The from string containing all punctuation characters.
# # ' ' * len(string.punctuation): The to string is a series of spaces of the same length
# # as the from string. This means each punctuation character will be replaced by a space.
# translator = str.maketrans(string.punctuation, " " * len(string.punctuation))
# sentence = " ".join(filtered_words).translate(translator)
# display(sentence)

# # Step 1: Convert all text to lowercase
# sentence = sentence.lower()
# display(sentence)

# # Step 5: Create bi-grams
# bigrams = list(ngrams(sentence.split(), 2))

# # Output the bi-grams and their count
# print("Bi-grams:", bigrams)
# print("Number of bi-grams:", len(bigrams))

In [11]:
Intuitive_count = (essay["N/S"] == "N").mean()
sensing_count = essay[essay["N/S"] == "S"]["N/S"].count()
Intuitive_count, (sensing_count/(67+26))

(0.7127659574468085, 0.27956989247311825)

In [12]:
essay["Essay without punct"] = essay["Essay"].apply(remove_punctuation)
essay["Essay without punct"].iloc[0][:10]

'my first 4'

In [27]:
tokenizer = TreebankWordTokenizer()
def tokenize_essay(essay):
    return " ".join(tokenizer.tokenize(essay))
    

essay["Clean Text"] = essay["Essay without punct"].apply(tokenize_essay)

print(f"How many tokens are in the 17th essay {len(essay["Clean Text"].iloc[16].split())}")

How many tokens are in the 17th essay 340


In [30]:
len(tokenizer.tokenize(essay["Essay"][16].translate(translator).lower()))

340

In [31]:
snowball = SnowballStemmer('english')
snowball.stem("experiences")

'experi'

In [39]:
remove_stop_words(essay["Clean Text"][80].split()).split()[23]

'times'

In [58]:
def process_text_to_string(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Split into words
    words = text.split()
    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

In [59]:
essay["Cleaner"] = essay["Clean Text"].apply(process_text_to_string)
essay["Cleaner"].iloc[30]

'experience edsa interesting one thus far pushed get shell interact people since academy realised smarter think lot valuable knowledge insight share biggest fear coming academy social aspect interact different people since able overcome fear first placed groups literally sweating heart rate high felt sick stomach idea group strangers second time split new groups wasn’t nervous become social enjoy meeting working new people working groups challenge frustrating everyone different everyone different approach learning process project management i’ve learned lot working others took backseat things even knew better way things spoke saw helped team steer team better direction become assertive thought passive person couldn’t assertive save life think assertive don’t find daunting biggest frustration working teams dealing people think know everything aren’t willing open learning something new fellow teammates person like previous group person would dictate team didn’t want collaborate team domi

In [61]:
def process_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Split into words
    words = text.split()
    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

# Step 4: Process each essay and collect unique words
unique_words = set()

for text in essay['Cleaner']:  # Replace 'essay_column' with the actual column name containing the essays
    words = process_text(text)
    unique_words.update(words)
    
# Step 5: Count unique words
num_unique_words = len(unique_words)
print(f"Number of unique words (after removing stopwords): {num_unique_words}")

Number of unique words (after removing stopwords): 3339


In [62]:
def bag_of_words_count(words, word_dict={}):
    """
    This function takes in a list of words and returns a dictionary 
    with each word as a key, and the value represents the number of 
    times that word appeared
    """
    for word in words:
        if word in word_dict.keys():
            word_dict[word] += 1
        else:
            word_dict[word] = 1
    return word_dict

bag_of_words_count(essay["Cleaner"][55].split())["time"]

3

In [66]:
from collections import Counter
# Step 4: Process each essay and count word frequencies
word_counter = Counter()

for text in essay['Cleaner']:  # Replace 'essay' with the actual column name containing the essays
    words = process_text(text)
    word_counter.update(words)

# Step 5: Calculate the total number of words and the number of words that appear at least twice
total_words = sum(word_counter.values())
words_at_least_twice = sum(count for count in word_counter.values() if count >= 2)

# Calculate the percentage
percentage = (words_at_least_twice / total_words) * 100

print(f"Percentage of words that appear at least twice: {percentage:.2f}%")

Percentage of words that appear at least twice: 90.36%


In [72]:
clean_essay = essay[["I/E","N/S","T/F","J/P","Cleaner"]]
clean_essay.head()

Unnamed: 0,I/E,N/S,T/F,J/P,Cleaner
0,I,S,T,J,first 4 months edsa filled many new experience...
1,I,N,F,J,joined academy crossroads sorts life academy o...
2,E,N,F,J,far experience positive definitely see value c...
3,I,N,F,J,fortunate opportunity join academy year sure c...
4,I,N,T,J,looking back one got academy right confidently...


In [78]:
enfj = clean_essay[(clean_essay["I/E"]=="E") | (clean_essay["N/S"]=="N") | (clean_essay["T/F"]=="F") | (clean_essay["J/P"]=="J") ]
enfj_word_counter = Counter()

for text in enfj['Cleaner']:  # Replace 'essay' with the actual column name containing the essays
    words = process_text(text)
    enfj_word_counter.update(words)

enfj_word_counter.most_common(1)

[('team', 336)]

In [79]:
list(ngrams(process_text(essay["Cleaner"][69]), 2))[108]

('may', 'better')