## Pre-Processing test dataset

In [1]:
import pandas as pd

### Load the test dataset

In [2]:
df=pd.read_csv('../data/mbti_1.csv')

In [3]:
df.head(n=3)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [5]:
df.dropna(inplace=True)

In [6]:
df['type'].value_counts()

type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: count, dtype: int64

In [7]:
# add new 4 columns for each axis of the personality
df['IE'] = df['type'].apply(lambda x: x[0])
df['NS'] = df['type'].apply(lambda x: x[1])
df['TF'] = df['type'].apply(lambda x: x[2])
df['JP'] = df['type'].apply(lambda x: x[3])

df.head(n=2)

Unnamed: 0,type,posts,IE,NS,TF,JP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,I,N,F,J
1,ENTP,'I'm finding the lack of me in these posts ver...,E,N,T,P


In [8]:
#  assign 0 to Introvert, Intuition, Thinking, Judging and 1 to Extrovert, Sensing, Feeling, Perceiving
df['IE'] = df['IE'].apply(lambda x: 0 if x == 'I' else 1)
df['NS'] = df['NS'].apply(lambda x: 0 if x == 'N' else 1)
df['TF'] = df['TF'].apply(lambda x: 0 if x == 'T' else 1)
df['JP'] = df['JP'].apply(lambda x: 0 if x == 'J' else 1)

df.head(n=2)

Unnamed: 0,type,posts,IE,NS,TF,JP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0,0,1,0
1,ENTP,'I'm finding the lack of me in these posts ver...,1,0,0,1


In [9]:
# count the number of each value in each column
print(df['IE'].value_counts())
print(df['NS'].value_counts())
print(df['TF'].value_counts())
print(df['JP'].value_counts())

IE
0    6676
1    1999
Name: count, dtype: int64
NS
0    7478
1    1197
Name: count, dtype: int64
TF
1    4694
0    3981
Name: count, dtype: int64
JP
1    5241
0    3434
Name: count, dtype: int64


In [10]:
#Functions for Preprocessing the text
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
stop_words = set(stopwords.words('english'))


def mystopwords(text):
    return ' '.join([w for w in word_tokenize(text) if not w in stop_words])

import re
def clean_text(string):
  clean=re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ',string) #remove url
  clean=re.sub('[\n]',' ',clean) #remove newline character
  clean=re.sub('[^a-zA-Z]',' ',clean.lower()) #remove non alphabetic characters
  clean=re.sub(r'[,]', ' ', clean)
  clean=mystopwords(clean) #remove stopwords
  return clean

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omarahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/omarahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# clean the posts
df['cleaned_posts'] = df['posts'].apply(clean_text)
df.head(n=10)

Unnamed: 0,type,posts,IE,NS,TF,JP,cleaned_posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0,0,1,0,intj moments sportscenter top ten plays pranks...
1,ENTP,'I'm finding the lack of me in these posts ver...,1,0,0,1,finding lack posts alarming sex boring positio...
2,INTP,'Good one _____ https://www.youtube.com/wat...,0,0,0,1,good one course say know blessing curse absolu...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0,0,0,0,dear intp enjoyed conversation day esoteric ga...
4,ENTJ,'You're fired.|||That's another silly misconce...,1,0,0,0,fired another silly misconception approaching ...
5,INTJ,'18/37 @.@|||Science is not perfect. No scien...,0,0,0,0,science perfect scientist claims scientific in...
6,INFJ,"'No, I can't draw on my own nails (haha). Thos...",0,0,1,0,draw nails haha done professionals nails yes g...
7,INTJ,'I tend to build up a collection of things on ...,0,0,0,0,tend build collection things desktop use frequ...
8,INFJ,"I'm not sure, that's a good question. The dist...",0,0,1,0,sure good question distinction two dependant p...
9,INTP,'https://www.youtube.com/watch?v=w8-egj0y8Qs||...,0,0,0,1,position actually let go person due various re...


In [12]:
# drop the posts , type columns
df.drop(columns=['posts','type'],inplace=True)
df.head(n=2)

Unnamed: 0,IE,NS,TF,JP,cleaned_posts
0,0,0,1,0,intj moments sportscenter top ten plays pranks...
1,1,0,0,1,finding lack posts alarming sex boring positio...


In [13]:
# check if there are empty posts and remove them
df = df[df['cleaned_posts'] != '']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8674 entries, 0 to 8674
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   IE             8674 non-null   int64 
 1   NS             8674 non-null   int64 
 2   TF             8674 non-null   int64 
 3   JP             8674 non-null   int64 
 4   cleaned_posts  8674 non-null   object
dtypes: int64(4), object(1)
memory usage: 406.6+ KB


In [14]:
# rename the cleaned_posts column to posts
df.rename(columns={'cleaned_posts':'posts'},inplace=True)

In [15]:
df.info() # check the data

<class 'pandas.core.frame.DataFrame'>
Index: 8674 entries, 0 to 8674
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   IE      8674 non-null   int64 
 1   NS      8674 non-null   int64 
 2   TF      8674 non-null   int64 
 3   JP      8674 non-null   int64 
 4   posts   8674 non-null   object
dtypes: int64(4), object(1)
memory usage: 406.6+ KB


In [16]:
# save the cleaned data
df.to_csv('../data/mbti_1_cleaned.csv',index=False)