## In Colab

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# In Jupyter

In [None]:
# Path to the MBTI dataset CSV; read by the pd.read_csv(fileName) calls below.
fileName = './mbti_1.csv'

# **data 불러오기**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import re
import nltk 
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import WordCloud
from nltk.corpus import stopwords 
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the dataset from the path configured in `fileName`.
# Colab alternative:
# data = pd.read_csv("drive/MyDrive/빅데이터/mbti_1.csv")
data = pd.read_csv(fileName)

# Only the last expression of a cell is rendered automatically, so the
# intermediate results are shown explicitly with display() (provided by
# IPython in every notebook kernel).
display(data.head())

# Number of rows per personality type.
display(data['type'].value_counts())

# Summary of the frame's columns; info() prints its report directly.
data.info()

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Donut chart (hole=0.3) with one slice per distinct value of the `type` column.
px.pie(data,names='type',title='Personality type',hole=0.3)

# **Preprocess**

In [None]:
class Lemmatizer(object):
    """Callable tokenizer: whitespace-split a text and lemmatize each token.

    Tokens of two characters or fewer are dropped. An instance is passed as
    the `tokenizer` of the TfidfVectorizer built later in the notebook.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, sentence):
        """Return the list of lemmas for the tokens of `sentence` longer than 2 characters."""
        lemmas = []
        for token in sentence.split():
            if len(token) <= 2:
                # Skip very short tokens.
                continue
            lemmas.append(self.lemmatizer.lemmatize(token))
        return lemmas

In [None]:

# Emoticon definitions. Each entry is counted with str.count() in the cleaning
# loop, so every emoticon must appear exactly once across the three lists.
# NOTE(review): ' :->' has a leading space, unlike every other entry — confirm
# this is intended.
smiley = [';)', ':)', ':-)', '|-)', '|-D', ' :->', '\'-)', ';->', ':*)', ';-)', 'B-)', '8-]', ':-]', 'xD', ':^D', '^^', 'XD']
# '\\-o' is the three characters backslash, '-', 'o'; the backslash is escaped
# explicitly because '\-' is an invalid escape sequence.
unhappy = [':(', ':-(' , ';(', ':\'(', ':-c', ':-C', ':-<', ':-X', ':-x', ':-@', ':-&', ':-r', ':-V', ':@', '\\-o', ':-I', 'T_T']
# '8-|' used to be listed twice here, which counted that emoticon double.
another = [':O', ':-o', '8-O', '8-|', ':-T', ':/', ':P', ':-p', ':3', ':-\\', ':~/']
imoticon = smiley + unhappy + another

# File extensions that mark a URL as an image/media link.
basic = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.svg', '.ai', '.psd', '.tiff']
more = ['.mp4', '.cr2', '.srw', '.nrw', '.tga']
image = basic + more

# Accumulators filled by the cleaning loop, one entry per row of `data`.
exclamation_count_column = []
mbti_count_column = []
clean = []
imo_count = []
img_count = []
final = []

# English stop-word list from NLTK.
# NOTE(review): in the visible code this is only referenced by a commented-out
# line inside the cleaning loop.
cachedStopWords = stopwords.words("english")

# WordNet lemmatizer instance.
# NOTE(review): likewise only referenced by that commented-out line.
lemmatizer = WordNetLemmatizer()

# Data cleaning: for every row of `data.posts`, collect four count features
# (exclamation marks, MBTI mentions, emoticons, image links) and a cleaned,
# lower-cased, letters-only version of the text.

# Tokens counted as MBTI mentions: the word "mbti" plus the 16 type codes.
# Built once here instead of on every loop iteration.
MBTI_TOKENS = frozenset({
    'mbti',
    "estj", "estp", "esfj", "esfp", "entp", "entj", "enfp", "enfj",
    "istj", "istp", "isfj", "isfp", "intp", "intj", "infp", "infj",
})

# URL scheme prefixes that get collapsed to the marker "URL". The former
# '$uhttp' alternative was dropped: '$' anchors at the end of the text, so that
# branch could never match.
URL_SCHEME_RE = re.compile(r'(?:uhttp|https?|ftp)://')
# Whole links removed from the cleaned text. Raw strings avoid the invalid
# '\s' / '\.' escape sequences of a plain string literal.
LINK_RE = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
NON_LETTER_RE = re.compile(r'[^a-z]')

for sentence in tqdm(data.posts):

    # Number of exclamation marks.
    exclamation_count_column.append(sentence.count("!"))

    # Number of tokens that are "mbti" or one of the 16 type codes.
    sentence_lower = sentence.lower()
    mbti_count = sum(token in MBTI_TOKENS for token in nltk.wordpunct_tokenize(sentence_lower))
    mbti_count_column.append(mbti_count)

    # Number of emoticons. Scheme prefixes are replaced by "URL" first, which
    # also removes the "://" that would otherwise match the ':/' emoticon.
    sentence_simple_html = URL_SCHEME_RE.sub('URL', sentence)
    imo_count.append(sum(sentence_simple_html.count(emoticon) for emoticon in imoticon))

    # Number of image links: space-separated tokens (the '|||' separator is
    # treated as a space) that contain the "URL" marker and have a known
    # extension within their last five characters.
    tokens = sentence_simple_html.replace('|||', ' ').split(' ')
    img_count.append(sum(
        1
        for token in tokens
        if 'URL' in token and any(ext in token[-5:] for ext in image)
    ))

    # Cleaned text: lower-case, links removed, everything but a-z blanked,
    # runs of whitespace collapsed (which also trims both ends).
    # Stop-word removal and lemmatization are not done here.
    sentence = LINK_RE.sub(' ', sentence_lower)
    sentence = NON_LETTER_RE.sub(' ', sentence)
    sentence = ' '.join(sentence.split())

    clean.append(sentence)



# **최종 데이터 Format**

final : [post 내용 , 이미지 개수 , 이모티콘 개수 , 느낌표 개수 , MBTI 개수(MBTI + 유형)]

In [None]:
# TF-IDF vectorizer over the cleaned posts: vocabulary capped at 5000 terms,
# English stop words removed, tokens produced by the Lemmatizer class above.
tfidf_options = dict(
    max_features=5000,
    stop_words='english',
    tokenizer=Lemmatizer(),
)
vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer.fit(clean)

In [None]:
# The vectorizer was already fitted on `clean` in the previous cell, so only
# transform here; fit_transform would tokenize and lemmatize the whole corpus
# a second time to learn the same vocabulary.
X = vectorizer.transform(clean)

In [None]:
# Dense copy of the TF-IDF matrix (one row per post row, at most 5000 columns
# per max_features above). NOTE(review): densifying can be memory-heavy for a
# large corpus.
maybe = X.toarray()

In [None]:
# One tuple per row: (TF-IDF vector, image count, emoticon count,
# exclamation-mark count, MBTI-mention count).
final = list(zip(maybe, img_count, imo_count, exclamation_count_column, mbti_count_column))
# Show the first tuple as a sanity check.
print(final[0])

In [None]:
# Preview a few cleaned posts instead of dumping the whole corpus into the
# cell output.
clean[:5]

------------------

# Visualization

In [None]:
# Load the data (fresh read of the CSV for the visualization section).
data = pd.read_csv(fileName)
data.head()

In [None]:
# Number of rows per personality type.
data['type'].value_counts()

In [None]:
# Summary information about the frame.
data.info()

------------------

# #1 words_per_comment

In [None]:
# `d` is a second name for the same DataFrame object as `data` (no copy is
# made), so columns added through `d` below also appear on `data`.
d = data
def var_row(row):
    """Return the variance of the per-post word counts in `row`.

    `row` is a string of posts separated by '|||'; each post is split on
    whitespace to count its words, and np.var is taken over those counts.
    """
    word_counts = [len(post.split()) for post in row.split('|||')]
    return np.var(word_counts)
# Total whitespace-separated words in the row divided by 50.
# NOTE(review): presumably 50 is the number of posts per row — confirm.
posts_column = d['posts']
d['words_per_comment'] = posts_column.apply(lambda text: len(text.split()) / 50)
# Variance of the word counts of the individual '|||'-separated posts.
d['variance_of_word_counts'] = posts_column.apply(var_row)

d.head()

In [None]:
# Swarm plot of words_per_comment for each personality type.
# x and y are passed by keyword: positional x/y arguments are rejected by
# seaborn >= 0.12 (the first positional parameter is `data`), while the
# keyword form works on older and newer releases alike.
plt.figure(figsize=(15,10))
sns.swarmplot(x="type", y="words_per_comment", data=data)

## 전처리한 데이터를 기준으로 words per comment : 말이 긴가 짧은가 -> 구분선 없어서 불가

In [None]:
def mylen(texts):
    """Return an integer array with the whitespace-separated word count of each text."""
    return np.array([len(text.split()) for text in texts], dtype=int)

# Word count of every cleaned post row. The previous np.vectorize(len) counted
# characters, although the result is stored as `words_per_comment` below.
clean_len = mylen(clean)
clean_len/50

In [None]:
# Rebuild `data` from the CSV with features derived from the cleaned text.
# assign() evaluates its keywords in order, so the variance is still computed
# from the raw posts before `posts` is replaced by the cleaned text.
data = pd.read_csv(fileName).assign(
    words_per_comment=clean_len / 50,
    variance_of_word_counts=lambda frame: frame['posts'].apply(var_row),
    posts=clean,
)

data.head()

In [None]:
# Swarm plot of words_per_comment (derived from the cleaned text) per type.
# x and y are passed by keyword: positional x/y arguments are rejected by
# seaborn >= 0.12, while the keyword form works on all releases.
plt.figure(figsize=(15,10))
sns.swarmplot(x="type", y="words_per_comment", data=data)

## 유형별 이미지 올리는 빈도

In [None]:
# Re-read the CSV, replacing the frame (and its added columns) from earlier cells.
data = pd.read_csv(fileName)
# Image and emoticon counts scaled by 1/50.
# NOTE(review): presumably 50 is the number of posts per row — confirm.
new_img_count = [i/50 for i in img_count]
new_imo_count = [i/50 for i in imo_count]

In [None]:
# Attach the derived features to the freshly loaded frame.
# clean_len comes from an earlier cell; here it is scaled by 1/50.
data['words_per_comment'] = clean_len/50
# Variance of word counts across the '|||'-separated posts of the raw text.
data['variance_of_word_counts'] = data['posts'].apply(lambda x: var_row(x))
# Cleaned text is kept next to the raw `posts` column rather than replacing it.
data['clean'] = clean
# Scaled (divided by 50) image and emoticon counts.
data['img_count'] = new_img_count
data['imo_count'] = new_imo_count
#data['img_count'] = img_count

In [None]:
# Preview the frame with the added feature columns.
data.head()

-----------

# #3

In [None]:
# Hexbin joint plot of word-count variance against words per comment.
# jointplot is figure-level and always creates its own figure, so the former
# plt.figure(figsize=...) call only produced an extra empty figure; pass
# `height=` to jointplot to change its size. x and y are passed by keyword
# because positional x/y arguments are rejected by seaborn >= 0.12.
sns.jointplot(x="variance_of_word_counts", y="words_per_comment", data=data, kind="hex")

In [None]:
# Subset without the four ES** types. copy() makes df_2 an independent frame,
# so the column assignments below do not write to a slice of `data`
# (SettingWithCopyWarning).
df_2 = data[~data['type'].isin(['ESFJ','ESFP','ESTJ','ESTP'])].copy()
# Occurrences of 'http' and '?' in the raw text, scaled by 1/50.
df_2['http_per_comment'] = df_2['posts'].apply(lambda x: x.count('http')/50)
df_2['qm_per_comment'] = df_2['posts'].apply(lambda x: x.count('?')/50)
df_2.head()

In [None]:
# For each marker, count its occurrences in the raw text and scale by 1/50.
marker_columns = (
    ('http_per_comment', 'http'),
    ('qm_per_comment', '?'),
    ('hm_per_comment', '!'),
)
for column_name, marker in marker_columns:
    # Bind `marker` as a default so each lambda uses its own value.
    data[column_name] = data['posts'].apply(lambda text, marker=marker: text.count(marker) / 50)
data.head()

--------------------

## 차례로 image, http, ?, ! 개수에 대한 출력

#### 이미지

In [None]:
# Mean of the img_count column for each personality type, shown with the
# notebook's rich table display instead of print().
data.groupby('type').agg({'img_count': 'mean'})

In [None]:
# Bar plot of image counts per personality type.
# x and y are passed by keyword: positional x/y arguments are rejected by
# seaborn >= 0.12. Note that `img_count` is the unscaled list from the cleaning
# loop, not the data['img_count'] column (which is divided by 50).
plt.figure(figsize=(12,4))
sns.barplot(x=data['type'], y=img_count, alpha=0.8)
plt.show()

#### http

In [None]:
# Mean of the http_per_comment column for each personality type, shown with
# the notebook's rich table display instead of print().
data.groupby('type').agg({'http_per_comment': 'mean'})

In [None]:
# Bar plot of http_per_comment per personality type.
# x and y are passed by keyword: positional x/y arguments are rejected by
# seaborn >= 0.12.
plt.figure(figsize=(12,4))
sns.barplot(x=data['type'], y=data.http_per_comment, alpha=0.8)
plt.show()

#### 물음표

In [None]:
# Mean of the qm_per_comment column for each personality type, shown with the
# notebook's rich table display instead of print().
data.groupby('type').agg({'qm_per_comment': 'mean'})

In [None]:
# Bar plot of qm_per_comment (question marks) per personality type.
# x and y are passed by keyword: positional x/y arguments are rejected by
# seaborn >= 0.12.
plt.figure(figsize=(12,4))
sns.barplot(x=data['type'], y=data.qm_per_comment, alpha=0.8)
plt.show()

#### 느낌표

In [None]:
# Mean of the hm_per_comment column (exclamation marks) for each personality
# type, shown with the notebook's rich table display instead of print().
data.groupby('type').agg({'hm_per_comment': 'mean'})

In [None]:
# Bar plot of hm_per_comment (exclamation marks) per personality type.
# x and y are passed by keyword: positional x/y arguments are rejected by
# seaborn >= 0.12.
plt.figure(figsize=(12,4))
sns.barplot(x=data['type'], y=data.hm_per_comment, alpha=0.8)
plt.show()

#### 이모티콘

In [None]:
# Mean of the imo_count column (emoticons) for each personality type, shown
# with the notebook's rich table display instead of print().
data.groupby('type').agg({'imo_count': 'mean'})

In [None]:
# Bar plot of emoticon counts per personality type.
# x and y are passed by keyword: positional x/y arguments are rejected by
# seaborn >= 0.12. Note that `imo_count` is the unscaled list from the cleaning
# loop, not the data['imo_count'] column (which is divided by 50).
plt.figure(figsize=(12,4))
sns.barplot(x=data['type'], y=imo_count, alpha=0.8)
plt.show()

-----------------------

In [None]:
def plot_jointplot(mbti_type, axs=None, titles=None):
    """Draw a hexbin joint plot of word-count variance vs. words per comment for one type.

    Parameters
    ----------
    mbti_type : str
        Value of the `type` column selecting the rows of `data` to plot.
    axs : ignored
        Kept for backward compatibility. jointplot is figure-level and always
        creates its own figure, so it cannot draw into an existing Axes.
    titles : str, optional
        Figure title; defaults to `mbti_type`.

    Returns
    -------
    The grid object returned by sns.jointplot.
    """
    df_3 = data[data['type'] == mbti_type]
    grid = sns.jointplot(x="variance_of_word_counts", y="words_per_comment", data=df_3, kind="hex")
    figure = grid.ax_joint.figure
    # jointplot has no `title` argument; set the title on its figure instead
    # and leave room for it above the marginal plot.
    figure.suptitle(mbti_type if titles is None else titles)
    figure.subplots_adjust(top=0.93)
    return grid

# One joint plot per type present in df_2. Iterating over the values directly
# replaces the hard-coded 2 x 6 index loop, which plotted nothing and would
# fail with an IndexError if fewer than 12 types were present.
for mbti_type in df_2['type'].unique():
    plot_jointplot(mbti_type)
plt.show()

# #8

## MBTI 4개 요소에 대해 0/1로 구분
##### Using the above code, if a person has I, N, T and J, the value across the 4 axis of MBTI i.e. IE, NS, TF and JP respectively, will be 1. Else 0.

In [None]:
def get_types(row):
    """Encode the four letters of row['type'] as binary indicators.

    Returns a pd.Series with keys 'IE', 'NS', 'TF', 'JP'. Each value is 1 when
    the corresponding letter is I, N, T or J respectively, and 0 otherwise.
    A letter that belongs to neither side of its pair is reported with a
    printed message and encoded as 0.
    """
    t = row['type']

    # (position in the type code, letter coded 1, letter coded 0, key, message)
    axes = (
        (0, 'I', 'E', 'IE', 'I-E not found'),
        (1, 'N', 'S', 'NS', 'N-S not found'),
        (2, 'T', 'F', 'TF', 'T-F not found'),
        (3, 'J', 'P', 'JP', 'J-P not found'),
    )

    encoded = {}
    for position, one_letter, zero_letter, key, message in axes:
        letter = t[position]
        if letter != one_letter and letter != zero_letter:
            print(message)
        encoded[key] = 1 if letter == one_letter else 0
    return pd.Series(encoded)

# Compute the four indicator columns row by row and attach them to a new frame.
type_indicators = data.apply(get_types, axis=1)
d_new = data.join(type_indicators)
d_new.head(5)

##### This will help us calculate, e.g., how many Introvert posts are present vs. how many Extrovert posts are present, out of all the given entries in our labelled Kaggle dataset. This is done in order to explore the dataset for all the individual Personality Indices of MBTI.

## Counting No. of posts in one class / Total no. of posts in the other class

In [None]:
# Each indicator column holds 1 for the first letter of its pair (I, N, T, J)
# and 0 for the second (E, S, F, P), as encoded by get_types. The counts are
# printed in the same order as the labels: 1-coded class first. Previously
# value_counts()[0] — the 0-coded class — was printed first, which swapped the
# two numbers relative to their labels.
axis_labels = (
    ('IE', "Introversion (I) /  Extroversion (E):\t"),
    ('NS', "Intuition (N) / Sensing (S):\t\t"),
    ('TF', "Thinking (T) / Feeling (F):\t\t"),
    ('JP', "Judging (J) / Perceiving (P):\t\t"),
)
for column, label in axis_labels:
    # Comparing and summing also yields 0 (not a KeyError) for an absent class.
    first_letter_count = int((d_new[column] == 1).sum())
    second_letter_count = int((d_new[column] == 0).sum())
    print (label, first_letter_count, " / ", second_letter_count)

##### We infer that there is an unequal distribution even within each of the 4 axes in the entries of our dataset.
i.e. out of IE:E is the majority, in NS:S is the majority. While TF and JP have relatively less difference between them.

In [None]:
# Stacked bar chart of the class counts on each of the four indicator axes.
# Indicator columns hold 1 for I/N/T/J and 0 for E/S/F/P (see get_types), so
# the bottom segment counts the 1-coded class and the top segment the 0-coded
# class. Previously the segments were built from value_counts()[0] / [1],
# which put the E/S/F/P counts under the "I, N, T, ..." legend entry, and that
# entry listed "F" where "J" was meant.
indicator_columns = ('IE', 'NS', 'TF', 'JP')
N = len(indicator_columns)
bottom = tuple(int((d_new[column] == 1).sum()) for column in indicator_columns)
top = tuple(int((d_new[column] == 0).sum()) for column in indicator_columns)

ind = np.arange(N)    # the x locations for the groups
# the width of the bars
width = 0.7

p1 = plt.bar(ind, bottom, width, label="I, N, T, J")
p2 = plt.bar(ind, top, width, bottom=bottom, label="E, S, F, P")

plt.title('Distribution accoss types indicators')
plt.ylabel('Count')
plt.xticks(ind, ('I / E',  'N / S', 'T / F', 'J / P',))
plt.legend()

plt.show()

## Features Correlation Analysis

In [None]:
# Pairwise correlation matrix of the four binary indicator columns.
d_new[['IE','NS','TF','JP']].corr()

##### It is unclear if the matrix shows anything valuable for interpretation
### An assumption made in our model is that each letter type is independent of other types
i.e. A person’s introversion/extroversion is not related to their judgement/perception. Nevertheless, we want to still test them below using a heat map

In [None]:
# Annotated heatmap of the correlations between the four indicator columns.
corr = d_new[['IE','NS','TF','JP']].corr()
cmap = plt.cm.RdBu
fig, ax = plt.subplots(figsize=(12,10))
ax.set_title('Features Correlation Heatmap', size=15)
sns.heatmap(corr, cmap=cmap,  annot=True, linewidths=1, ax=ax)

From this heatmap also, it is unclear if it shows anything valuable for interpretation

------------------

# #13

##### The 2 histogram plots represent Gaussian distribution of a sample space,
##### Comprises of no. of words per comment and associated variance of word counts from our dataset.


### (1) 3번과 거의 동일한 그래프

In [None]:
# Joint plot (default kind) of word-count variance against words per comment.
# jointplot is figure-level and always creates its own figure, so the former
# plt.figure(figsize=...) call only produced an extra empty figure; pass
# `height=` to jointplot to change its size. x and y are passed by keyword
# because positional x/y arguments are rejected by seaborn >= 0.12.
sns.set(style="white", color_codes=True) # suitable theme for jointplot
sns.jointplot(x="variance_of_word_counts", y="words_per_comment", data=data, alpha=0.7)
plt.show()

##### In the hexagonal plot, the hexagon with most number of points gets darker color.
##### Most of the posts have words between 100 and 150 and most of no. of words per comment by a user is between 25-30.

### (2) 

In [None]:
# Plotly density heatmap of the same two features, with a box plot as the
# x marginal and a violin plot as the y marginal.
fig = px.density_heatmap(data, x="variance_of_word_counts", y="words_per_comment", marginal_x="box", marginal_y="violin")
fig.show()

##### There is no correlation observed between variance of word count and the words per comment.
##### There is a strong relationship when there are 25-30 words per comment & the variance of word counts is 100-150

----------------

# 덤

## Total post type of personality type

In [None]:
# Number of rows per personality type (histfunc="count" counts the `posts`
# entries falling in each `type` bin). plotly.express is already imported as
# `px` in the notebook's import cell, so it is not imported again here.
fig = px.histogram(data, x="type", y="posts", histfunc="count",
                   title='Total posts for each personality type',
                   # Keys must be column names of `data`; the former 'ptype'
                   # key matched no column, so the x-axis label was never applied.
                   labels={'type': 'Personality types', 'posts': 'No. of posts available'},
                   opacity=0.8,
                   color_discrete_sequence=['navy']
                   )
fig.show()

## 위랑 같은건데 저~ 위의거랑 통일성 겸

In [None]:
# Count of rows per personality type, in the same seaborn style as the bar
# plots above. The column is passed as x="type": a positional first argument
# together with data=... raises "multiple values for argument 'data'" on
# seaborn >= 0.12.
plt.figure(figsize=(12,4))
sns.countplot(x='type', data=data, alpha=0.8)
plt.show()