# Module Import 

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn

import re
import string
import glob
import warnings

fontpath = '/usr/share/fonts/truetype/nanum/Nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=10)
plt.style.use('seaborn-white')
plt.rc('font', family='NanumBarunGothic')
plt.rcParams['figure.figsize'] = [20, 10]
warnings.filterwarnings('ignore')

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

%config InlineBackend.figure_format = 'retina'

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jihoonmanse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jihoonmanse/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /home/jihoonmanse/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


# Count Data Load(cnt_data)

In [2]:
cnt_data = pd.read_excel('/project/Roblox/kseung/apr/top10_wordcount.xlsx', index_col = 0)
tfidf_data = pd.read_excel('/project/Roblox/kseung/apr/top10_tfidf.xlsx', index_col = 0)
fisher_data = pd.read_excel('/project/Roblox/kseung/apr/top10_fisher.xlsx', index_col=0)


In [3]:
cnt_data

Unnamed: 0,Adopt Me!,MeepCity,Tower of Hell,Piggy,Royale High,Jailbreak,Murder Mystery 2,Welcome to Bloxburg,Pizza Factory Tycoon,Restaurant Tycoon
0,trade,bro,jump,piggy,guy,guy,guy,guy,pizza,restaurant
1,pet,house,guy,yeah,time,wait,murderer,yeah,yeah,yeah
2,guy,guy,yeah,guy,outfit,cop,wait,wait,buy,cook
3,people,yeah,wait,god,diamond,yeah,sheriff,pizza,wait,guy
4,wait,wait,time,wait,wait,car,yeah,house,guy,money
5,grandpa,party,easy,key,high,arrest,dude,room,customer,table
6,neon,meat,game,shovel,cute,police,kill,time,money,buy
7,yeah,game,god,skin,dorm,card,god,eat,cheese,people
8,feel,play,buy,die,yeah,key,bro,girl,cool,chef
9,dragon,cute,level,play,start,store,gun,baby,bro,thing


In [4]:
tfidf_data

Unnamed: 0,Adopt Me!,MeepCity,Tower of Hell,Piggy,Royale High,Jailbreak,Murder Mystery 2,Welcome to Bloxburg,Pizza Factory Tycoon,Restaurant Tycoon
0,trade,bro,jump,piggy,guy,arrest,murderer,guy,pizza,restaurant
1,pet,meep,guy,carrot,dorm,cop,sheriff,yeah,customer,cook
2,grandpa,meat,yeah,yeah,time,police,guy,wait,yeah,chef
3,neon,party,tower,key,diamond,guy,innocent,pizza,buy,table
4,guy,city,checkpoint,shovel,perry,wait,joe,room,wait,waiter
5,potion,chloe,wait,guy,outfit,card,wait,blocksburg,guy,yeah
6,offer,house,time,god,wait,criminal,chroma,house,cheese,guy
7,shadow,guy,easy,wait,high,car,murder,bella,money,money
8,adopt,yeah,god,skin,skirt,rob,yeah,time,pepperoni,buy
9,people,wait,game,infection,cute,yeah,knife,food,milkshake,customer


In [5]:
fisher_data

Unnamed: 0,Adopt Me!,MeepCity,Tower of Hell,Piggy,Royale High,Jailbreak,Murder Mystery 2,Welcome to Bloxburg,Pizza Factory Tycoon,Restaurant Tycoon
0,trade,meat,jump,piggy,dorm,cop,murderer,blocksburg,pizza,restaurant
1,pet,party,easy,shovel,diamond,arrest,sheriff,bella,customer,cook
2,grandpa,meep,tower,key,outfit,police,joe,birthday,cheese,table
3,neon,city,checkpoint,carrot,perry,card,knife,mom,pepperoni,waiter
4,offer,chloe,double,skin,hair,criminal,innocent,pancake,milkshake,chef
5,dragon,fish,level,infection,high,rob,murder,excite,factory,order
6,potion,bro,beat,mode,routine,jail,chroma,baby,buy,sushi
7,mega,house,cube,trap,skirt,bank,gun,babe,sausage,dish
8,shadow,class,rainbow,god,school,car,godly,room,dessert,spaghetti
9,adopt,teacher,invincibility,snow,class,officer,shoot,shower,banana,money


# Data Load

In [6]:
path = '/data/Roblox/'

In [7]:
lst_dir = glob.glob(path + '/*')

In [8]:
lst_col = []
for i in lst_dir:
    lst_col.append(i.split('/')[-1])

df = pd.DataFrame(columns=lst_col)
df

Unnamed: 0,Piggy,Adopt Me!,Welcome to Bloxburg,Murder Mystery 2,Jailbreak,Royale High,Restaurant Tycoon,Tower of Hell,Pizza Factory Tycoon,MeepCity


In [9]:
lst_data = []
for directory in lst_dir:
    print('=' * 50)
    print('게임 이름: {}'.format(directory.split('/')[-1]))
    print('=' * 50)
    lst_path = glob.glob(directory + '/*')

    lst_text = []
    for txt_path in lst_path:
        print(txt_path)
        with open(txt_path, 'r') as f:
            text = f.read()
        text = [text]
        lst_text += text

    df[directory.split('/')[-1]] = lst_text

게임 이름: Piggy
/data/Roblox/Piggy/Piggy Season 6 WHISTLING WINTER! (Meisery Skin).txt
/data/Roblox/Piggy/Piggy Winter Holiday!  Roblox.txt
/data/Roblox/Piggy/HOW TO GET SNOWPIGGY TRAP in PIGGY WINTER HOLIDAY EVENT MAP.txt
/data/Roblox/Piggy/SECRET PRIMROSE SKIN! ROBLOX PIGGY WINTER HOLIDAY HUNT.txt
/data/Roblox/Piggy/Escape The Christmas Winter Holiday Piggy Roblox Map  Cookie Swirl C.txt
/data/Roblox/Piggy/ROBLOX PIGGY @ the MALL!  Chapter 10 FGTeeV Multiplayer Escape (The Secret is Out).txt
/data/Roblox/Piggy/ZIZZY IS BACK FOR CHRISTMAS!! - Roblox Piggy Winter Holiday Chapter.txt
/data/Roblox/Piggy/ROBLOX PIGGY elsa bunny and meisery.txt
/data/Roblox/Piggy/THE END of Roblox PIGGY!.txt
/data/Roblox/Piggy/Roblox PIGGY... but with 100 PLAYERS.txt
게임 이름: Adopt Me!
/data/Roblox/Adopt Me!/DON'T TAKE ME AWAY FROM MY DAD!! SHE WON'T STOP FOLLOWING ME! ROBLOX ADOPT ME! (Roblox Roleplay).txt
/data/Roblox/Adopt Me!/Starting on a NEW Account (Part 3 Adopt me).txt
/data/Roblox/Adopt Me!/I Traded FL

In [10]:
df = df[['Adopt Me!', 'MeepCity', 'Tower of Hell', 'Piggy', 'Royale High',
         'Jailbreak', 'Murder Mystery 2', 'Welcome to Bloxburg', 'Pizza Factory Tycoon', 'Restaurant Tycoon']]

In [11]:
df

Unnamed: 0,Adopt Me!,MeepCity,Tower of Hell,Piggy,Royale High,Jailbreak,Murder Mystery 2,Welcome to Bloxburg,Pizza Factory Tycoon,Restaurant Tycoon
0,"Actually, you know what? I'm gonna give you a ...","Hey, guys, I'm in class right now. It's scienc...",Welcome to Roblox Rage Runner. Dan and then ra...,"Hey, guys. And today we're back onto another v...","Hey, Care bears. It's Hailey here. Welcome bac...",Never catch me. I'm moving. It doesn't have yo...,"Wait, don't die. Oh, no. Good job. Go get him....",Are you guys ready for a build challenge? No. ...,It's the evil girls that try to stab us. She s...,So I always wanted to own a five star restaura...
1,"Alright, guys, welcome back to another video. ...","Yo, what's up, guys? Your boy. Alpha One here....","No, but it doesn't make sense because it shows...","Hey, guys, it's Kate and Janet. Welcome back t...","Hello, everyone, and welcome to my channel. To...","CookieSwirl. See? Hello, Cookie fans. Today I ...",Don't forget to use star code Ant when you buy...,"Guys, we have exciting news. Woo. What is it? ...","What is up, my family? We are back, and we are...","Hey, guys. It's Korean. Tarak inside. Roblox. ..."
2,ABC for somebody to give me a fly ride. A ride...,"Hello, everyone. Welcome to another video by m...","Hey, guys, it's Alex and Zach and Drake. And w...",I got the snowpiggy trap and I'm going to show...,"Good morning, guys. Today I thought it would b...",So today we're gonna be doing a special challe...,Leave a like and subscribe in the next 3.2 sec...,"Hey guys, it's Krie turn back. It's at tomorro...","Should I do it? Should I do it? Hey, yo, what'...","Alright. Good morning, everybody. And today is..."
3,I made a brand new account that has absolutely...,"Oh, my Lord. Don't I just look like the bigges...","Oh, girl. Oh, girl. Oh my God. Why is it going...","Piggy book two. Winter holiday hunt. Oh, we go...","Hey, my little puppies puppet girl. Here buckn...","Hi, I'm Tankfish, and I'm a convicted war crim...","Hey guys, today we're gonna be stabbing people...","Welcome back to the channel, guys. Today we ha...","I'm about to become the Hulk. But what the oh,...","\nMMM stars. Guys, what are you doing? Are you..."
4,Hello everybody. Welcome back to my channel. A...,"Hey, guys. Welcome back to my channel. Last we...",Okay. Dom Q. Please don't queue. Please. I don...,"Cookies world. See? Hey, cookie fans. Welcome ...",Yes. I'm so happy you are here. I hope you enj...,I'm gonna quickly phase through the wall reall...,"Since I have no friends in Quarantine, I decid...","Oh, guys, I was just taking the longest nap in...","In this video, we unlock every single thing yo...","Hey guys, my name is Steviak and welcome to Su..."
5,"Oh, my gosh. Oh, man. Here we go, folks. Here ...","Hello, and today I am Haymich City, and I want...","Today, I get to do something that I've been wa...","Yo, where is everybody? Hello. Anybody else sh...","Hey, everyone. Welcome back to another video v...","Dude, it's 03:00 A.m.. We're not supposed to p...",So I am the murderer two times in a row. Alrig...,"Hey, guys, it's Janet and Kate. Welcome back t...","Oh, wow, that felt great. Hey, everybody. Welc...",My restaurant. Big games. Big I think big game...
6,Whoa. It's actually a girl. You literally can'...,Look at this man. Look at this man. He's a pro...,"Hey, everyone, it's your friend Think noodles....","What's up, guys? It's gravy. And today we're p...","Bye. Draco, draco. Draco. What happened? There...","I could transform into Donald Trump. Oh, what ...","Hey, get down from there, you fake weirdo. You...","So today in Bloxburg, my boyfriend and I staye...","Hey, guys. Jen what's happening, dude? And wel...","Hey, guys, it's Gravy, and I got a special gue..."
7,"Here, I'll be a really, really good guy and I'...","What's up, guys? Hello. How y'all going on thi...","What's going on, Jake? Who. Game words. Welcom...",What is that? Is that an ant? This is an ant. ...,Welcome back to the Valentina Diva semifinal. ...,He wants to switch teeth. \nWhat? Juan. I drov...,Can you not point that gun on me? It makes me ...,Guys. The game that I helped make survive Albe...,"Cookies. Whirl. See? Hello, chocolatey chippy ...","\nHi, guys. It's Elite, and I'm back with anot..."
8,Cookies. Whirl. See me. Whoa. It looks like I'...,"Hello people. It is me, Master Zubbie here. An...","Excuse me, sir. Put some clothes on. Look how ...","Hey, everyone, it's your friend at Think noodl...","Oh, my God. Yeah. So you might be wondering, w...","Yo, let's go. Alpha alexa. \nYou just fell on ...","Filing sorry, filing cabinet. Crate rock. So L...",Amber? Yeah. Come in here a SEC. I can't make ...,"One, two, three. Go, go. Know how you're faste...",Today we're gonna spend a bit of robots to get...
9,"So what should we do today? I mean, I feel lik...","Hey, guys. Welcome back to the channel. My nam...","As you know, I'm really good at parcourt, so I...","Piggy, but it's a hundred players. This is gon...",Gayer encloser master headphones and hello Jen...,So I was playing Roblox jailbreak with a fan w...,How is it going? Databases? No data here. And ...,"And man, this video's crazy. This one girl, sh...","Hello, everyone, and welcome back to on the Vo...",In today's Roblox video. I am the owner of my ...


# Preprocessing

## 결과 파일 생성

In [12]:
lst_index = []

for col in df.columns:
    for i in range(1, 11):
        index = '{}_{}'.format(col, i)
        lst_index.append(index)

result = pd.concat([df['Adopt Me!'], df['MeepCity'], df['Tower of Hell'], df['Piggy'], df['Royale High'], df['Jailbreak'], 
                    df['Murder Mystery 2'], df['Welcome to Bloxburg'], df['Pizza Factory Tycoon'], df['Restaurant Tycoon']], axis=0)

result.index = lst_index
result = pd.DataFrame(result, columns=['original'])

In [13]:
result

Unnamed: 0,original
Adopt Me!_1,"Actually, you know what? I'm gonna give you a ..."
Adopt Me!_2,"Alright, guys, welcome back to another video. ..."
Adopt Me!_3,ABC for somebody to give me a fly ride. A ride...
Adopt Me!_4,I made a brand new account that has absolutely...
Adopt Me!_5,Hello everybody. Welcome back to my channel. A...
...,...
Restaurant Tycoon_6,My restaurant. Big games. Big I think big game...
Restaurant Tycoon_7,"Hey, guys, it's Gravy, and I got a special gue..."
Restaurant Tycoon_8,"\nHi, guys. It's Elite, and I'm back with anot..."
Restaurant Tycoon_9,Today we're gonna spend a bit of robots to get...


## 불용어

In [14]:
stop_words = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na",
              "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]

dic_slang = {
    'A': ['arse', 'arsehead', 'ass', 'asshole'],
    'B': ['bastard', 'bitch', 'bloody', 'bollocks', 'brotherfucker', 'bugger', 'bullshit'],
    'C': ['child-fucker', 'Christ on a bike', 'Christ on a cracker', 'cock', 'cocksucker', 'crap', 'cunt'],
    'D': ['damn', 'damn it', 'dick', 'dickhead', 'dyke'],
    'F': ['fatherfucker', 'fuck', 'frigger'],
    'G': ['goddamn', 'godsdamn'],
    'H': ['hell', 'holy shit', 'horeseshit'],
    'I': ['in shit'],
    'J': ['Jesus Christ', 'Jesus fuck', 'Jesus H. Christ', 'Jesus Harold Christ', 'Jesus wept', 'Jesus, Mary and Joseph'],
    'K': ['kike'],
    'M': ['motherfucker'],
    'N': ['nigga', 'nigra'],
    'P': ['piss', 'prick', 'pussy'],
    'S': ['shit', 'shit ass', 'shite', 'sisterfucker', 'slut', 'son of a bitch', 'son of a whore', 'spastic'],
    'T': ['turd', 'twat'],
    'W': ['wanker']
}

for key in dic_slang:
    stop_words += dic_slang[key]

In [15]:
stop_words[-20:]

['Jesus wept',
 'Jesus, Mary and Joseph',
 'kike',
 'motherfucker',
 'nigga',
 'nigra',
 'piss',
 'prick',
 'pussy',
 'shit',
 'shit ass',
 'shite',
 'sisterfucker',
 'slut',
 'son of a bitch',
 'son of a whore',
 'spastic',
 'turd',
 'twat',
 'wanker']

### 작은 따옴표(')를 제외한 나머지 특수 문자 제거

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
symbols = string.punctuation.replace("'", "")
symbols

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
for i in range(len(stop_words)):
    for symbol in symbols:
        stop_words[i] = stop_words[i].replace(symbol, "")

In [19]:
stop_words[-20:]

['Jesus wept',
 'Jesus Mary and Joseph',
 'kike',
 'motherfucker',
 'nigga',
 'nigra',
 'piss',
 'prick',
 'pussy',
 'shit',
 'shit ass',
 'shite',
 'sisterfucker',
 'slut',
 'son of a bitch',
 'son of a whore',
 'spastic',
 'turd',
 'twat',
 'wanker']

### 소문자화

In [20]:
for i in range(len(stop_words)):
    stop_words[i] = stop_words[i].lower()

In [21]:
stop_words[-20:]

['jesus wept',
 'jesus mary and joseph',
 'kike',
 'motherfucker',
 'nigga',
 'nigra',
 'piss',
 'prick',
 'pussy',
 'shit',
 'shit ass',
 'shite',
 'sisterfucker',
 'slut',
 'son of a bitch',
 'son of a whore',
 'spastic',
 'turd',
 'twat',
 'wanker']

### 원형 복원
- n: nouns
- v: verbs
- a: adjectives
- r: adverbs
- s: satellite adjectives.

In [22]:
lm = WordNetLemmatizer()

stop_words = [lm.lemmatize(w, pos='n') for w in stop_words]
stop_words = [lm.lemmatize(w, pos='v') for w in stop_words]
stop_words = [lm.lemmatize(w, pos='a') for w in stop_words]
stop_words = [lm.lemmatize(w, pos='r') for w in stop_words]
stop_words = [lm.lemmatize(w, pos='s') for w in stop_words]
print('원형 복원 전 불용어 수: {}'.format(len(stop_words)))

stop_words = set(stop_words)
print('원형 복원 후 불용어 수: {}'.format(len(stop_words)))

원형 복원 전 불용어 수: 1216
원형 복원 후 불용어 수: 1089


## document

### 작은 따옴표(')를 제외한 나머지 특수 문자, 줄바꿈, 공백 제거

In [23]:
def preprocessing(text):
    # 작은 따옴표(')를 제외한 나머지 특수 문자, 줄바꿈, 공백 제거
    for symbol in symbols:
        text = text.replace(symbol, "")
        text = text.replace('\n', "")
        text = text.strip()
        
    # 소문자화
    text = text.lower()
    
    # 원형 복원
    lm = WordNetLemmatizer()
    word_token = text.split()
    word_token = [lm.lemmatize(w, pos='n') for w in word_token]
    word_token = [lm.lemmatize(w, pos='v') for w in word_token]
    word_token = [lm.lemmatize(w, pos='a') for w in word_token]
    
    text = ' '.join(word_token)
    
    # 불용어 제거
    word_token = text.split(' ')
    
    lst_word = []
    for word in word_token:
        if word not in stop_words:
            lst_word.append(word)
            
    text = ' '.join(lst_word) + ' '
    
    ## 숫자제거 
    remove_number = re.compile(r'[0-9]')
    text = remove_number.sub('', text)
    
    ## 길이가 하나인 단어들 제거
    short_word = re.compile(r'\W*\b\w{1}\b')
    text = short_word.sub('', text)
    
    return text

In [24]:
df_video = df.applymap(preprocessing)

In [25]:
df_video

Unnamed: 0,Adopt Me!,MeepCity,Tower of Hell,Piggy,Royale High,Jailbreak,Murder Mystery 2,Welcome to Bloxburg,Pizza Factory Tycoon,Restaurant Tycoon
0,gonna tomato cheese pizza sound gross money ha...,hey guy class science class smack thing talk t...,roblox rage runner dan rage anger crew member ...,hey guy today video today gonna play piggy boo...,hey care bear hailey raw video channel subscri...,catch doughnut whoa wait money guy episode bon...,wait die job charlie guy episode lego vodka to...,guy ready build challenge yeah wait scar liste...,evil girl stab pizza stab noobs gonna knife sh...,star restaurant bunch rich people money today ...
1,alright guy video episode account gonna play g...,yo guy boy alpha banger video guy guy today vi...,sense red extra gravity bounce level accountab...,hey guy kate janet vlogs video channel play to...,channel today gonna spend lot diamond roha lot...,cookieswirl cookie fan today police officer ro...,forget star code ant buy robux mobile platform...,guy excite news woo squad merches nowshopithmi...,family pizza factory tycoon guy people play ...,hey guy korean tarak inside roblox today insid...
2,abc fly ride ride potion abc today gonna trade...,video vibri excite video play game blocksburg ...,hey guy alex zach drake twin jad charlie shut ...,snowpiggy trap channel lclc today snow piggy t...,morning guy today idea everyday routine real h...,today gonna special challenge chris mike ashle...,leave subscribe delinquent wake bed chance ar...,hey guy krie turn tomorrowblocks today inside ...,hey yo eat game video rome black sweet fruit h...,alright morning today day play minecraft today...
3,brand account absolutely gonna poor adopt play...,lord big picme check check lol nice stand nice...,girl girl god fast yeah wait low game guy audr...,piggy book winter holiday hunt stuff roblox pi...,hey puppy puppet girl buckner video today play...,tankfish convict war criminal suppose serve se...,hey guy today gonna stab people hospital kill ...,channel guy today officially blocksburg roomma...,hulk goodness ftt dude chase join today opener...,mmm star guy joke customer money completely st...
4,channel today gonna request video december chr...,hey guy channel week upload roblox video time ...,dom queue understand domcue hurt tower beat ho...,cooky hey cookie fan play piggy check winter u...,happy hope enjoy royal high role play video co...,gonna phase wall quick boy hacker arrest level...,friend quarantine decide edit roblox mm real l...,guy long nap blocksburg gosh gotta day swear ...,video unlock single thing unlock roblox pizza ...,hey guy steviak sushi tycoon roblox boy build ...
5,gosh man folk server fail bad ready cam star c...,today haymich city work routine chloe block vi...,today year play parkour tower type game cracky...,yo shop today scoop hoy love ice cream work to...,hey video video video outfit hack further ado ...,dude suppose play roblox baby sean disappear...,murderer time row alright ultimate alright guy...,hey guy janet kate roblox june play blocksburg...,wow felt great hey roblox adventure today play...,restaurant big game big big game suck suppose ...
6,whoa girl literally perfectly justin couple hi...,man man pro head person capture single thing p...,hey friend noodle roblox today hard roblox abb...,guy gravy today play winter holiday map piggy ...,bye draco draco draco girl ghoul zombie laugh ...,transform donald trump girl girl woman donald ...,hey fake weirdo fraud real antman avenger hey ...,today bloxburg boyfriend stay super fancy hote...,hey guy jen dude roadblock video play superher...,hey guy gravy special guest gravy karliman dag...
7,guy rare item game guy wait pet offensive adop...,guy'all marvelous night guess today roadblock...,jake game channel silla mad today play roblox ...,ant ant spidella misery business guy pg jones ...,valentina diva semifinal season valentina diva...,switch teeth juan drive drive donut truck leav...,point gun sweat lunar sweat boot luna dirty he...,guy game survive albert tie bite gotta start i...,cooky whirl chocolatey chippy cooky ready pizz...,guy elite video roadblock restaurant tycoon bu...
8,cooky whirl whoa baby sleep nice relax crib ad...,people master zubbie today review script basic...,excuse sir clothe concentrate wow play roblox ...,hey friend noodle roblox piggy time dislike vi...,god yeah remember long time ago start series b...,yo alpha alexa fell grind wow funny combo fung...,file file cabinet crate rock lcd upload yeah t...,amber yeah food urgent urgent snowstorm quick ...,fast catch lap boom play player pizza tycoon j...,today gonna spend bite robot restaurant restau...
9,today feel walk house eat kind bore leave hous...,hey guy channel cupcake today gonna play roblo...,parcourt gonna tower gonna easy professional r...,piggy player gonna suck gonna bad basically pi...,gay encloser master headphone jennifercom enco...,play roblox jailbreak fan forget robucks dad a...,database data excite episode murder mystery ex...,man video crazy girl twitch turn lad watch vid...,voblox video today play brand pizza factory co...,today roblox video owner restaurant scary guy ...


# 전략

해당데이터가 각각의 기준에 들어있는지 확인

In [26]:
cnt_data

Unnamed: 0,Adopt Me!,MeepCity,Tower of Hell,Piggy,Royale High,Jailbreak,Murder Mystery 2,Welcome to Bloxburg,Pizza Factory Tycoon,Restaurant Tycoon
0,trade,bro,jump,piggy,guy,guy,guy,guy,pizza,restaurant
1,pet,house,guy,yeah,time,wait,murderer,yeah,yeah,yeah
2,guy,guy,yeah,guy,outfit,cop,wait,wait,buy,cook
3,people,yeah,wait,god,diamond,yeah,sheriff,pizza,wait,guy
4,wait,wait,time,wait,wait,car,yeah,house,guy,money
5,grandpa,party,easy,key,high,arrest,dude,room,customer,table
6,neon,meat,game,shovel,cute,police,kill,time,money,buy
7,yeah,game,god,skin,dorm,card,god,eat,cheese,people
8,feel,play,buy,die,yeah,key,bro,girl,cool,chef
9,dragon,cute,level,play,start,store,gun,baby,bro,thing


In [27]:
len(set(cnt_data[cnt_data.columns[1:]].values.flatten()))

52

## 만들려고 하는 데이터프레임의 column이 되는 단어들 만들기 

## Wordcount

In [28]:
games_li = ['Adopt Me!', 'MeepCity', 'Tower of Hell', 'Piggy',
       'Royale High', 'Jailbreak', 'Murder Mystery 2', 'Welcome to Bloxburg',
       'Pizza Factory Tycoon', 'Restaurant Tycoon']

### wordount
cnt_word_li = []

for game in games_li :
    cnt_word_li.extend(cnt_data[game].values)

columns_words = sorted(list(set(cnt_word_li)))

In [29]:
dic={}
for game in games_li : 
    for ii in range(10):
        variable_name = game + '_' + str(ii)
        dic[variable_name] = []
        lst_text = df_video[game].loc[ii].split(' ')
        for column in columns_words :
            dic[variable_name].append(lst_text.count(column))

In [30]:
new_cnt_df = pd.DataFrame(dic, index=columns_words).T
new_cnt_df

Unnamed: 0,arrest,baby,bro,buy,car,card,cheese,chef,cook,cool,...,shovel,skin,start,store,table,thing,time,trade,wait,yeah
Adopt Me!_0,0,35,0,9,2,0,3,0,0,1,...,0,0,0,21,0,2,8,0,6,18
Adopt Me!_1,0,0,17,0,0,0,0,0,0,3,...,0,0,6,0,0,3,4,22,2,3
Adopt Me!_2,0,1,0,3,0,0,0,0,0,2,...,0,0,2,0,0,3,2,28,1,3
Adopt Me!_3,0,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,1,3,49,19,3
Adopt Me!_4,0,2,0,0,0,0,0,0,0,3,...,0,0,2,0,0,2,5,56,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Restaurant Tycoon_5,0,0,3,5,0,0,0,0,3,1,...,0,0,5,2,2,2,5,0,4,2
Restaurant Tycoon_6,0,0,2,50,1,0,2,14,30,2,...,0,0,0,2,14,11,3,1,16,29
Restaurant Tycoon_7,0,0,0,3,0,0,0,3,0,2,...,0,0,9,1,20,3,0,0,1,7
Restaurant Tycoon_8,0,0,0,5,0,0,1,13,3,0,...,0,0,2,2,12,4,3,0,8,3


In [31]:
# new_cnt_df.to_csv('wordcount_top10_for_tsne.csv')

## Fisher's exact Test

In [32]:
### fisher's exact test 
fisher_word_li = []

for game in games_li :
    fisher_word_li.extend(fisher_data[game].values)
columns_words = sorted(list(set(fisher_word_li)))

In [33]:
dic={}
for game in games_li : 
    for ii in range(10):
        variable_name = game + '_' + str(ii)
        dic[variable_name] = []
        lst_text = df_video[game].loc[ii].split(' ')
        for column in columns_words :
            dic[variable_name].append(lst_text.count(column))

In [34]:
new_fisher_df = pd.DataFrame(dic, index=columns_words).T
new_fisher_df

Unnamed: 0,adopt,arrest,babe,baby,banana,bank,beat,bella,birthday,blocksburg,...,skirt,snow,spaghetti,sushi,table,teacher,tower,trade,trap,waiter
Adopt Me!_0,14,0,0,35,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Adopt Me!_1,1,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,22,0,0
Adopt Me!_2,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,28,0,0
Adopt Me!_3,9,0,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,49,0,0
Adopt Me!_4,6,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,2,56,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Restaurant Tycoon_5,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,2,0,0,0,0,0
Restaurant Tycoon_6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,14,0,0,1,0,18
Restaurant Tycoon_7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,20,0,0,0,0,0
Restaurant Tycoon_8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,12,0,0,0,0,14


In [35]:
# new_fisher_df.to_csv('fisherexact_top10_for_tsne.csv')

## Tf-Idf

In [36]:
### Tf-Idf방법
tfidf_word_li = []

for game in games_li :
    tfidf_word_li.extend(tfidf_data[game].values)

columns_words = sorted(list(set(tfidf_word_li)))

In [37]:
dic={}
for game in games_li : 
    for ii in range(10):
        variable_name = game + '_' + str(ii)
        dic[variable_name] = []
        lst_text = df_video[game].loc[ii].split(' ')
        for column in columns_words :
            dic[variable_name].append(lst_text.count(column))

In [38]:
new_tfidfcount_df = pd.DataFrame(dic, index=columns_words).T
new_tfidfcount_df

Unnamed: 0,adopt,arrest,bella,blocksburg,bro,buy,car,card,carrot,checkpoint,...,shovel,skin,skirt,table,time,tower,trade,wait,waiter,yeah
Adopt Me!_0,14,0,0,0,0,9,2,0,0,0,...,0,0,0,0,8,0,0,6,0,18
Adopt Me!_1,1,0,0,0,17,0,0,0,0,0,...,0,0,0,0,4,0,22,2,0,3
Adopt Me!_2,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,2,0,28,1,0,3
Adopt Me!_3,9,0,0,0,0,1,0,0,0,0,...,0,0,0,0,3,0,49,19,0,3
Adopt Me!_4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,2,56,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Restaurant Tycoon_5,0,0,0,0,3,5,0,0,0,0,...,0,0,0,2,5,0,0,4,0,2
Restaurant Tycoon_6,0,0,0,0,2,50,1,0,0,0,...,0,0,0,14,3,0,1,16,18,29
Restaurant Tycoon_7,0,0,0,0,0,3,0,0,0,0,...,0,0,0,20,0,0,0,1,0,7
Restaurant Tycoon_8,0,0,0,0,0,5,0,0,0,0,...,0,0,0,12,3,0,0,8,14,3


In [39]:
# new_tfidfcount_df.to_csv('tfidf_top10_for_tsne.csv')