In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn

import re
import string
import glob
import warnings

# Absolute path of the NanumBarunGothic TrueType file on this machine.
# NOTE(review): machine-specific path — confirm the font is installed there.
fontpath = '/usr/share/fonts/truetype/nanum/Nanum/NanumBarunGothic.ttf'
# FontProperties handle for that file at size 10 (not referenced again in the
# visible part of the notebook).
font = fm.FontProperties(fname=fontpath, size=10)
# Global matplotlib defaults: style sheet, font family and figure size.
# NOTE(review): matplotlib >= 3.6 renamed the seaborn style sheets to
# 'seaborn-v0_8-*' — confirm 'seaborn-white' still resolves in the target env.
plt.style.use('seaborn-white')
plt.rc('font', family='NanumBarunGothic')
plt.rcParams['figure.figsize'] = [20, 10]
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources used by this notebook (a no-op when already cached).
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
# WordNetLemmatizer, used by the lemmatization steps, reads the WordNet corpus;
# without this download lemmatize() raises LookupError on a fresh machine.
# NOTE(review): some NLTK releases additionally need 'omw-1.4' — confirm for
# the pinned NLTK version.
nltk.download('wordnet')

%config InlineBackend.figure_format = 'retina'

[nltk_data] Downloading package stopwords to /home/kseung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kseung/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /home/kseung/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


# Data Load

In [2]:
# Load the word list; the first CSV column is used as the row index.
df_word = pd.read_csv('../../reduction_word.csv', index_col=0)
df_word

Unnamed: 0,word
0,purchase
1,kickback
2,hear
3,neon
4,spawn
...,...
1590,chicken
1591,icicle
1592,change
1593,robber


In [3]:
path = '/data/Roblox/'

In [4]:
lst_dir = glob.glob(path + '/*')

In [5]:
lst_dir

['/data/Roblox/Piggy',
 '/data/Roblox/Adopt Me!',
 '/data/Roblox/Welcome to Bloxburg',
 '/data/Roblox/Murder Mystery 2',
 '/data/Roblox/Jailbreak',
 '/data/Roblox/Royale High',
 '/data/Roblox/Restaurant Tycoon',
 '/data/Roblox/Tower of Hell',
 '/data/Roblox/Pizza Factory Tycoon',
 '/data/Roblox/MeepCity']

In [6]:
# One column per game: the column name is the last path component of each
# game directory.
lst_col = [game_dir.split('/')[-1] for game_dir in lst_dir]

# Empty frame that only carries the column names for now.
df = pd.DataFrame(columns=lst_col)
df

Unnamed: 0,Piggy,Adopt Me!,Welcome to Bloxburg,Murder Mystery 2,Jailbreak,Royale High,Restaurant Tycoon,Tower of Hell,Pizza Factory Tycoon,MeepCity


In [7]:
lst_data = []
for directory in lst_dir:
    # The folder name is both the game title and the DataFrame column name.
    game_name = directory.split('/')[-1]

    print('=' * 50)
    print('게임 이름: {}'.format(game_name))
    print('=' * 50)
    lst_path = glob.glob(directory + '/*')

    # Full text of every transcript file found in this game's folder.
    lst_text = []
    for txt_path in lst_path:
        print(txt_path)
        # Explicit encoding so the result does not depend on the platform's
        # default locale. NOTE(review): assumes the transcripts are UTF-8.
        with open(txt_path, 'r', encoding='utf-8') as f:
            lst_text.append(f.read())

    # Each game must contribute the same number of documents: once the first
    # column has fixed the index, pandas rejects a list of a different length.
    df[game_name] = lst_text

게임 이름: Piggy
/data/Roblox/Piggy/Piggy Season 6 WHISTLING WINTER! (Meisery Skin).txt
/data/Roblox/Piggy/Piggy Winter Holiday!  Roblox.txt
/data/Roblox/Piggy/HOW TO GET SNOWPIGGY TRAP in PIGGY WINTER HOLIDAY EVENT MAP.txt
/data/Roblox/Piggy/SECRET PRIMROSE SKIN! ROBLOX PIGGY WINTER HOLIDAY HUNT.txt
/data/Roblox/Piggy/Escape The Christmas Winter Holiday Piggy Roblox Map  Cookie Swirl C.txt
/data/Roblox/Piggy/ROBLOX PIGGY @ the MALL!  Chapter 10 FGTeeV Multiplayer Escape (The Secret is Out).txt
/data/Roblox/Piggy/ZIZZY IS BACK FOR CHRISTMAS!! - Roblox Piggy Winter Holiday Chapter.txt
/data/Roblox/Piggy/ROBLOX PIGGY elsa bunny and meisery.txt
/data/Roblox/Piggy/THE END of Roblox PIGGY!.txt
/data/Roblox/Piggy/Roblox PIGGY... but with 100 PLAYERS.txt
게임 이름: Adopt Me!
/data/Roblox/Adopt Me!/DON'T TAKE ME AWAY FROM MY DAD!! SHE WON'T STOP FOLLOWING ME! ROBLOX ADOPT ME! (Roblox Roleplay).txt
/data/Roblox/Adopt Me!/Starting on a NEW Account (Part 3 Adopt me).txt
/data/Roblox/Adopt Me!/I Traded FL

In [8]:
# Put the game columns into a fixed, hand-picked order (glob returned them in
# arbitrary order); later cells stack the columns in this order.
df = df[['Adopt Me!', 'MeepCity', 'Tower of Hell', 'Piggy', 'Royale High',
         'Jailbreak', 'Murder Mystery 2', 'Welcome to Bloxburg', 'Pizza Factory Tycoon', 'Restaurant Tycoon']]

In [9]:
df

Unnamed: 0,Adopt Me!,MeepCity,Tower of Hell,Piggy,Royale High,Jailbreak,Murder Mystery 2,Welcome to Bloxburg,Pizza Factory Tycoon,Restaurant Tycoon
0,"Actually, you know what? I'm gonna give you a ...","Hey, guys, I'm in class right now. It's scienc...",Welcome to Roblox Rage Runner. Dan and then ra...,"Hey, guys. And today we're back onto another v...","Hey, Care bears. It's Hailey here. Welcome bac...",Never catch me. I'm moving. It doesn't have yo...,"Wait, don't die. Oh, no. Good job. Go get him....",Are you guys ready for a build challenge? No. ...,It's the evil girls that try to stab us. She s...,So I always wanted to own a five star restaura...
1,"Alright, guys, welcome back to another video. ...","Yo, what's up, guys? Your boy. Alpha One here....","No, but it doesn't make sense because it shows...","Hey, guys, it's Kate and Janet. Welcome back t...","Hello, everyone, and welcome to my channel. To...","CookieSwirl. See? Hello, Cookie fans. Today I ...",Don't forget to use star code Ant when you buy...,"Guys, we have exciting news. Woo. What is it? ...","What is up, my family? We are back, and we are...","Hey, guys. It's Korean. Tarak inside. Roblox. ..."
2,ABC for somebody to give me a fly ride. A ride...,"Hello, everyone. Welcome to another video by m...","Hey, guys, it's Alex and Zach and Drake. And w...",I got the snowpiggy trap and I'm going to show...,"Good morning, guys. Today I thought it would b...",So today we're gonna be doing a special challe...,Leave a like and subscribe in the next 3.2 sec...,"Hey guys, it's Krie turn back. It's at tomorro...","Should I do it? Should I do it? Hey, yo, what'...","Alright. Good morning, everybody. And today is..."
3,I made a brand new account that has absolutely...,"Oh, my Lord. Don't I just look like the bigges...","Oh, girl. Oh, girl. Oh my God. Why is it going...","Piggy book two. Winter holiday hunt. Oh, we go...","Hey, my little puppies puppet girl. Here buckn...","Hi, I'm Tankfish, and I'm a convicted war crim...","Hey guys, today we're gonna be stabbing people...","Welcome back to the channel, guys. Today we ha...","I'm about to become the Hulk. But what the oh,...","\nMMM stars. Guys, what are you doing? Are you..."
4,Hello everybody. Welcome back to my channel. A...,"Hey, guys. Welcome back to my channel. Last we...",Okay. Dom Q. Please don't queue. Please. I don...,"Cookies world. See? Hey, cookie fans. Welcome ...",Yes. I'm so happy you are here. I hope you enj...,I'm gonna quickly phase through the wall reall...,"Since I have no friends in Quarantine, I decid...","Oh, guys, I was just taking the longest nap in...","In this video, we unlock every single thing yo...","Hey guys, my name is Steviak and welcome to Su..."
5,"Oh, my gosh. Oh, man. Here we go, folks. Here ...","Hello, and today I am Haymich City, and I want...","Today, I get to do something that I've been wa...","Yo, where is everybody? Hello. Anybody else sh...","Hey, everyone. Welcome back to another video v...","Dude, it's 03:00 A.m.. We're not supposed to p...",So I am the murderer two times in a row. Alrig...,"Hey, guys, it's Janet and Kate. Welcome back t...","Oh, wow, that felt great. Hey, everybody. Welc...",My restaurant. Big games. Big I think big game...
6,Whoa. It's actually a girl. You literally can'...,Look at this man. Look at this man. He's a pro...,"Hey, everyone, it's your friend Think noodles....","What's up, guys? It's gravy. And today we're p...","Bye. Draco, draco. Draco. What happened? There...","I could transform into Donald Trump. Oh, what ...","Hey, get down from there, you fake weirdo. You...","So today in Bloxburg, my boyfriend and I staye...","Hey, guys. Jen what's happening, dude? And wel...","Hey, guys, it's Gravy, and I got a special gue..."
7,"Here, I'll be a really, really good guy and I'...","What's up, guys? Hello. How y'all going on thi...","What's going on, Jake? Who. Game words. Welcom...",What is that? Is that an ant? This is an ant. ...,Welcome back to the Valentina Diva semifinal. ...,He wants to switch teeth. \nWhat? Juan. I drov...,Can you not point that gun on me? It makes me ...,Guys. The game that I helped make survive Albe...,"Cookies. Whirl. See? Hello, chocolatey chippy ...","\nHi, guys. It's Elite, and I'm back with anot..."
8,Cookies. Whirl. See me. Whoa. It looks like I'...,"Hello people. It is me, Master Zubbie here. An...","Excuse me, sir. Put some clothes on. Look how ...","Hey, everyone, it's your friend at Think noodl...","Oh, my God. Yeah. So you might be wondering, w...","Yo, let's go. Alpha alexa. \nYou just fell on ...","Filing sorry, filing cabinet. Crate rock. So L...",Amber? Yeah. Come in here a SEC. I can't make ...,"One, two, three. Go, go. Know how you're faste...",Today we're gonna spend a bit of robots to get...
9,"So what should we do today? I mean, I feel lik...","Hey, guys. Welcome back to the channel. My nam...","As you know, I'm really good at parcourt, so I...","Piggy, but it's a hundred players. This is gon...",Gayer encloser master headphones and hello Jen...,So I was playing Roblox jailbreak with a fan w...,How is it going? Databases? No data here. And ...,"And man, this video's crazy. This one girl, sh...","Hello, everyone, and welcome back to on the Vo...",In today's Roblox video. I am the owner of my ...


# Preprocessing

## 결과 파일 생성

In [10]:
# Row labels of the form '<game>_<n>', n = 1..number of documents per game.
# The document count and the column order are both taken from `df`, so the
# labels cannot drift out of sync with the stacked data below.
lst_index = [
    '{}_{}'.format(col, i)
    for col in df.columns
    for i in range(1, len(df) + 1)
]

# Stack the per-game columns into one long Series, in the same column order
# that was used to generate the labels.
result = pd.concat([df[col] for col in df.columns], axis=0)

result.index = lst_index
# Single-column frame holding the untouched transcript text.
result = result.to_frame(name='original')

In [11]:
result

Unnamed: 0,original
Adopt Me!_1,"Actually, you know what? I'm gonna give you a ..."
Adopt Me!_2,"Alright, guys, welcome back to another video. ..."
Adopt Me!_3,ABC for somebody to give me a fly ride. A ride...
Adopt Me!_4,I made a brand new account that has absolutely...
Adopt Me!_5,Hello everybody. Welcome back to my channel. A...
...,...
Restaurant Tycoon_6,My restaurant. Big games. Big I think big game...
Restaurant Tycoon_7,"Hey, guys, it's Gravy, and I got a special gue..."
Restaurant Tycoon_8,"\nHi, guys. It's Elite, and I'm back with anot..."
Restaurant Tycoon_9,Today we're gonna spend a bit of robots to get...


## 불용어

In [12]:
stop_words = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", 
"dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", 
"it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na",
              "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", 
"she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", 
"welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]

dic_slang = {
    'A': ['arse', 'arsehead', 'ass', 'asshole'],
    'B': ['bastard', 'bitch', 'bloody', 'bollocks', 'brotherfucker', 'bugger', 'bullshit'],
    'C': ['child-fucker', 'Christ on a bike', 'Christ on a cracker', 'cock', 'cocksucker', 'crap', 'cunt'],
    'D': ['damn', 'damn it', 'dick', 'dickhead', 'dyke'],
    'F': ['fatherfucker', 'fuck', 'frigger'],
    'G': ['goddamn', 'godsdamn'],
    'H': ['hell', 'holy shit', 'horeseshit'],
    'I': ['in shit'],
    'J': ['Jesus Christ', 'Jesus fuck', 'Jesus H. Christ', 'Jesus Harold Christ', 'Jesus wept', 'Jesus, Mary and Joseph'],
    'K': ['kike'],
    'M': ['motherfucker'],
    'N': ['nigga', 'nigra'],
    'P': ['piss', 'prick', 'pussy'],
    'S': ['shit', 'shit ass', 'shite', 'sisterfucker', 'slut', 'son of a bitch', 'son of a whore', 'spastic'],
    'T': ['turd', 'twat'],
    'W': ['wanker']
}

# Append every slang entry (all letter groups) to the stop-word list in place.
for slang_group in dic_slang.values():
    stop_words.extend(slang_group)

In [13]:
stop_words[-20:]

['Jesus wept',
 'Jesus, Mary and Joseph',
 'kike',
 'motherfucker',
 'nigga',
 'nigra',
 'piss',
 'prick',
 'pussy',
 'shit',
 'shit ass',
 'shite',
 'sisterfucker',
 'slut',
 'son of a bitch',
 'son of a whore',
 'spastic',
 'turd',
 'twat',
 'wanker']

### 작은 따옴표(')를 제외한 나머지 특수 문자 제거

In [14]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
symbols = string.punctuation.replace("'", "")
symbols

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
# Delete every character listed in `symbols` from each stop word, updating the
# existing list in place.
symbol_table = str.maketrans('', '', symbols)
for idx, word in enumerate(stop_words):
    stop_words[idx] = word.translate(symbol_table)

In [17]:
stop_words[-20:]

['Jesus wept',
 'Jesus Mary and Joseph',
 'kike',
 'motherfucker',
 'nigga',
 'nigra',
 'piss',
 'prick',
 'pussy',
 'shit',
 'shit ass',
 'shite',
 'sisterfucker',
 'slut',
 'son of a bitch',
 'son of a whore',
 'spastic',
 'turd',
 'twat',
 'wanker']

### 소문자화

In [18]:
# Lowercase every stop word, updating the existing list in place.
stop_words[:] = [word.lower() for word in stop_words]

In [19]:
stop_words[-20:]

['jesus wept',
 'jesus mary and joseph',
 'kike',
 'motherfucker',
 'nigga',
 'nigra',
 'piss',
 'prick',
 'pussy',
 'shit',
 'shit ass',
 'shite',
 'sisterfucker',
 'slut',
 'son of a bitch',
 'son of a whore',
 'spastic',
 'turd',
 'twat',
 'wanker']

### 원형 복원
- n: nouns
- v: verbs
- a: adjectives
- r: adverbs
- s: satellite adjectives.

In [20]:
lm = WordNetLemmatizer()

# Pass the whole list through the lemmatizer once per WordNet POS code, in the
# order noun, verb, adjective, adverb, satellite adjective.
for wn_pos in ('n', 'v', 'a', 'r', 's'):
    stop_words = [lm.lemmatize(w, pos=wn_pos) for w in stop_words]
print('원형 복원 전 불용어 수: {}'.format(len(stop_words)))

# Converting to a set drops the entries that are now duplicates.
stop_words = set(stop_words)
print('원형 복원 후 불용어 수: {}'.format(len(stop_words)))

원형 복원 전 불용어 수: 1216
원형 복원 후 불용어 수: 1089


## document

### 작은 따옴표(')를 제외한 나머지 특수 문자, 줄바꿈, 공백 제거

In [21]:
def remove_symbols(text, chars=None):
    """Delete special characters and newlines from ``text`` and trim its ends.

    Parameters
    ----------
    text : str
        Text to clean.
    chars : str, optional
        Characters to delete. Defaults to ``string.punctuation`` without the
        apostrophe, so contractions such as "I'm" are kept intact.

    Returns
    -------
    str
        ``text`` with every character in ``chars`` and every newline removed,
        and leading/trailing whitespace stripped.
    """
    if chars is None:
        chars = string.punctuation.replace("'", "")

    # One translate pass replaces the per-symbol replace loop; the newline
    # removal and strip used to be repeated on every loop iteration.
    # NOTE(review): newlines are deleted, not replaced by a space, so words
    # separated only by a newline are joined together.
    delete_table = str.maketrans('', '', chars + '\n')
    return text.translate(delete_table).strip()

In [22]:
result['special symbols'] = result['original'].map(remove_symbols)

In [23]:
result

Unnamed: 0,original,special symbols
Adopt Me!_1,"Actually, you know what? I'm gonna give you a ...",Actually you know what I'm gonna give you a to...
Adopt Me!_2,"Alright, guys, welcome back to another video. ...",Alright guys welcome back to another video Thi...
Adopt Me!_3,ABC for somebody to give me a fly ride. A ride...,ABC for somebody to give me a fly ride A ride ...
Adopt Me!_4,I made a brand new account that has absolutely...,I made a brand new account that has absolutely...
Adopt Me!_5,Hello everybody. Welcome back to my channel. A...,Hello everybody Welcome back to my channel And...
...,...,...
Restaurant Tycoon_6,My restaurant. Big games. Big I think big game...,My restaurant Big games Big I think big games ...
Restaurant Tycoon_7,"Hey, guys, it's Gravy, and I got a special gue...",Hey guys it's Gravy and I got a special guest ...
Restaurant Tycoon_8,"\nHi, guys. It's Elite, and I'm back with anot...",Hi guys It's Elite and I'm back with another v...
Restaurant Tycoon_9,Today we're gonna spend a bit of robots to get...,Today we're gonna spend a bit of robots to get...


### 소문자화

In [24]:
def lower(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

In [25]:
result['lower'] = result['special symbols'].map(lower)

In [26]:
result

Unnamed: 0,original,special symbols,lower
Adopt Me!_1,"Actually, you know what? I'm gonna give you a ...",Actually you know what I'm gonna give you a to...,actually you know what i'm gonna give you a to...
Adopt Me!_2,"Alright, guys, welcome back to another video. ...",Alright guys welcome back to another video Thi...,alright guys welcome back to another video thi...
Adopt Me!_3,ABC for somebody to give me a fly ride. A ride...,ABC for somebody to give me a fly ride A ride ...,abc for somebody to give me a fly ride a ride ...
Adopt Me!_4,I made a brand new account that has absolutely...,I made a brand new account that has absolutely...,i made a brand new account that has absolutely...
Adopt Me!_5,Hello everybody. Welcome back to my channel. A...,Hello everybody Welcome back to my channel And...,hello everybody welcome back to my channel and...
...,...,...,...
Restaurant Tycoon_6,My restaurant. Big games. Big I think big game...,My restaurant Big games Big I think big games ...,my restaurant big games big i think big games ...
Restaurant Tycoon_7,"Hey, guys, it's Gravy, and I got a special gue...",Hey guys it's Gravy and I got a special guest ...,hey guys it's gravy and i got a special guest ...
Restaurant Tycoon_8,"\nHi, guys. It's Elite, and I'm back with anot...",Hi guys It's Elite and I'm back with another v...,hi guys it's elite and i'm back with another v...
Restaurant Tycoon_9,Today we're gonna spend a bit of robots to get...,Today we're gonna spend a bit of robots to get...,today we're gonna spend a bit of robots to get...


### 원형 복원

In [27]:
def lemmatization(text):
    """Lemmatize every space-separated token of ``text``.

    Each token is passed through the notebook-level lemmatizer ``lm`` once per
    WordNet POS code, in the order n, v, a, r, s, and the tokens are joined
    back with single spaces, so the token count is unchanged.

    NOTE(review): the POS codes are applied blindly rather than chosen per
    token, which over-lemmatizes some words (the displayed results contain
    "has" -> "ha" and "bit" -> "bite"). The stop-word list is normalised the
    same way, so any change here should be mirrored there.
    """
    tokens = text.split(' ')

    for wn_pos in ('n', 'v', 'a', 'r', 's'):
        tokens = [lm.lemmatize(token, pos=wn_pos) for token in tokens]

    return ' '.join(tokens)

In [28]:
result['lemmatization'] = result['lower'].map(lemmatization)

In [29]:
result

Unnamed: 0,original,special symbols,lower,lemmatization
Adopt Me!_1,"Actually, you know what? I'm gonna give you a ...",Actually you know what I'm gonna give you a to...,actually you know what i'm gonna give you a to...,actually you know what i'm gonna give you a to...
Adopt Me!_2,"Alright, guys, welcome back to another video. ...",Alright guys welcome back to another video Thi...,alright guys welcome back to another video thi...,alright guy welcome back to another video this...
Adopt Me!_3,ABC for somebody to give me a fly ride. A ride...,ABC for somebody to give me a fly ride A ride ...,abc for somebody to give me a fly ride a ride ...,abc for somebody to give me a fly ride a ride ...
Adopt Me!_4,I made a brand new account that has absolutely...,I made a brand new account that has absolutely...,i made a brand new account that has absolutely...,i make a brand new account that ha absolutely ...
Adopt Me!_5,Hello everybody. Welcome back to my channel. A...,Hello everybody Welcome back to my channel And...,hello everybody welcome back to my channel and...,hello everybody welcome back to my channel and...
...,...,...,...,...
Restaurant Tycoon_6,My restaurant. Big games. Big I think big game...,My restaurant Big games Big I think big games ...,my restaurant big games big i think big games ...,my restaurant big game big i think big game su...
Restaurant Tycoon_7,"Hey, guys, it's Gravy, and I got a special gue...",Hey guys it's Gravy and I got a special guest ...,hey guys it's gravy and i got a special guest ...,hey guy it's gravy and i get a special guest o...
Restaurant Tycoon_8,"\nHi, guys. It's Elite, and I'm back with anot...",Hi guys It's Elite and I'm back with another v...,hi guys it's elite and i'm back with another v...,hi guy it's elite and i'm back with another vi...
Restaurant Tycoon_9,Today we're gonna spend a bit of robots to get...,Today we're gonna spend a bit of robots to get...,today we're gonna spend a bit of robots to get...,today we're gonna spend a bite of robot to get...


## 전처리 전, 전처리 후 비교

In [30]:
def comparison(text1, text2, stop_set=None):
    """Mask stop words in ``text2`` and pad each token to its original width.

    Both texts are split on single spaces and compared token by token.

    Parameters
    ----------
    text1 : str
        Text before lemmatization.
    text2 : str
        Text after lemmatization; expected to have the same token count.
    stop_set : collection of str, optional
        Tokens to mask. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    str
        ``text2`` where every stop word is replaced by underscores and every
        token shorter than its counterpart in ``text1`` is right-padded with
        underscores up to the original length. Tokens of ``text2`` that have
        no counterpart in ``text1`` are kept unchanged.
    """
    if stop_set is None:
        stop_set = stop_words

    original_tokens = text1.split(' ')
    lemma_tokens = text2.split(' ')

    masked = []
    # zip stops at the shorter side, so a token-count mismatch cannot raise.
    for original, lemma in zip(original_tokens, lemma_tokens):
        shown = '_' * len(lemma) if lemma in stop_set else lemma

        # Pad by the length difference. The previous check compared the two
        # strings lexicographically, which skipped the padding whenever the
        # shortened lemma sorted after the original (e.g. "better" -> "good").
        missing = len(original) - len(lemma)
        if missing > 0:
            shown += '_' * missing

        masked.append(shown)

    # Keep any surplus lemma tokens as they are.
    masked.extend(lemma_tokens[len(masked):])

    return ' '.join(masked)

In [31]:
# For every document, pair its lowercased text with its lemmatized text and
# build the masked/padded comparison string, keeping row order.
lst_text = [
    comparison(lower_text, lemma_text)
    for lower_text, lemma_text in zip(result['lower'], result['lemmatization'])
]

In [32]:
lst_text

["________ ___ ____ ____ ___ gonna ____ ___ _ tomato ___ cheese pizza ______ __ __ ____ sound_ gross _ _____ ____ ______ money ___ hawaii ____ __ twin sister hawaiian _____ hit ___ ____ __ sister _ hawaiian hawaiian ____ __ ___ life ____ chill___ __ _ baby carriage _____ _______ _____ ___ cut___ baby _ _____ ____ _______ __ adopt __ yeah ______ ______ ____ adopt __ _______ _____ twin_ ___ __ ___ __ cute ________ _ __ _______ ___ _ ___ baby __ adopt ____ __ ____ grandpa ____ __ ___ outfit __ _______ ____ match __ __ gosh _ hope __ _______ adopt __ __ ______ ____ ____ ___ ______ __ adopt _ ______ baby ______ ____ couch _______ stay ____ ____ __ _____ ____ __ ___ adopt__ __ _______ peepies __ ____ cradle ____ _____ _ cradle lady __ ____ ____ ___ __ ____ ____ one's ____ ____ _______ __ ___ funny ____ care __ ___ _ ______ _______ __ ___ _ ___ _______ __ ___ __ ___ _____ adopt __ _____ twin_ __ ___ ____ __ ___ ___ __ ____ ___ ___ _____ __ ugly ___ dare ___ sir ___ ____ ___ _ baby _ ___ baby 

In [33]:
result['comparison'] = lst_text

In [34]:
result

Unnamed: 0,original,special symbols,lower,lemmatization,comparison
Adopt Me!_1,"Actually, you know what? I'm gonna give you a ...",Actually you know what I'm gonna give you a to...,actually you know what i'm gonna give you a to...,actually you know what i'm gonna give you a to...,________ ___ ____ ____ ___ gonna ____ ___ _ to...
Adopt Me!_2,"Alright, guys, welcome back to another video. ...",Alright guys welcome back to another video Thi...,alright guys welcome back to another video thi...,alright guy welcome back to another video this...,alright guy_ _______ ____ __ _______ video ___...
Adopt Me!_3,ABC for somebody to give me a fly ride. A ride...,ABC for somebody to give me a fly ride A ride ...,abc for somebody to give me a fly ride a ride ...,abc for somebody to give me a fly ride a ride ...,abc ___ ________ __ ____ __ _ fly ride _ ride ...
Adopt Me!_4,I made a brand new account that has absolutely...,I made a brand new account that has absolutely...,i made a brand new account that has absolutely...,i make a brand new account that ha absolutely ...,_ ____ _ brand ___ account ____ ___ absolutely...
Adopt Me!_5,Hello everybody. Welcome back to my channel. A...,Hello everybody Welcome back to my channel And...,hello everybody welcome back to my channel and...,hello everybody welcome back to my channel and...,_____ _________ _______ ____ __ __ channel ___...
...,...,...,...,...,...
Restaurant Tycoon_6,My restaurant. Big games. Big I think big game...,My restaurant Big games Big I think big games ...,my restaurant big games big i think big games ...,my restaurant big game big i think big game su...,__ restaurant big game_ big _ _____ big game_ ...
Restaurant Tycoon_7,"Hey, guys, it's Gravy, and I got a special gue...",Hey guys it's Gravy and I got a special guest ...,hey guys it's gravy and i got a special guest ...,hey guy it's gravy and i get a special guest o...,hey guy_ ____ gravy ___ _ ___ _ special guest ...
Restaurant Tycoon_8,"\nHi, guys. It's Elite, and I'm back with anot...",Hi guys It's Elite and I'm back with another v...,hi guys it's elite and i'm back with another v...,hi guy it's elite and i'm back with another vi...,__ guy_ ____ elite ___ ___ ____ ____ _______ v...
Restaurant Tycoon_9,Today we're gonna spend a bit of robots to get...,Today we're gonna spend a bit of robots to get...,today we're gonna spend a bit of robots to get...,today we're gonna spend a bite of robot to get...,today _____ gonna spend _ bite __ robot_ __ __...


In [35]:
result.to_excel('result.xlsx')