## Compile slang

from [https://slangit.com/terms/social_media-all], [https://www.netlingo.com/acronyms.php]

In [1]:
# Import libraries
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import pickle

# Progress bar
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [26]:
def scrape_from_slangit(url):
    # Create an URL object
    # Create object page
    page = requests.get(url)

    # parser-lxml = Change html to Python friendly format
    # Obtain page's information
    soup = BeautifulSoup(page.text, 'lxml')

    # Obtain information from tag <table>
    table1 = soup.find('tbody')

    # Convert wrapped text in column 13 into one line text
    headers = ['slang','meaning']

    # Create a dataframe
    mydata = pd.DataFrame(columns = headers)
    
    # Create a for loop to fill mydata
    for j in table1.find_all('tr')[0:]:
        row_data = j.find_all('td')
        row = [i.text.strip() for i in row_data][:2]
        length = len(mydata)
        mydata.loc[length] = row

    #Create dataframe
    df=mydata.drop_duplicates(['slang'])
    return df.reset_index().drop(['index'], axis=1)

slang_social=scrape_from_slangit(url='https://slangit.com/terms/social_media-all')
slang_social

Unnamed: 0,slang,meaning
0,Meekd,Suppressed
1,LOL,Laughing out loud
2,mk,Okay
3,IRL,In real life
4,TTYL,Talk to you later
...,...,...
1361,FHO,Friends Hanging Out
1362,Gawd,God
1363,FIFAWWC,FIFA Women's World Cup
1364,DTN,Don't trust no one


In [27]:
def scrape_from_netlingo(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    slang_meanings =  []

    for span in soup.find_all('span'):
        slang = span.find('a')
        meaning = span.find_next_sibling('div', class_='explain')
        if slang and meaning:
            slang_meanings.append((slang.text.strip(), meaning.text.strip()))
    
    df = pd.DataFrame(slang_meanings, columns=['slang', 'meaning'])
    return df

df = scrape_from_netlingo("https://www.netlingo.com/acronyms.php")
df

Unnamed: 0,slang,meaning
0,!,I have a comment
1,#FF,Follow Friday
2,(U),"it means arms around you, hug for you"
3,*$,Starbucks
4,**//,"it means wink wink, nudge nudge"
...,...,...
2883,ZZZ,"Sleeping, Bored, Tired"
2884,\M/,Heavy Metal Music
2885,^5,High Five
2886,^RUP^,Read Up Please


In [42]:
slang_df = pd.concat([slang_social, df], ignore_index=True)
slang_df

Unnamed: 0,slang,meaning
0,Meekd,Suppressed
1,LOL,Laughing out loud
2,mk,Okay
3,IRL,In real life
4,TTYL,Talk to you later
...,...,...
4249,ZZZ,"Sleeping, Bored, Tired"
4250,\M/,Heavy Metal Music
4251,^5,High Five
4252,^RUP^,Read Up Please


In [43]:
slang_df = slang_df.drop_duplicates(subset=['slang']).reset_index(drop=True)

In [9]:
# slang_df.to_csv('slang_df.csv')

In [4]:
# slang_df = slang_df.drop(columns=['Unnamed: 0'])
# slang_df

Unnamed: 0,slang,meaning
0,-core,aesthetic
1,143,i love you
2,1437,i love you forever
3,14643,i will always love you
4,2,to
...,...,...
3961,^5,high five
3962,^rup^,read up please
3963,^urs,up yours
3964,yup,yes


In [44]:
slang_df = slang_df[slang_df['slang'].str.contains('cunt', na=False, case=False) == False]

In [45]:
slang_df = slang_df[slang_df['meaning'].str.contains('not online', na=False, case=False) == False]

In [46]:
slang_df = slang_df[slang_df['meaning'].str.contains('hey', na=False, case=False) == False]

In [49]:
new_slang = {'slang': 'x', 'meaning': 'kiss'}
new_slang_df = pd.DataFrame([new_slang])

In [50]:
new_slang = {'slang': 'idk', 'meaning': 'i do not know'}
new_slang_df = pd.DataFrame([new_slang])

In [52]:
new_slang = {'slang': 'yup', 'meaning': 'yes'}
new_slang_df = pd.DataFrame([new_slang])

In [53]:
slang_df = pd.concat([slang_df, new_slang_df], ignore_index=True)
slang_df

Unnamed: 0,slang,meaning
0,Meekd,Suppressed
1,LOL,Laughing out loud
2,mk,Okay
3,IRL,In real life
4,TTYL,Talk to you later
...,...,...
3992,^RUP^,Read Up Please
3993,^URS,Up Yours
3994,x,kiss
3995,idk,i do not know


In [54]:
slang_df = slang_df.drop_duplicates(subset='slang')

In [55]:
slang_df = slang_df[slang_df['slang'] != 'of']

In [105]:
# slang_df=slang_df.sort_values(by=['slang']).reset_index(drop=True)
# slang_df

In [56]:
slang_df['slang'] = slang_df['slang'].apply(lambda x: str(x).lower())
slang_df

Unnamed: 0,slang,meaning
0,meekd,Suppressed
1,lol,Laughing out loud
2,mk,Okay
3,irl,In real life
4,ttyl,Talk to you later
...,...,...
3992,^rup^,Read Up Please
3993,^urs,Up Yours
3994,x,kiss
3995,idk,i do not know


In [57]:
slang_df['meaning'] = slang_df['meaning'].apply(lambda x: str(x).lower())
slang_df

Unnamed: 0,slang,meaning
0,meekd,suppressed
1,lol,laughing out loud
2,mk,okay
3,irl,in real life
4,ttyl,talk to you later
...,...,...
3992,^rup^,read up please
3993,^urs,up yours
3994,x,kiss
3995,idk,i do not know


In [25]:
# slang_df['filter'] = slang_social['slang'].apply(lambda x: len(str(x).split()))
# slang_df

Unnamed: 0,slang,meaning,filter
0,meekd,suppressed,1.0
1,lol,laughing out loud,1.0
2,mk,okay,1.0
3,irl,in real life,1.0
4,ttyl,talk to you later,1.0
...,...,...,...
3992,^rup^,read up please,
3993,^urs,up yours,
3994,x,kiss,
3995,idk,i do not know,


In [58]:
slang_df['meaning'] = slang_df['meaning'].apply(lambda x: x.replace('f***', "fuck").replace('s***', "shit").replace('d***','damn').replace('a**','ass'))

In [59]:
slang_df.to_csv('slang_df_final.csv')

In [60]:
SLANG_SOCIAL = slang_df.set_index('slang').T.to_dict('records')[0]
len(SLANG_SOCIAL.keys())

  SLANG_SOCIAL = slang_df.set_index('slang').T.to_dict('records')[0]


3957

In [61]:
with open("SLANG_SOCIAL.pkl", "wb") as fw:
    pickle.dump(SLANG_SOCIAL, fw)

## Profane 

https://www.cs.cmu.edu/∼biglou/resources/bad-words.txt

https://code.google.com/archive/p/badwordslist/downloads

https://github.com/chucknorris-io/swear-words/tree/master?tab=readme-ov-file

In [28]:
# Combining the text from the two files into one, ensuring no duplicates and sorting the list alphabetically

file1_path = '/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/bad-words.txt'
file2_path = '/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/badwords.txt'

# Reading the contents of the first file
with open(file1_path, 'r', encoding='utf-8') as file:
    bad_words1 = file.read().split('\n')

# Reading the contents of the second file
with open(file2_path, 'r', encoding='utf-8') as file:
    bad_words2 = file.read().split('\n')

bad_words1_lower = [word.lower() for word in bad_words1]
bad_words2_lower = [word.lower() for word in bad_words2]
    
# Combining the lists, removing duplicates, and sorting
combined_bad_words = sorted(set(bad_words1_lower + bad_words2_lower))

# Saving the combined list to a new file
combined_file_path = '/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/badwords_list.txt'
with open(combined_file_path, 'w', encoding='utf-8') as file:
    file.write('\n'.join(combined_bad_words))

combined_file_path

'/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/badwords_list.txt'