In [3]:
# Import basic libraries
import pandas as pd
import os


In [4]:
# Load the train and test splits from the local `text/` directory.
train_set, test_set = (
    pd.read_csv(os.path.join('text', csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# Display the training frame as loaded from disk.
train_set

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
# Fill missing `keyword` values with a placeholder.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the column accessor: that chained in-place
# form operates on an intermediate Series, emits a FutureWarning on pandas 2.2
# and no longer updates the parent frame once Copy-on-Write is enabled.
train_set["keyword"] = train_set["keyword"].fillna("unknown")
test_set["keyword"] = test_set["keyword"].fillna("unknown")

In [7]:
# Fill missing `location` values with a placeholder.
# Assign the filled column back rather than using a chained
# `fillna(..., inplace=True)`, which does not reliably update the parent frame
# (FutureWarning on pandas 2.2, no effect under Copy-on-Write).
train_set["location"] = train_set["location"].fillna("UNK")
test_set["location"] = test_set["location"].fillna("UNK")

In [8]:
# Lowercase the tweet text. With BERT this step is not strictly required,
# since case-insensitive (uncased) pretrained models are available.
train_set["text"] = train_set["text"].str.lower()
test_set["text"] = test_set["text"].str.lower()

In [9]:
# 删除HTML标签
# 导入正则表达式包
import re

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own. ``.`` does not
    match a newline here, so a ``<`` / ``>`` pair split across lines is kept.
    """
    return re.sub('<.*?>', '', text)

In [10]:
# Strip HTML tags from the text column of both splits.
train_set["text"] = train_set["text"].apply(remove_html_tags)
test_set["text"] = test_set["text"].apply(remove_html_tags)

In [11]:
# Strip web links.
def remove_url(text):
    """Return ``text`` with ``http://`` / ``https://`` URLs and ``www.`` links removed.

    Each link is matched up to the next whitespace character and replaced by
    the empty string; the whitespace around it is left as it was.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [12]:
# Strip web links from the text column of both splits.
train_set["text"] = train_set["text"].apply(remove_url)
test_set["text"] = test_set["text"].apply(remove_url)

In [13]:
# 删除标点符号
import string

# ASCII punctuation characters to strip.
punc = string.punctuation

# Translation table that deletes every character in `punc`. It is built once
# here instead of on every call, because the function is applied row by row.
_PUNC_TABLE = str.maketrans('', '', punc)


def remove_punc(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by spaces, so tokens that were
    separated only by punctuation are joined (``"e-bike"`` -> ``"ebike"``).
    """
    return text.translate(_PUNC_TABLE)



In [14]:
# Strip punctuation from the text column of both splits.
train_set["text"] = train_set["text"].apply(remove_punc)
test_set["text"] = test_set["text"].apply(remove_punc)

In [15]:
# Display the training frame after the cleaning steps run so far.
train_set

Unnamed: 0,id,keyword,location,text,target
0,1,unknown,UNK,our deeds are the reason of this earthquake ma...,1
1,4,unknown,UNK,forest fire near la ronge sask canada,1
2,5,unknown,UNK,all residents asked to shelter in place are be...,1
3,6,unknown,UNK,13000 people receive wildfires evacuation orde...,1
4,7,unknown,UNK,just got sent this photo from ruby alaska as s...,1
...,...,...,...,...,...
7608,10869,unknown,UNK,two giant cranes holding a bridge collapse int...,1
7609,10870,unknown,UNK,ariaahrary thetawniest the out of control wild...,1
7610,10871,unknown,UNK,m194 0104 utc5km s of volcano hawaii,1
7611,10872,unknown,UNK,police investigating after an ebike collided w...,1


#### 重要内容 Handling ChatWords

In [16]:
# Chat-speak abbreviations (common English internet slang) mapped to their expansions.
# Source list: https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
#
# Changes relative to the source list:
#   * the duplicated "B4N" entry is listed once;
#   * "ILU" and "ICQ" map to the bare expansion (the originals embedded the
#     abbreviation itself / a parenthetical note, which would be inserted into
#     the text);
#   * "TIME", "KISS", "STATS" and "GAL" are omitted: keys are matched
#     case-insensitively against whole tokens, so these entries rewrote the
#     ordinary English words "time", "kiss", "stats" and "gal".
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek You",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [17]:
# Tokenise on whitespace, map abbreviations to their full form, then re-join.
def chat_conversion(text):
    """Expand chat abbreviations found in ``text``.

    ``text`` is split on whitespace. A token whose upper-cased form is a key of
    ``chat_words`` is replaced by the mapped expansion; any other token is kept
    unchanged. The tokens are joined back together with single spaces.
    """
    return " ".join(chat_words.get(token.upper(), token) for token in text.split())

In [18]:
# Expand chat abbreviations in the text column of both splits.
train_set["text"] = train_set["text"].apply(chat_conversion)
test_set["text"] = test_set["text"].apply(chat_conversion)

### 修正拼写错误 (spelling correction)

In [19]:
from textblob import TextBlob

In [20]:
# Two deliberately misspelled sample sentences.
incorrect_text = 'ceeeeerain conditionas duriing seveal ggenerations aree moodified in the saame maner.'
incorrect_text2 = 'The cat sat on the cuchion. while plyaiing'

# Wrap each sample in a TextBlob so that `.correct()` can be called on it.
textBlb = TextBlob(incorrect_text)
textBlb1 = TextBlob(incorrect_text2)

# First sample: show the raw text and the type of the corrected `.string`.
print(incorrect_text)
print(type(textBlb.correct().string))

# Second sample: show the raw text followed by its corrected form.
print(incorrect_text2)
print(textBlb1.correct().string)

ceeeeerain conditionas duriing seveal ggenerations aree moodified in the saame maner.
<class 'str'>
The cat sat on the cuchion. while plyaiing
The cat sat on the cushion. while playing


In [23]:
def text_correct(text):
    """Return the corrected form of ``text`` as a plain ``str``.

    Wraps ``text`` in a ``TextBlob``, calls its ``correct()`` method and
    returns the ``.string`` attribute of the result.
    """
    corrected_blob = TextBlob(text).correct()
    return corrected_blob.string

In [25]:
# Run the TextBlob correction over both splits.
# The recorded run of this cell ended in a KeyboardInterrupt. The right-hand
# tuple is evaluated in full before either column is assigned, so interrupting
# the cell leaves both frames unchanged. With two separate statements an
# interrupt during the second one left train corrected and test not, and a
# re-run would then push the already-corrected train text through again.
train_set["text"], test_set["text"] = (
    train_set["text"].apply(text_correct),
    test_set["text"].apply(text_correct),
)

KeyboardInterrupt: 