In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the training split; the path is relative to the notebook's working directory.
train_data = pd.read_csv("nlp-getting-started/train.csv", encoding="utf-8")
# Print column names, non-null counts and dtypes to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
# Show the first three rows as a quick sanity check of the loaded columns.
train_data.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [4]:
# Summarise only the object-dtype (string) columns: count, unique, top value and its frequency.
train_data.describe(include=np.object_)

Unnamed: 0,keyword,location,text
count,7552,5080,7613
unique,221,3341,7503
top,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...
freq,45,104,10


In [5]:
# Count how often each distinct free-text location string occurs.
train_data["location"].value_counts()

USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: location, Length: 3341, dtype: int64

In [6]:
# Count how often each exact tweet text occurs; counts above 1 reveal duplicated tweets.
train_data["text"].value_counts()

11-Year-Old Boy Charged With Manslaughter of Toddler: Report: An 11-year-old boy has been charged with manslaughter over the fatal sh...        10
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam                      6
The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'                               6
#Bestnaijamade: 16yr old PKK suicide bomber who detonated bomb in ... http://t.co/KSAwlYuX02 bestnaijamade bestnaijamade bestnaijamade beÛ_     6
Madhya Pradesh Train Derailment: Village Youth Saved Many Lives                                                                                  5
                                                                                                                                                ..
Escape The Heat (and the #ORShow) for a trail run on Desolation Loop you'll be glad you did http://t.co/n2ucNzh38P htt

In [7]:
# Peek at 5 randomly sampled tweets; the fixed random_state makes the sample repeatable.
train_data["text"].sample(5, random_state=1234)

5953         @camilacabello97 NOW IM INTERNALLY SCREAMING
692     S3XLEAK!!!\nPh0tos of 19yrs old Ash@wo lady in...
6342    NTSB: Virgin Galactic's SpaceshipTwo crash due...
1019    Micom 2015 Summer Contrast Candy Color Bowknot...
5254    SYD traffic HAZARD Oil spill - BANKSTOWN Stace...
Name: text, dtype: object

In [31]:
import re

def get_url_num(text):
    """Return how many http:// or https:// links occur in ``text``.

    A link is the scheme followed by every character up to the next
    whitespace.
    """
    return sum(1 for _ in re.finditer(r'(https?://[^\s]+)', text))

def get_hash_num(text):
    """Return the number of hashtags in ``text``.

    A hashtag is a '#' followed by at least one letter (a-z, A-Z) or digit;
    the text is lowercased before matching. A lone '#' is not counted.
    """
    lowered = text.lower()
    hashtags = re.findall(r"#[a-zA-Z0-9]+", lowered)
    return len(hashtags)

def get_mentions_num(text):
    """Return the number of @-mentions in ``text``.

    A mention is an '@' followed by one or more letters, digits or
    underscores.
    """
    mention_pattern = r'@[A-Za-z0-9_]+'
    return sum(1 for _ in re.finditer(mention_pattern, text))

def get_words_num(text):
    """Return the number of "words" in ``text``.

    A word here is any maximal run of letters (a-z, A-Z) and underscores, so
    digits and punctuation act as separators.
    """
    word_runs = re.findall(r'[A-Za-z_]+', text)
    return len(word_runs)

def get_digits_num(text):
    """Return how many characters of ``text`` are digits (per ``str.isdigit``)."""
    digit_total = 0
    for ch in text:
        if ch.isdigit():
            digit_total += 1
    return digit_total

def get_characters_num(text):
    """Return how many characters of ``text`` are alphabetic (per ``str.isalpha``).

    Despite the name, digits, whitespace and punctuation are not counted.
    """
    alpha_total = 0
    for ch in text:
        if ch.isalpha():
            alpha_total += 1
    return alpha_total

def encode_html(text):
    """Replace every "%20" escape in ``text`` with a plain space.

    Missing values (anything ``pd.isna`` reports as NA, e.g. NaN or None) are
    returned untouched so the function can be applied to columns with gaps.
    """
    if pd.isna(text):
        return text
    return re.sub(r"%20", " ", text)

def remove_trash(text):
    """Strip tweet noise from ``text`` and lowercase what is left.

    Removes URLs, @-mentions, '#' signs (the hashtag word itself is kept),
    and digits; newlines become a single space.

    Args:
        text: Raw tweet text (``str``).

    Returns:
        The cleaned, lowercased text. Runs of spaces left behind by removed
        pieces are not collapsed.
    """
    # URLs must go first, while whitespace still delimits them: if newlines
    # were removed beforehand, a URL ending a line would fuse with the first
    # word of the next line and the URL pattern would delete that word too.
    text = re.sub(r'(https?://[^\s]+)', "", text)

    text = re.sub(r"#", "", text)
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    # Replace (not delete) newlines so words on adjacent lines stay separate.
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"[0-9]", "", text)

    return text.lower()

In [32]:
# Derive the count features from the raw tweet text *before* it is cleaned
# below, because cleaning strips the URLs, mentions, '#' signs and digits
# that these features count.
feature_extractors = {
    "url_count": get_url_num,
    "mentions_count": get_mentions_num,
    "hash_count": get_hash_num,
    "words_count": get_words_num,
    "digits_count": get_digits_num,
    "characters_count": get_characters_num,
}
for column_name, extractor in feature_extractors.items():
    train_data[column_name] = train_data["text"].apply(extractor)

# Clean the tweet text in place, then turn any "%20" escapes into spaces.
train_data["text"] = train_data["text"].apply(remove_trash).apply(encode_html)

# Keywords only need the "%20" replacement; missing keywords pass through unchanged.
train_data["keyword"] = train_data["keyword"].apply(encode_html)

In [10]:
import geocoder
import spacy

# Load the "en_core_web_sm" spaCy pipeline once; later cells call it through the global `nlp`.
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [51]:
# Running count of non-null locations processed; used as the row id in the log file.
i = 0

# location string -> country code already resolved. Many rows share the same
# location, so this avoids repeating an identical network lookup per row.
_country_cache = {}


def encode_location(location):
    """Resolve a free-text location to a country and append it to a log file.

    For every non-null ``location`` the country is looked up through
    ``geocoder.osm`` (once per distinct string; repeats are served from
    ``_country_cache``), the global counter ``i`` is incremented, a line
    ``"<i>;<country>"`` is appended to ``counries.txt`` and the counter is
    printed as a progress indicator. Locations whose result has no
    ``"addr:country"`` entry are logged as ``"other"``.

    Args:
        location: Location string, or a missing value (NaN/None), which is
            skipped without any lookup or file write.

    Returns:
        ``location`` unchanged; the country is only written to the log file.
    """
    global i

    if pd.isna(location):
        return location

    if location not in _country_cache:
        # The lookup itself stays outside the try block so that errors raised
        # by the request propagate instead of being cached as "other".
        gg = geocoder.osm(location)
        try:
            _country_cache[location] = gg.osm["addr:country"]
        except Exception:
            _country_cache[location] = "other"
    code = _country_cache[location]

    i += 1

    # NOTE(review): "counries.txt" looks like a typo for "countries.txt"; the
    # name is kept because other code may already read this path. The file is
    # opened in append mode, so re-running the cell adds rows with ids
    # restarting at 1.
    with open("counries.txt", "a") as f:
        f.write(f"{i};{code}\n")
    print(i, end="\r")

    return location

In [52]:
# Run encode_location over every location. The resulting Series is not assigned,
# so only the function's side effects (log-file append, progress print) persist.
train_data["location"].apply(encode_location)

2

KeyboardInterrupt: 

In [46]:
from collections import Counter

def keep_k_most_freq(text, k=5):
    """Reduce ``text`` to its ``k`` most frequent alphabetic lemmas.

    The text is processed with the global spaCy pipeline ``nlp``. Tokens that
    are not purely alphabetic are dropped and the remaining lemmas are
    counted case-insensitively, so "Fire" and "fire" count as one word
    instead of showing up twice in the result.

    Args:
        text: Text to summarise (``str``).
        k: Maximum number of lemmas to keep.

    Returns:
        A space-joined string of at most ``k`` lowercase lemmas, most
        frequent first; lemmas with equal counts keep first-seen order.
    """
    lemma_counts = Counter(
        token.lemma_.lower() for token in nlp(text) if token.is_alpha
    )
    return " ".join(lemma for lemma, _ in lemma_counts.most_common(k))

In [47]:
# Overwrite the text column with each tweet's most frequent lemmas (default k=5).
# Not idempotent: re-running this cell reduces the already-reduced text again.
train_data["text"] = train_data["text"].apply(keep_k_most_freq)

In [48]:
from transformers import pipeline

# Display the text column to inspect the result of the lemma reduction.
train_data["text"]

0                         our deed be the reason
1                      forest fire near la ronge
2                        be shelter in place all
3       people receive wildfire evacuation order
4                        from just got send this
                          ...                   
7608                      two giant crane hold a
7609                       the of in out control
7610                         s of volcano hawaii
7611             e bike police investigate after
7612                     the late more home raze
Name: text, Length: 7613, dtype: object