In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


## **load data**

In [73]:
%%time
import numpy as np, pandas as pd, re, nltk, spacy, tqdm
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")
tqdm.tqdm.pandas()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CPU times: user 729 ms, sys: 32 ms, total: 761 ms
Wall time: 759 ms


In [93]:
data_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
data_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
data_train.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


In [94]:
data_test.head(2)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."


In [95]:
print(f"{data_train.shape}\n{data_test.shape}")

(7613, 5)
(3263, 4)


In [96]:
print(
    f"TRAINING:\n{data_train['keyword'].isna().sum()}\n{data_train['location'].isna().sum()}\n{data_train['text'].isna().sum()}\n"
    f"TESTING:\n{data_test['keyword'].isna().sum()}\n{data_test['location'].isna().sum()}\n{data_test['text'].isna().sum()}"
) 

TRAINING:
61
2533
0
TESTING:
26
1105
0


In [97]:
for col in data_train.columns.to_list():
    print(data_train[col].dtype, sep='\n')

int64
object
object
object
int64


In [98]:
data_train = data_train.dropna()

In [99]:
keywords = list()
for item in data_train['keyword']:
    keywords.append(item)
keywords = set(keywords)
print(len(keywords))

221


## **work with text**

In [100]:
def clear_text(text):
    text = re.sub(r'<.*?>', '', text) #for html tags
    text = re.sub(r'https\S+|www\.\S+', '', text) #for urls
    text = re.sub(r'S+@S+', '', text) ## emails
    text = re.sub(r'[^\w\s]', '', text) #for punks and special symbols
    text = text.lower()
    return text

In [101]:
data_train['clear_text'] = data_train['text'].progress_apply(clear_text)
data_test['clear_text'] = data_test['text'].progress_apply(clear_text)

100%|██████████| 5080/5080 [00:00<00:00, 86151.80it/s]
100%|██████████| 3263/3263 [00:00<00:00, 94773.24it/s]


In [102]:
data_train.head(2)

Unnamed: 0,id,keyword,location,text,target,clear_text
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,bbcmtd wholesale markets ablaze httptcolhyxeohy6c
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,we always try to bring the heavy metal rt http...


In [103]:
def preprocessing_text(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    text = [w.lemma_ for w in nlp(text)]
    text = [w for w in text if w not in stop_words]
    return text

In [104]:
data_train['tokenz'] = data_train['clear_text'].progress_apply(preprocessing_text)
data_test['tokenz'] = data_test['clear_text'].progress_apply(preprocessing_text)

100%|██████████| 5080/5080 [00:47<00:00, 106.44it/s]
100%|██████████| 3263/3263 [00:31<00:00, 104.25it/s]


In [105]:
data_train.head(2)

Unnamed: 0,id,keyword,location,text,target,clear_text,tokenz
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,bbcmtd wholesale markets ablaze httptcolhyxeohy6c,"[bbcmtd, wholesale, market, ablaze, httptcolhy..."
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,we always try to bring the heavy metal rt http...,"[always, try, bring, heavy, metal, rt, httptco..."


In [106]:
embedding_model = Word2Vec(
    data_train['tokenz'],
    vector_size=300,
    window=5,
    min_count=2,
    workers=4
)

In [107]:
def embedding_processing(tokenz):
    embeddings = [embedding_model.wv[w] for w in tokenz if w in embedding_model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(embedding_model.vector_size)

In [108]:
data_train['embeddings'] = data_train['tokenz'].progress_apply(embedding_processing)
data_test['embeddings'] = data_test['tokenz'].progress_apply(embedding_processing)

100%|██████████| 5080/5080 [00:00<00:00, 21425.53it/s]
100%|██████████| 3263/3263 [00:00<00:00, 24192.46it/s]


In [109]:
data_train.head(2)

Unnamed: 0,id,keyword,location,text,target,clear_text,tokenz,embeddings
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1,bbcmtd wholesale markets ablaze httptcolhyxeohy6c,"[bbcmtd, wholesale, market, ablaze, httptcolhy...","[-0.028690403, 0.079627074, 0.028395547, 0.030..."
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,we always try to bring the heavy metal rt http...,"[always, try, bring, heavy, metal, rt, httptco...","[-0.049226772, 0.13793224, 0.050026227, 0.0508..."


In [110]:
X_train, y_train = np.stack(data_train['embeddings'].values), data_train['target'].values
X_test = np.stack(data_test['embeddings'].values)

In [111]:
reg = LogisticRegression()
reg.fit(X_train, y_train)

In [112]:
y_pred = reg.predict(X_test)

In [113]:
submission = pd.DataFrame({
    'id' : data_test['id'].values,
    'target' : y_pred
})
submission.to_csv('submission.csv', index=False)