**Real or Not? NLP with Disaster Tweets**<br>
https://www.kaggle.com/c/nlp-getting-started

In [29]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import numba
import os

**1. loading data**

In [34]:
train = pd.read_csv('rawdata/train.csv')
test = pd.read_csv('rawdata/test.csv')


df = train
print("===== training data =====")
print(df)
print(df.isna().sum()/len(df)*100)
print()

df = test
print("===== test data =====")
print(df)
print(df.isna().sum()/len(df)*100)

===== training data =====
         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...      

欠損値が多すぎて, locationが現時点で使えなさそう.

**TF-IDF and Logistic Regression**

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

features = "text"
target = "target"

# prepare vectorizer
vectorizer = TfidfVectorizer() #text => vector
vectorizer.fit(train[features])

X_train = vectorizer.transform(train[features])
X_train = X_train.toarray()
Y_train = train[target]

# train model
model = LogisticRegression()
model.fit(X_train,Y_train)

# predict
data = pd.concat([train,test],axis=0)
X = vectorizer.transform(data[features]).toarray()
pred = model.predict(X)

# concat to submit file
data["target"] = pred

# load submission file
submission = pd.read_csv(os.path.join("rawdata","sample_submission.csv"))

# my submission file
data = data.set_index(data["id"])
data = data.sort_index()

mysub = data.iloc[submission["id"]]
mysub[["id","target"]].to_csv(os.path.join("rawdata","my_submission.csv"),index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




**TF-IDF and XGboost Classifier**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

features = "text"
target = "target"

# prepare vectorizer
vectorizer = TfidfVectorizer() #text => vector
vectorizer.fit(train[features])

X_train = vectorizer.transform(train[features])
X_train = X_train.toarray()
Y_train = train[target]

# train model
model = xgb.XGBClassifier()
model.fit(X_train,Y_train)

# predict
data = pd.concat([train,test],axis=0)
X = vectorizer.transform(data[features]).toarray()
pred = model.predict(X)

# concat to submit file
data["target"] = pred

# load submission file
submission = pd.read_csv(os.path.join("rawdata","sample_submission.csv"))

# my submission file
data = data.set_index(data["id"])
data = data.sort_index()

mysub = data.iloc[submission["id"]]
mysub[["id","target"]].to_csv(os.path.join("rawdata","my_submission.csv"),index=False)

**3. Clean Data**  - So we have a baseline score of 79% to work with , let's get to clean data and see if we can improve the score

As first step in cleaning - let us replace some commonly occuring shorthands 

In [21]:
def clean_text(text):
    import re
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"i'll", "i will", text)
    text = re.sub(r"she'll", "she will", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"here's", "here is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    return text


df_train['clean_text'] = df_train['text'].apply(clean_text)
df_test['clean_text'] = df_test['text'].apply(clean_text)