## Step 1. Frame The Problem

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install nltk
!pip install konlpy




## Step 2. Import Necessary Datasets

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

import string
import nltk
import re # <- 정규 표현식 re.compile 사용을 위해 라이브러리 사용
from nltk.corpus import stopwords # I, AM, IS, NOT, AN, A 등 수량이나 중요하지 않은 문법적 용어 제거
from nltk.tokenize import word_tokenize # 영어 문장을 단어별로 나눠주기 위해 사용 
from nltk.stem import PorterStemmer # 단어를 원형으로 표현하기 위해, Eating -> eat, ate -> eat 바꿔줌

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tempe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tempe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [7]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [8]:
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [9]:
train = train.drop(columns = ['keyword','location'])
train.dtypes

id         int64
text      object
target     int64
dtype: object

In [10]:
train

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,Police investigating after an e-bike collided ...,1


### Analyse the Data

In [11]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7613 non-null   int64 
 1   text    7613 non-null   object
 2   target  7613 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 178.6+ KB


In [13]:
train.isnull().sum()

id        0
text      0
target    0
dtype: int64

In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [15]:
test.sample(10)

Unnamed: 0,id,keyword,location,text
1853,6250,hijacking,Georgia,#hot Funtenna: hijacking computers to send da...
529,1734,buildings%20burning,,forestservice : RT dhsscitech: #Firefighters r...
2571,8573,screams,,@melodores @Hozier *SCREAMS*
119,387,annihilation,,Stop the Annihilation of the Salt River Wild H...
1412,4651,engulfed,"Hagerstown, MD 21742",Why are you engulfed by low self-image? Take t...
208,676,attack,,#People #Tilly the #Confused Cat Overcomes Hor...
2451,8192,riot,"Portland, OR",@CHold ironically RSL call their stadium the Riot
2145,7182,mudslide,St Albans,@curryspcworld looks like a mudslide. Unreal s...
2482,8293,rubble,London,Outrage as dog found buried alive in rubble ht...
77,257,ambulance,Happily Married with 2 kids,AMBULANCE SPRINTER AUTOMATIC FRONTLINE VEHICLE...


In [16]:
test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [17]:
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [18]:
print(f"Tweets that does not contain information about disaster :\n\n {train.loc[train['target']==0,'text'][0:15].values}", end="\n")

print()

print(f"Tweets that contains information about disaster :\n\n {train.loc[train['target']==1,'text'][0:15].values}", end="\n")

Tweets that does not contain information about disaster :

 ["What's up man?" 'I love fruits' 'Summer is lovely' 'My car is so fast'
 'What a goooooooaaaaaal!!!!!!' 'this is ridiculous....'
 'London is cool ;)' 'Love skiing' 'What a wonderful day!' 'LOOOOOOL'
 "No way...I can't eat that shit" 'Was in NYC last week!'
 'Love my girlfriend' 'Cooool :)' 'Do you like pasta?']

Tweets that contains information about disaster :

 ['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'
 'Forest fire near La Ronge Sask. Canada'
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
 '13,000 people receive #wildfires evacuation orders in California '
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '
 '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires'
 '#flood #disaster Heavy rain causes flash flo

### Working with Columns

In [19]:
def preprocessing(text):
    text = text.lower() # <- 단어를 소문자로
    pattern = re.compile('[^a-z]') # 특수문자&숫자 삭제
    words = nltk.word_tokenize(text) # 
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = [PorterStemmer().stem(word) for word in words if word.lower() not in stop_words] 
    preprocessed_text = ' '.join(words)
    return preprocessed_text

train

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,Police investigating after an e-bike collided ...,1


In [20]:
train['text_cleaned']=train['text'].apply(preprocessing) # aopply를 사용이 가능한 것은 train이 지금 dataframe 객체이기 때문에
test['text']=test['text'].apply(preprocessing)

In [21]:
train

Unnamed: 0,id,text,target,text_cleaned
0,1,Our Deeds are the Reason of this #earthquake M...,1,deed reason # earthquak may allah forgiv us
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask . canada
2,5,All residents asked to 'shelter in place' are ...,1,resid ask 'shelter place ' notifi offic . evac...
3,6,"13,000 people receive #wildfires evacuation or...",1,"13,000 peopl receiv # wildfir evacu order cali..."
4,7,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi # alaska smoke # wildfir p...
...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1,two giant crane hold bridg collaps nearbi home...
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1,@ aria_ahrari @ thetawniest control wild fire ...
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m1.94 [ 01:04 utc ] ? 5km volcano hawaii . htt...
7611,10872,Police investigating after an e-bike collided ...,1,polic investig e-bik collid car littl portug ....


In [22]:
x=train['text_cleaned'].values
y=train['target'].values

## Step 4. Feature Engineering

TFidVectorizer를 사용할텐데 CountVectorizer를 통해 자연어를 벡터화하는 경우 발생할 수 있는 문제점(의미 없이 자주 사용되는 단어의 가중치의 증가 등)을 해결하기 위한 방법 중 하나가 TfidfVectorizer다.

먼저 tf와 idf를 이해해야하는데, tf(Term Frequencey)로써 하나의 문서(문장)에서 특정 단어가 등장하는 횟수이며, 

idf(Inverse Document Frequency) 혹은 df(Document Frequency)는 문서 빈도, 특정 단어가 몇 개의 문서에서 등장하는지를 수치화 한 것, 그것의 역수는 idf, 

역수를 사용하는 이유는, 적은 문서(문장)에 등장할수록 큰 숫자가 되게하고, 반대로 많은 문서에 등장할수록 숫자를 작아지게 함으로써 여러 문서(문장)에 의미 없이 사용되는 단어의 가중치를 줄이기 위해서

In [23]:
# Vectorizing the data using TF-IDF Vectorizer
classifier=TfidfVectorizer()
x=classifier.fit_transform(x)

왜 fit_transform()은 training data에서만 사용할까? & 왜 transform()은 test data에서만 사용할까?

fit()<- 은 데이터를 학습시키는 메서드이고, transform()은 실제로 학습시킨 것을 적용하는 매서드임

테스트 데이터에 fit_transform()을 사용할 경우, 트레인 데이터에서 배운 특징들을 다 무시하고, 테스트 데이터에서 새로운 mean 과 variance를 학습해버림

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.10,random_state=27,stratify=y)

## Step 5. Model Selection

In [25]:
logReg=LogisticRegression(penalty='l2')
logReg.fit(x_train,y_train)

In [26]:
R=logReg.predict(x_train)
accuracy_score(y_train,R)

0.8851262589403007

In [27]:
from sklearn.svm import SVC
svc_model=SVC()
svc_model.fit(x_train,y_train)