# Basic Analysis

In [None]:
import numpy as np
import pandas as pd 

from sklearn import feature_extraction, linear_model, model_selection, preprocessing

import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Not a disaster tweet
# (the second training tweet whose target is 0)

train[train['target'] == 0]['text'].values[1]

In [None]:
# a disaster tweet
# (the second training tweet whose target is 1)

train[train['target'] == 1]['text'].values[1]

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Bag-of-words vectorizer with scikit-learn's default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

# Demo only: fit on the first five tweets to see what the count vectors look like.
ex_train_vectors = count_vectorizer.fit_transform(train['text'][0:5])

In [None]:
# Shape of the dense count vector of the first tweet from the five-tweet demo fit.
print(ex_train_vectors[0].todense().shape)

In [None]:
# Re-fit on every training tweet; this replaces the vocabulary learned by the demo fit above.
train_vectors = count_vectorizer.fit_transform(train['text'])

# Only transform (never fit) the test tweets so they are mapped onto the training vocabulary.
test_vectors = count_vectorizer.transform(test['text'])

In [None]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression
## is a good way to do this.

clf = linear_model.RidgeClassifier()

In [None]:
# 3-fold cross-validated F1 score of the ridge classifier on the training count vectors.
scores = model_selection.cross_val_score(clf, train_vectors, train['target'], cv=3, scoring='f1')
scores

In [None]:
# Fit on the whole training set before predicting the test tweets.
clf.fit(train_vectors, train['target'])

In [None]:
# Load the submission template; its 'target' column is overwritten with predictions below.
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
# Predict a class for every test tweet.
sample_submission['target'] = clf.predict(test_vectors)

In [None]:
sample_submission.head()

In [None]:
# Write the predictions to disk without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)

# Second Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style('whitegrid')
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer accept the old 'seaborn' name, so prefer the new name
# and fall back to the old one only on installs that do not list it.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## Representing text as numerical data

In [None]:
# example text for model training (SMS messages)
simple_train = ['call you tonight', 'Call me a cab', 'Please call me... PLEASE!']

##### We will use CountVectorizer to "convert text into a matrix of token counts"

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [None]:
# learn the 'vocabulary' of the training data

vect.fit(simple_train)

In [None]:
# examine the fitted vocabulary

vect.get_feature_names()

### what is 'document-term matrix' 

- reference : https://wikidocs.net/24559, https://omicro03.medium.com/%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC-nlp-7%EC%9D%BC%EC%B0%A8-term-document-matrix-tdm-f959ce229ade
- A 'document-term matrix' is a matrix expressing the frequency of each word appearing in multiple documents.
- DTM is meaningful in that it can be quantified so that documents can be compared with each other. 

In [None]:
# transform training data into a 'document-term matrix'

simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

In [None]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

In [None]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

In [None]:
# check the type of the document-term matrix
type(simple_train_dtm)

In [None]:
# examine the sparse matrix contents
print(simple_train_dtm)

In [None]:
# example text for model testing
simple_test = ["please don't call me"]

##### In order to make a prediction, the new observation must have the same features as the training observations, both in number and meaning.

- so we must not do 'vect.fit(simple_test)'
- just do 'vect.transform(simple_test)'!!

In [None]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

In [None]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns = vect.get_feature_names())

> Summary:
- vect.fit(train) learns the vocabulary of the training data
- vect.transform(train) uses the fitted vocabulary to build a document-term matrix from the training data
- vect.transform(test) uses the fitted vocabulary to build a document-term matrix from the testing data **(and ignores tokens it hasn't seen before)**

## Reading a text-based dataset into pandas

In [None]:
# Read the training tweets and, in the same chain, discard every column that
# contains at least one missing value.
tweets = (
    pd.read_csv('../input/nlp-getting-started/train.csv')
    .dropna(how='any', axis=1)
)
tweets.head()

- The dropna function removes rows or columns with NaN values from the DataFrame; with `axis=1`, as used here, it removes columns.
- how = 'any' -> Drop even if there is only one NaN value in row or column (default!)


## Exploratory Data Analysis (EDA)

In [None]:
tweets.groupby('target').describe()

We have 4342 **not a disaster** tweets and 3271 **real disaster** tweets.

In [None]:
# Character count of every tweet, stored next to the text it was computed from.
tweets['text_len'] = tweets['text'].str.len()
tweets.head()

In [None]:
# Overlay the tweet-length histograms of both classes on one set of axes.
fig, ax = plt.subplots(figsize=(12, 8))

disaster_lengths = tweets.loc[tweets['target'] == 1, 'text_len']
other_lengths = tweets.loc[tweets['target'] == 0, 'text_len']

disaster_lengths.plot.hist(ax=ax, bins=30, color='blue', label='real a disaster', alpha=0.7)
other_lengths.plot.hist(ax=ax, bins=30, color='red', label='not a disaster', alpha=0.3)

ax.legend()
ax.set_xlabel('text length')

> There is no clear difference in text length between 'real disaster' and 'not a disaster' tweets.

In [None]:
tweets[tweets['target'] == 1].describe()

In [None]:
tweets[tweets['target'] == 0].describe()

## Text Pre-Processing

In [None]:
import string
from nltk.corpus import stopwords

# Informal tokens dropped in addition to NLTK's English stop words.
_EXTRA_STOPWORDS = ['u', 'û', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']


def _default_stopwords():
    """Return the default stop-word set (NLTK English + extras), built only on first use."""
    cached = getattr(_default_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english') + _EXTRA_STOPWORDS)
        _default_stopwords._cache = cached
    return cached


def text_process(texts, stop_words=None):
    """Strip punctuation and stop words from one piece of text.

    Parameters
    ----------
    texts : str
        The raw text to clean.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, NLTK's English stop words plus
        a few informal tokens ('u', 'ur', 'im', ...) are used; that default set
        is built once and reused instead of being rebuilt on every call.

    Returns
    -------
    str
        The remaining words, in their original case, joined by single spaces
        (a string, not a list).
    """
    if stop_words is None:
        stop_words = _default_stopwords()
    else:
        # A set gives constant-time membership tests whatever iterable was passed.
        stop_words = frozenset(stop_words)

    # Punctuation is removed before splitting, so "don't" becomes "dont".
    nopunc = ''.join(char for char in texts if char not in string.punctuation)

    # Matching is case-insensitive, but surviving words keep their case.
    return ' '.join(word for word in nopunc.split() if word.lower() not in stop_words)



# text_process takes in a string of text, then performs the following:
# 1. Removes all punctuation
# 2. Removes all stopwords
# 3. Returns the cleaned text as a single space-separated string

In [None]:
tweets.head()

> Now let's "tokenize" these texts. Tokenization is just the term used to describe the process of converting the normal text strings into a list of tokens (words that we actually want). Here the kept tokens are joined back into a single cleaned string per tweet.

In [None]:
tweets['clean_text'] = tweets.text.apply(text_process)

In [None]:
tweets.head()

In [None]:
type(stopwords.words('english'))

In [None]:
from collections import Counter

# One list of lower-cased tokens per cleaned non-disaster tweet.
words = tweets.loc[tweets.target == 0, 'clean_text'].str.lower().str.split()

# Tally every token across all of those tweets.
not_disaster_words = Counter(token for tokens in words for token in tokens)

print(not_disaster_words.most_common(50))

In [None]:
from collections import Counter

# One list of lower-cased tokens per cleaned disaster tweet.
words = tweets.loc[tweets.target == 1, 'clean_text'].str.lower().str.split()

# Tally every token across all of those tweets.
disaster_words = Counter(token for tokens in words for token in tokens)

print(disaster_words.most_common(50))

#### Display WordCloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

def displayWordCloud(data = None, backgroundcolor = 'white', width=800, height=600 ):
    """Draw a word cloud of `data` in a 15x10-inch figure with the axes hidden.

    data            -- the text to build the cloud from
    backgroundcolor -- background colour of the cloud image
    width, height   -- size of the generated cloud image

    NLTK's English stop words plus a few informal tokens are excluded.
    """
    ignored_words = stopwords.words('english')  + ['u', 'û', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
    cloud_builder = WordCloud(
        background_color=backgroundcolor,
        stopwords=ignored_words,
        width=width,
        height=height,
    )
    cloud = cloud_builder.generate(data)

    fig, ax = plt.subplots(figsize=(15, 10))
    ax.imshow(cloud)
    ax.axis("off")
    plt.show()

In [None]:
# not_disaster_words_WORDCLOUD

%time displayWordCloud(' '.join(not_disaster_words))

In [None]:
# disaster_words_WORDCLOUD

%time displayWordCloud(' '.join(disaster_words))

## Vectorization

In [None]:
# Features are the cleaned tweet texts; labels are the 0/1 target column.
X = tweets.clean_text
y = tweets.target

print(X.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the labelled data for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams and bigrams, minus scikit-learn's English stop words; terms found in
# more than half of the documents or in fewer than two documents are ignored.
vect = CountVectorizer(stop_words='english', ngram_range = (1, 2), max_df = 0.5, min_df = 2)


# (alternative: a vectorizer with default settings would be CountVectorizer())
# Learn the vocabulary from the training split only.
vect.fit(X_train)

In [None]:
# Build the training document-term matrix with the fitted vocabulary.
X_train_dtm = vect.transform(X_train)

In [None]:
# equivalently: combine fit and transform into a single step

X_train_dtm = vect.fit_transform(X_train)

In [None]:
X_train_dtm

In [None]:
# Transform only (no fit) so the held-out texts use the training vocabulary.
X_test_dtm = vect.transform(X_test)
X_test_dtm

> TF-IDF

- TF (term frequency) is a value that indicates how often a specific word appears in a document. The higher this value, the more important the word can be considered within that document. However, if the word is also used frequently across the whole collection of documents, it is simply a common word and tells us little about any single document. The number of documents that contain the word is called DF (document frequency), and the inverse of this value is called IDF (inverse document frequency). TF-IDF is the product of TF and IDF.

- The IDF value is determined according to the nature of the document group. For example, the word 'atom' does not appear in general documents, so the IDF value increases and it can become a key word for documents. Other words that can be subdivided and distinguished are given higher weight.

- The inverse document frequency (IDF) is a value that indicates how rare a word is throughout a set of documents. It can be obtained by dividing the total number of documents by the number of documents containing the word and then taking the log: $\mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)}$.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Demonstration: learn IDF weights from the training counts, then display the
# re-weighted matrix. The result is not stored in a variable.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)

## Building and evaluating a model

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# Train on the training document-term matrix and report how long it takes.
%time nb.fit(X_train_dtm, y_train)

In [None]:
# Predicted class for every held-out tweet.
y_pred_class = nb.predict(X_test_dtm)

In [None]:
from sklearn import metrics
# Fraction of held-out tweets classified correctly.
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
X_test.shape

In [None]:
# print text for false positives

# [Not a disaster tweet] incorrectly classified as a disaster
# (predicted 1 > actual 0)
X_test[y_pred_class > y_test]

In [None]:
# print text for false negatives

# [real disaster tweet] incorrectly classified as not a disaster
# (predicted 0 < actual 1)
X_test[y_pred_class < y_test]

In [None]:
# Predicted probability of the second class column (index 1) for every held-out tweet.
y_pred_prob = nb.predict_proba(X_test_dtm)[:,1]
y_pred_prob

In [None]:
# Area under the ROC curve, computed from the probabilities rather than the hard labels.
metrics.roc_auc_score(y_test, y_pred_prob)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Chain bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes so
# that raw text can be passed straight to fit/predict.
pipe1 = Pipeline([('bow', CountVectorizer(stop_words='english', ngram_range = (1, 2), max_df = 0.5, min_df = 2)),
                ('tfid', TfidfTransformer()),
                ('model', MultinomialNB())])
pipe1.fit(X_train, y_train)

In [None]:
# Predict on the held-out texts; the pipeline vectorizes them itself.
y_pred = pipe1.predict(X_test)

In [None]:
metrics.accuracy_score(y_test, y_pred)

In [None]:
metrics.confusion_matrix(y_test, y_pred)

##### REAL prediction1 & submission1

In [None]:
# Reload the competition test set and clean its text with the same function used for training.
test = pd.read_csv('../input/nlp-getting-started/test.csv')
test['clean_text'] = test.text.apply(text_process)
test_clean_text = test['clean_text']

In [None]:
# Re-fit the pipeline on all labelled tweets (not just the training split) before predicting.
pipe1.fit(X, y)
prediction1 = pipe1.predict(test_clean_text)

In [None]:
# Load the submission template; its 'target' column is overwritten below.
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
sample_submission['target'] = prediction1

In [None]:
sample_submission.head()

In [None]:
# NOTE(review): this writes the same 'submission.csv' path as the other submission cells in this notebook.
sample_submission.to_csv('submission.csv', index=False)

## Comparing Models
> Multinomial Naive Bayes vs logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')

In [None]:
# Train on the same training document-term matrix used for Naive Bayes and time it.
%time logreg.fit(X_train_dtm, y_train)

In [None]:
# Overwrites the Naive Bayes predictions held in y_pred_class.
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
# Predicted probability of the second class column (index 1); overwrites the Naive Bayes values.
y_pred_prob = logreg.predict_proba(X_test_dtm)[:,1]
y_pred_prob

In [None]:
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
metrics.roc_auc_score(y_test, y_pred_prob)

##### REAL prediction2 & submission

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Same vectorizer and TF-IDF steps as pipe1, with logistic regression as the final model.
pipe2 = Pipeline([('bow', CountVectorizer(stop_words='english', ngram_range = (1, 2), max_df = 0.5, min_df = 2)),
                ('tfid', TfidfTransformer()),
                ('model', LogisticRegression(solver='liblinear'))])

In [None]:
# Fit on all labelled tweets, then predict the cleaned competition test tweets.
pipe2.fit(X, y)
prediction2 = pipe2.predict(test_clean_text)

In [None]:
# Replaces the predictions previously stored in the 'target' column.
sample_submission['target'] = prediction2

In [None]:
# NOTE(review): this writes the same 'submission.csv' path as the earlier submission cells.
sample_submission.to_csv('submission.csv', index=False)

## Tuning the vectorizer

In [None]:
# Each cell below rebinds `vect` to show a single tuning option in isolation.
# remove scikit-learn's built-in English stop words
vect = CountVectorizer(stop_words='english')

In [None]:
# use both single words (unigrams) and word pairs (bigrams) as features
vect = CountVectorizer(ngram_range = (1, 2))

In [None]:
# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df = 0.5)

In [None]:
# only keep terms that appear in at least 2 documents
vect = CountVectorizer(min_df=2)