In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test CSVs, using the first column of each file as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
)


In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Summary statistics for the numeric columns of the training frame.
train.describe()

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Remove the 'keyword' and 'location' columns from both frames, then preview the result.
unused_columns = ['keyword', 'location']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)
train.head()

In [None]:
# check duplicates: count rows whose column values repeat an earlier row
# (DataFrame.duplicated compares columns only; the index is not considered)
train.duplicated().sum()

In [None]:
# remove duplicates, keeping the first occurrence of each repeated row
train = train.drop_duplicates()

# Show the row count, dtypes and non-null counts that remain.
train.info()

In [None]:
# (rows, columns) of the training frame at this point in the notebook.
train.shape

In [None]:
#import library
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style, then matplotlib's "fivethirtyeight" style sheet.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [None]:
# Class balance of the training labels: one bar per value of `target`.
# Drawn on an explicit Axes with a title and axis label so the figure can be
# read on its own when the notebook is skimmed.
disaster_count = (train.target == 1).sum()
not_disaster_count = (train.target == 0).sum()

fig, ax = plt.subplots()
ax.bar(['Disaster', 'NotDisaster'], [disaster_count, not_disaster_count])
ax.set_title('Training rows per target class')
ax.set_ylabel('Number of rows')
plt.show()

In [None]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
def clean_text(text):
    """Normalise a Series of raw strings into lower-case, placeholder-substituted text.

    Steps, in order: lower-case; replace e-mail addresses, URLs, currency
    symbols, 10-digit phone numbers and any remaining numbers with placeholder
    words; turn punctuation into spaces; collapse runs of whitespace; strip
    leading and trailing whitespace.

    Parameters
    ----------
    text : pandas.Series
        String-valued Series to clean.

    Returns
    -------
    pandas.Series
        A new Series with the same index as `text`; the input is not modified.
    """
    # (pattern, replacement) pairs, applied in this order. The patterns match
    # tokens anywhere inside the text rather than requiring the whole string
    # to be a single e-mail address / URL / phone number.
    substitutions = [
        # Replace email address with 'emailaddress'
        (r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)*\.[a-z]{2,}\b', 'emailaddress'),
        # Replace urls (http and https) with 'webaddress'
        (r'https?://\S+', 'webaddress'),
        # Replace money symbol with 'money-symbol'
        (r'£|\$', 'money-symbol'),
        # Replace 10 digit phone number with 'phone-number'; the look-arounds
        # stop it from matching inside a longer run of digits
        (r'(?<!\d)\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{4}(?!\d)', 'phone-number'),
        # Replace normal number with 'number'
        (r'\d+(?:\.\d+)?', 'number'),
        # Replace punctuation with a space
        (r'[^\w\s]', ' '),
        # Collapse whitespace between terms into a single space
        (r'\s+', ' '),
    ]

    # Lower-case first so the [a-z] part of the e-mail pattern also matches
    # addresses written in upper case.
    cleaned = text.str.lower()
    for pattern, replacement in substitutions:
        # regex=True is required: pandas >= 2.0 treats the pattern as a
        # literal string by default.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # Remove leading and trailing whitespace.
    return cleaned.str.strip()


# Clean both frames the same way: test['text'] is later transformed with the
# vectorizer fitted on the cleaned training text, so it must be cleaned too.
train['text'] = clean_text(train['text'])
test['text'] = clean_text(test['text'])

In [None]:
nltk.download('popular')
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` without the whitespace-separated terms found in `stop_words`.

    Terms are compared in lower case, so capitalised stop words are removed
    as well. Surviving terms keep their original form and are re-joined with
    single spaces.
    """
    return ' '.join(term for term in text.split() if term.lower() not in stop_words)


# Filter both frames: test['text'] is later transformed with the vectorizer
# fitted on the filtered training text.
train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)

In [None]:
from nltk.stem import PorterStemmer , LancasterStemmer
ss = nltk.SnowballStemmer("english")


def stem_text(text):
    """Return `text` with every whitespace-separated term replaced by its Snowball stem."""
    return ' '.join(ss.stem(term) for term in text.split())


# Stem both frames: the TF-IDF vocabulary is fitted on stemmed training text,
# so test['text'] has to be stemmed as well for its terms to match.
train['text'] = train['text'].apply(stem_text)
test['text'] = test['text'].apply(stem_text)

In [None]:
def dictionary(check):
    """Build a word-frequency table from a Series of strings.

    Every maximal run of ASCII letters (a-z, A-Z) in each string counts as one
    word; words are counted case-sensitively across the whole Series.

    Parameters
    ----------
    check : pandas.Series
        String-valued Series to scan.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word, indexed by the word (index name 'word'),
        with a single 'freq' column, sorted by descending frequency.
    """
    # The character class must be [a-zA-Z]; the previous '[a-zA_Z]' only
    # matched a-z, 'A', '_' and 'Z', which split words at other capitals.
    words = check.str.extractall(r'([a-zA-Z]+)')
    words.columns = ['check']
    counts = words.reset_index(drop=True)['check'].value_counts()

    frequencies = pd.DataFrame(
        {'freq': counts.values},
        index=pd.Index(counts.index, name='word'),
    )
    return frequencies.sort_values('freq', ascending=False)

# Count word frequencies in the processed training text and chart the 20 most frequent words.
dictionary_clean = dictionary(train['text'])
top_words = dictionary_clean[:20]
top_words.plot(kind='barh', figsize=(10, 10))

In [None]:
# Share of each target class in the training frame, as a percentage rounded to 2 decimals.
class_counts = train['target'].value_counts()
class_percent = class_counts / train.shape[0] * 100
pd.DataFrame(class_percent).round(2)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold,GridSearchCV
# Hold out 30% of the labelled rows, stratified on the target so both splits
# keep the same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train.text,
    train.target,
    test_size=0.3,
    stratify=train.target,
    random_state=1672,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features: word-level analyzer, scikit-learn's built-in English
# stop-word list, and a token pattern that keeps runs of one or more word characters.
tfidf = TfidfVectorizer(analyzer='word', stop_words='english', token_pattern=r'\w{1,}')
# Fit on the training split only, then reuse the fitted vectorizer for the
# hold-out split.
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_test)
# NOTE: this rebinds `test` from the DataFrame read from test.csv to its
# transformed feature matrix; the later clf.predict(test) calls rely on that.
# NOTE(review): re-running this cell is expected to fail because the rebound
# object is no longer a DataFrame with a `.text` column — confirm before renaming.
test = tfidf.transform(test.text)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

In [None]:
# Multinomial naive Bayes with alpha=1, scored with 5-fold cross-validated F1
# on the training split.
clf = MultinomialNB(alpha=1)
scores = cross_val_score(
    estimator=clf,
    X=train_tfidf,
    y=y_train,
    scoring="f1",
    cv=5,
)
scores

In [None]:
# Fit the classifier on the full training split.
clf.fit(X=train_tfidf, y=y_train)

In [None]:
# F1 score of the fitted classifier on the hold-out split.
holdout_predictions = clf.predict(test_tfidf)
f1_score(y_true=y_test, y_pred=holdout_predictions)

In [None]:
# Predicted labels for the submission set. `test` here is the TF-IDF matrix
# produced by tfidf.transform(test.text), not the original DataFrame.
clf.predict(test)

In [None]:
# Re-read test.csv (without an index column) to show the raw `id` values.
pd.read_csv('../input/nlp-getting-started/test.csv')['id'].values

In [None]:
# Pair every id from test.csv with the label predicted for that row.
test_ids = pd.read_csv('../input/nlp-getting-started/test.csv')['id'].values
predicted_targets = clf.predict(test)
submission = pd.DataFrame({'id': test_ids, 'target': predicted_targets})

In [None]:
# Write the submission to the working directory; index=False keeps the
# DataFrame's row index out of the file so it contains only `id` and `target`.
submission.to_csv('submission.csv', index=False)