In [None]:
"""Columns
id - a unique identifier for each tweet
text - the text of the tweet
location - the location the tweet was sent from (may be blank)
keyword - a particular keyword from the tweet (may be blank)
target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)"""

In [None]:
#Importing all libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from wordcloud import WordCloud
import missingno


%matplotlib inline
import random
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB,CategoricalNB
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer


from sklearn import preprocessing
from sklearn.manifold import TSNE
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression




from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from time import time
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection

In [None]:
train = pd.read_csv("../input/nlp-getting-started/train.csv")
test= pd.read_csv("../input/nlp-getting-started/test.csv")

In [None]:
#No of rows and columns
train.shape

In [None]:
#Prints first 5 rows
train.head(5)

In [None]:
train["location"].value_counts()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
#checking for null values
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
#Remove redundant samples
train=train.drop_duplicates(subset=['text', 'target'], keep='first')
train.shape

In [None]:
#plot graph for missing values

missingno.matrix(train, figsize = (5,5))

In [None]:
#The location column has the most null values

In [None]:
train.isna().sum().plot(kind="bar")
plt.title("no of null values in train data")
plt.show()

In [None]:
#Location has the most null values
#Keyword has the second-most null values

In [None]:
# Keyword frequency table; value_counts() orders it from most to least common.
keywords_vc = pd.DataFrame({"Count": train["keyword"].value_counts()})
# Slice the 30 most frequent keywords once and reuse the slice for both axes.
top_keywords = keywords_vc.iloc[:30]
sns.barplot(x=top_keywords["Count"], y=top_keywords.index, orient='h')
plt.title("Top 30 Keywords")
plt.show()

In [None]:
# Keyword frequencies per class (target == 1: disaster, target == 0: not).
disaster_keywords = train.loc[train["target"] == 1, "keyword"].value_counts()
nondisaster_keywords = train.loc[train["target"] == 0, "keyword"].value_counts()

fig, ax = plt.subplots(1, 2, figsize=(20, 8))
# One entry per panel: (axes, keyword counts, colour palette, panel title).
panels = (
    (ax[0], disaster_keywords, "Reds_d", "Top 30 Keywords - Disaster Tweets"),
    (ax[1], nondisaster_keywords, "Blues_d", "Top 30 Keywords - Non-Disaster Tweets"),
)
for panel_ax, counts, palette, title in panels:
    top = counts.iloc[:30]
    sns.barplot(y=top.index, x=top, orient='h', ax=panel_ax, palette=palette)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Keyword Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Remove the location and keyword columns from both splits; the cells below
# only work with the tweet text (and the target in train).
train = train.drop(columns=['location', 'keyword'])
test = test.drop(columns=['location', 'keyword'])

In [None]:
train.head(5)

In [None]:
#Let's check how many tweets are about real disasters and how many are not ("fake")

tweetreal = len(train[train["target"]==1])

In [None]:
#percentage of real tweets
RealTweetPercentage = tweetreal/train.shape[0]*100
RealTweetPercentage

In [None]:
#Percentage of fake tweet
FakeTweetPercentage = 100-RealTweetPercentage
FakeTweetPercentage

In [None]:
#plot target variables
sns.countplot(x ='target', data= train)

In [None]:
# Compare the distribution of tweet lengths (number of characters) in the
# train and test sets.

den_train = train['text'].str.len()
den_test = test['text'].str.len()

fig, ax = plt.subplots()
# Semi-transparent bars so the smaller test histogram is not hidden behind
# (or does not hide) the train histogram.
ax.hist(den_train, alpha=0.6, label="train_tweets")
ax.hist(den_test, alpha=0.6, label="test_tweets")
ax.set_xlabel("Tweet length (characters)")
ax.set_ylabel("Number of tweets")
ax.set_title("Tweet length: train vs test")
# The labels above are only displayed when a legend is drawn.
ax.legend()
plt.show()

In [None]:
#The train set contains more tweets than the test set.

In [None]:
#Fetch the word count for each tweet
# str.split() with no argument splits on any run of whitespace, so repeated
# spaces do not add empty "words" and newlines separate words too.
# astype(str) keeps the original behaviour of stringifying non-string values.
train['word_count'] = train['text'].astype(str).str.split().str.len()
train[['text','word_count']].head(10)

In [None]:
##Descriptive statistics of word counts
train.word_count.describe()

#The average word count is about 15 words per tweet. In the original run the word count ranged from a minimum of 1 to a maximum of 54.

In [None]:
#Identify common words: Its the frequently used words as well as it could be potential data specific stop words.
import pandas as pd

freq = pd.Series(' '.join(train['text']).split()).value_counts()[:20]
freq

In [None]:
#Identify uncommon words: Uncommon words in the train dataset
freq1 =  pd.Series(' '.join(train ['text']).split()).value_counts()[-20:]
freq1

In [None]:
#Disaster tweets: print a small sample of tweets labelled target == 1

disaster_tweets = train[train['target'] ==1 ]['text']
# Iterate positionally. disaster_tweets keeps the row labels of `train`, which
# have gaps after filtering and drop_duplicates, so disaster_tweets[i] (a label
# lookup) raises KeyError whenever label i is missing. The previous
# range(1, 10) also skipped the first tweet.
for tweet in disaster_tweets.head(10):
    print(tweet)

In [None]:
# non-disaster tweets
non_disaster_tweets = train[train['target'] !=1 ]['text']
non_disaster_tweets

In [None]:
#wordcloud

# Side-by-side word clouds of the tweet text, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[20, 5])

# Both clouds share the same rendering options.
cloud_options = dict(background_color='white', width=600, height=400)
wordcloud1 = WordCloud(**cloud_options).generate(" ".join(disaster_tweets))
wordcloud2 = WordCloud(**cloud_options).generate(" ".join(non_disaster_tweets))

for axis, cloud, title in ((ax1, wordcloud1, 'Disaster Tweets'),
                           (ax2, wordcloud2, 'Non Disaster Tweets')):
    axis.imshow(cloud)
    axis.axis('off')
    axis.set_title(title, fontsize=20)


In [None]:
train.head(5)

In [None]:
import nltk
nltk.download('stopwords')

Cleaning

Now lets do **Text-preprocessing**:
1. Reduce sparsity
2. Text clean-up
3. Shrink the vocabulary to retain only the relevant words

Text preprocessing
1: Noise Removal

      a: Removing redundant text components

      b: Punctuations, Tags, URL, stopwords

2: Normalization

      a: Stemming - Remove suffixes
      b: Lemmatization- Works based on the roots of the word 






In [None]:
#Normalization is the method of handling multiple occurrences of the same word
#Stemming normalizes text by removing suffixes.
#Lemmatisation is a more advanced technique which works based on the root of the word.


We have 92 redundant samples in our dataset (7613 - 7521 = 92).

---



In [None]:
#After removing all the redundant samples, let's check the target counts
train.target.value_counts()

In [None]:
stopwords.words('english')

In [None]:
#List of punctuations and we will remove them from our corpus
import string
string.punctuation

In [None]:
#Cleaning data

In [None]:
def clean_text(text):
    """Normalise a raw tweet string before tokenisation.

    Steps, applied in order:
      1. lower-case the text
      2. drop anything enclosed in square brackets
      3. drop URLs (``http://``, ``https://`` or ``www.`` prefixes)
      4. drop ``<...>`` tags
      5. drop every character in ``string.punctuation``
      6. replace newlines with a single space
      7. drop every word that contains a digit

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by removed pieces is not
        collapsed or stripped.
    """
    text = text.lower()
    # Raw strings keep regex escapes such as \[ \S \w \d from being treated as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space; deleting them would glue the last word of
    # one line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run the same cleaning routine over the tweet text of both splits.
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [None]:
train['text'].head()

In [None]:
tweets = train["text"]

In [None]:
#Tokenize


# Split every tweet into tokens; each token is a maximal run of \w characters.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
train['text'] = train['text'].apply(tokenizer.tokenize)
test['text'] = test['text'].apply(tokenizer.tokenize)
train['text'].head()

In [None]:
#Removing stopwords


# Lazily-built cache of NLTK's English stop-word list (filled on first use).
_ENGLISH_STOPWORDS = None


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        The tokens of one tweet.
    stop_words : collection of str, optional
        Words to drop. When omitted, ``stopwords.words('english')`` is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # Load the list once and keep it as a set. The previous version
            # called stopwords.words('english') again for every single token.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [w for w in text if w not in stop_words]
# Filter the stop words out of the token list of every tweet in both splits.
train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)
test.head()

In [None]:
# Print a small sample of the processed disaster tweets (target == 1).
disaster_tweets = train[train['target'] ==1 ]['text']
# Iterate positionally: the Series keeps the row labels of `train`, so the
# label lookup disaster_tweets[i] raises KeyError when label i is missing.
# The previous range(1, 10) also skipped the first tweet.
for tweet in disaster_tweets.head(10):
    print(tweet)

In [None]:
# non-disaster tweets
non_disaster_tweets = train[train['target'] !=1 ]['text']
non_disaster_tweets.head()

In [None]:
#Lemmatization

# Download only the named resource. A bare nltk.download() call opens NLTK's
# interactive downloader, which stalls a non-interactive "Run All".
import nltk
nltk.download('averaged_perceptron_tagger')

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
# lemmatization
lem = WordNetLemmatizer()


def lem_word(x):
    """Lemmatize every token in ``x``.

    Each token is passed to ``lem.lemmatize`` with its default arguments.

    Parameters
    ----------
    x : iterable of str
        The tokens of one tweet.

    Returns
    -------
    list of str
        The lemmatized tokens, in the same order.
    """
    return list(map(lem.lemmatize, x))

    

In [None]:
train['text'] = train['text'].apply(lem_word)
test['text'] = test['text'].apply(lem_word)

In [None]:
def combine_text(list_of_text):
    """Join the given tokens into one string separated by single spaces."""
    return ' '.join(list_of_text)

# Turn each token list back into a single string; the vectorizers fitted
# below are given train['text'] / test['text'] directly.
train['text'] = train['text'].apply(combine_text)
test['text'] = test['text'].apply(combine_text)
# The stray `train['text']` expression was removed: only the last expression
# of a cell is displayed, so it had no effect.
train.head()

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
stem = PorterStemmer()

# The bare nltk.download() call was removed: without an argument it opens
# NLTK's interactive downloader and stalls a non-interactive "Run All".
# Resources are downloaded by name in the cells that need them.

In [None]:
count_vectorizer = CountVectorizer()
train_vector = count_vectorizer.fit_transform(train['text'])
test_vector = count_vectorizer.transform(test['text'])
print(train_vector[0].todense())

In [None]:
#TF IDF

tfidf = TfidfVectorizer(min_df = 2,max_df = 0.5,ngram_range = (1,2))
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test['text'])



In [None]:
test_tfidf

In [None]:
mnb = MultinomialNB(alpha = 2.0)
scores_vector = model_selection.cross_val_score(mnb,train_vector,train['target'],cv = 10,scoring = 'f1')
print("score:",scores_vector)
scores_tfidf = model_selection.cross_val_score(mnb,train_tfidf,train['target'],cv = 10,scoring = 'f1')
print("score of tfidf:",scores_tfidf)

In [None]:
mnb.get_params()

In [None]:
#Logistic Regression

In [None]:
lg = LogisticRegression(C = 1.0)
scores_vector = model_selection.cross_val_score(lg, train_vector, train["target"], cv = 5, scoring = "f1")
print("score:",scores_vector)
scores_tfidf = model_selection.cross_val_score(lg, train_tfidf, train["target"], cv = 5, scoring = "f1")
print("score of tfidf:",scores_tfidf)

In [None]:
lg.get_params()

In [None]:
mnb.fit(train_tfidf, train["target"])
y_pred = mnb.predict(test_tfidf)

In [None]:
y_pred

Submission

In [None]:
# Use the lowercase "id" header so the submission matches the id column name
# of the input data (test['id']); the previous header was "Id".
# NOTE(review): confirm against the competition's sample_submission.csv.
submission_file = pd.DataFrame({'id': test['id'], 'target': y_pred})

In [None]:
submission_file.to_csv('submission_file.csv',index=False)

In [None]:
submission_file = pd.read_csv('submission_file.csv')

In [None]:
submission_file.head(10)

Text preparation

In [None]:
"""Text in the corpus needs to be converted to a format that can be interpreted by the machine learning algorithms. There are 2 parts of this conversion — Tokenisation and Vectorisation."""

In [None]:
"""Tokenisation is the process of converting the continuous text into a list of words. The list of words is then converted to a matrix of integers by the process of vectorisation. Vectorisation is also called feature extraction."""

In [None]:
"""For text preparation we use the bag of words model which ignores the sequence of the words and only considers word frequencies."""