In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import unidecode
import nltk

from tensorflow import keras
from keras.preprocessing.text import text_to_word_sequence
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Location of the competition files, relative to the notebook's working directory.
path = '../input/nlp-getting-started/'

# Data files
train_data_file = path + 'train.csv'
test_data_file = path + 'test.csv'

# Load the training data (with the `target` column) and the test data to predict on.
dataset = pd.read_csv(train_data_file)
test_set = pd.read_csv(test_data_file)

# Preview the training data. This must be the last expression in the cell:
# a bare `dataset.head()` anywhere else is evaluated and silently discarded.
dataset.head()


In [None]:
# Class labels as strings, used as `target_names` in the classification reports.
# `unique()` returns values in order of first appearance, whereas
# `classification_report` lists classes in sorted label order, so sort here to
# keep each display name attached to the right class.
target_category = sorted(dataset["target"].unique())
target_category = [str(label) for label in target_category]
print(target_category)



In [None]:
# Row counts of the training and test frames, one per line.
print(len(dataset), len(test_set), sep="\n")

In [None]:
# Keep only the two columns used downstream. The explicit copy makes `dataset`
# an independent frame, so the later in-place assignment to dataset['text']
# does not write into a slice of the original frame (SettingWithCopyWarning).
dataset = dataset[['text', 'target']].copy()
dataset.head()

In [None]:
# Number of rows per target class, drawn as a bar chart with the y-axis starting at 0.
class_counts = dataset.groupby("target")["target"].count()
class_counts.plot.bar(ylim=0)

In [None]:
# Pull out the text column for a quick look before any cleaning is applied.
text = dataset.loc[:, "text"]
text.head(n=5)

In [None]:
# Show one example (row position 31) in full.
sample_text = text.iloc[31]
print(sample_text)

**Text Preprocessing**

In [None]:
def processing(text):
    """Normalise one raw text string into a space-separated string of stemmed tokens.

    Steps, in order:
      1. tokenise with Keras ``text_to_word_sequence``;
      2. drop apostrophes and remove stop words with gensim ``remove_stopwords``;
      3. delete every digit character;
      4. collapse runs of whitespace;
      5. transliterate accented characters with ``unidecode`` (e.g. û -> u);
      6. lemmatise each token, using a POS tag from ``nltk.pos_tag``;
      7. stem each token with the Porter stemmer;
      8. drop tokens that are a single character long.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    # Tokenization using the keras text-to-word-sequence tokenizer.
    tokenized_text = text_to_word_sequence(text)

    # Stop word removal using remove_stopwords from gensim. Apostrophes are
    # deleted first, so "don't" becomes "dont" before the stop-word filter runs.
    joined_text = ' '.join(tokenized_text)
    joined_text = joined_text.replace("'", "")
    stop_word_removed_text = remove_stopwords(joined_text)

    # Remove numbers: every digit character is deleted, including digits
    # embedded inside a longer token.
    number_removed_text = ''.join(ch for ch in stop_word_removed_text if not ch.isdigit())

    # Remove extra white spaces: split() with no argument discards empty
    # pieces, so re-joining leaves exactly one space between tokens.
    extra_whitespace_removed = ' '.join(number_removed_text.split())

    # Convert accented characters (û -> u).
    accented_removed_text = unidecode.unidecode(extra_whitespace_removed)

    # Lemmatization.
    lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        # The word is tagged on its own, without any sentence context.
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        # Any tag outside the four mapped groups is treated as a noun.
        return tag_dict.get(tag, wordnet.NOUN)

    lem_input = nltk.word_tokenize(accented_removed_text)
    lem_text = ' '.join(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in lem_input)

    # Stemming.
    stemmer = PorterStemmer()
    stem_input = nltk.word_tokenize(lem_text)
    stem_text = ' '.join(stemmer.stem(word) for word in stem_input)

    # Remove single letters.
    preprocessed_text = ' '.join(w for w in stem_text.split() if len(w) > 1)

    return preprocessed_text
        
# Clean every training text and store the result back in the 'text' column.
cleaned_train_text = dataset['text'].apply(processing)
dataset['text'] = cleaned_train_text


In [None]:
# Re-bind `text` to the cleaned column and preview the first ten rows.
text = dataset.loc[:, 'text']
text.head(n=10)

In [None]:
# Labels that go with `text`, row for row.
target = dataset.loc[:, "target"]
target.head(n=5)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the shuffle repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    text,
    target,
    test_size=0.3,
    shuffle=True,
    random_state=60,
)

# Sizes of the training and held-out splits, one per line.
print(len(X_train), len(X_test), sep="\n")

**Naive Bayes Classifier**

In [None]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
nb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, Y_train)

test_predict = nb.predict(X_test)

# Accuracy as a whole-number percentage on the training and held-out splits.
train_accuracy = round(nb.score(X_train, Y_train) * 100)
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)

print("Naive Bayes Train Accuracy Score : {}% ".format(train_accuracy ))
print("Naive Bayes Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and reports support for the predictions instead
# of the true labels.
print(classification_report(Y_test, test_predict, target_names=target_category))



**Support Vector Machine Classifier**

In [None]:
# TF-IDF features feeding a linear classifier trained with stochastic gradient descent.
# SGD shuffles the training data, so a fixed seed (the same value used for the
# train/test split) keeps the reported scores repeatable across runs.
sgd = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', SGDClassifier(random_state=60)),
               ])

sgd.fit(X_train, Y_train)

test_predict = sgd.predict(X_test)

# Accuracy as a whole-number percentage on the training and held-out splits.
train_accuracy = round(sgd.score(X_train, Y_train) * 100)
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)

print("SVM Train Accuracy Score : {}% ".format(train_accuracy ))
print("SVM Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and reports support for the predictions instead
# of the true labels.
print(classification_report(Y_test, test_predict, target_names=target_category))




**Decision Tree Classifier**

In [None]:
# TF-IDF features feeding a decision tree.
# Tree construction is randomised, so a fixed seed (the same value used for the
# train/test split) keeps the reported scores repeatable across runs.
dt = Pipeline([('tfidf', TfidfVectorizer()),
                ('dt', DecisionTreeClassifier(random_state=60)),
               ])

dt.fit(X_train, Y_train)

test_predict = dt.predict(X_test)

# Accuracy as a whole-number percentage on the training and held-out splits.
train_accuracy = round(dt.score(X_train, Y_train) * 100)
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)

print("Decision Tree Train Accuracy Score : {}% ".format(train_accuracy ))
print("Decision Tree Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and reports support for the predictions instead
# of the true labels.
print(classification_report(Y_test, test_predict, target_names=target_category))


**K-Nearest Neighbour**

In [None]:
# TF-IDF features feeding a 5-nearest-neighbour classifier with Euclidean distance.
knn = Pipeline([('tfidf', TfidfVectorizer()),
                ('knn', KNeighborsClassifier(n_neighbors=5, metric='euclidean')),
               ])

knn.fit(X_train, Y_train)

test_predict = knn.predict(X_test)

# Accuracy as a whole-number percentage on the training and held-out splits.
train_accuracy = round(knn.score(X_train, Y_train) * 100)
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)

print("K-Nearest Neighbour Train Accuracy Score : {}% ".format(train_accuracy ))
print("K-Nearest Neighbour Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and reports support for the predictions instead
# of the true labels.
print(classification_report(Y_test, test_predict, target_names=target_category))


**Logistic Regression**

In [None]:

# TF-IDF features feeding a logistic regression classifier.
lr = Pipeline([('tfidf', TfidfVectorizer()),
                ('lr', LogisticRegression()),
               ])

lr.fit(X_train, Y_train)

test_predict = lr.predict(X_test)

# Accuracy as a whole-number percentage on the training and held-out splits.
train_accuracy = round(lr.score(X_train, Y_train) * 100)
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)

print("Logistic regression Train Accuracy Score : {}% ".format(train_accuracy ))
print("Logistic regression  Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and reports support for the predictions instead
# of the true labels.
print(classification_report(Y_test, test_predict, target_names=target_category))


**Random Forest Classifier**

In [None]:
# TF-IDF features feeding a random forest of 100 trees.
# The forest is randomised, so a fixed seed (the same value used for the
# train/test split) keeps the reported scores repeatable across runs.
rfc = Pipeline([('tfidf', TfidfVectorizer()),
                ('rfc', RandomForestClassifier(n_estimators=100, random_state=60)),
               ])

rfc.fit(X_train, Y_train)

test_predict = rfc.predict(X_test)

# Accuracy as a whole-number percentage on the training and held-out splits.
train_accuracy = round(rfc.score(X_train, Y_train) * 100)
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)

print("Random Forest Classifier Train Accuracy Score : {}% ".format(train_accuracy ))
print("Random Forest Classifier Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and reports support for the predictions instead
# of the true labels.
print(classification_report(Y_test, test_predict, target_names=target_category))





**Naive Bayes as the final model**

In [None]:
# First five rows of the test data.
test_set.head(n=5)

In [None]:
# Ids are kept aside so they can be paired with the predictions afterwards.
test_id = test_set['id']

# Clean the test text with the same function used for the training text,
# then classify it with the fitted Naive Bayes pipeline.
test_set['text'] = test_set['text'].apply(processing)
test_text = test_set['text']
y_prdict = nb.predict(test_text)



In [None]:
# Pair each test id with its predicted label, in row order.
submission = pd.DataFrame({'id': list(test_id), 'target': list(y_prdict)})
submission.head(n=20)

In [None]:
# Write the predictions to the working directory, without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)