In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import unidecode
import nltk

from tensorflow import keras
from keras.preprocessing.text import text_to_word_sequence
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


# Let pandas render up to 300 rows / 300 columns before truncating the output
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)

# Read Dataset using pandas read_csv 

In [None]:
# Locations of the two CSV files shipped with the Kaggle dataset
train_csv_path = '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv'
test_csv_path = '../input/covid-19-nlp-text-classification/Corona_NLP_test.csv'

dataset = pd.read_csv(train_csv_path)
testset = pd.read_csv(test_csv_path)

# About Dataset

Let's now see what our dataset looks like and identify the important features.

In [None]:
# Preview the first rows to see which columns the training data contains
dataset.head()

In [None]:
# (number of rows, number of columns) of the training frame
dataset.shape

`dataset.shape` returns (number of rows, number of columns): there are 41157 rows and 6 columns.

In [None]:
# Summary of the frame's columns (dtypes and non-null counts)
dataset.info()

**Unique Values Of Sentiment Feature which shows our labels to classify**

In [None]:
# Distinct sentiment classes, in order of first appearance
dataset.Sentiment.unique()

In [None]:
# Number of distinct (non-null) locations
dataset['Location'].nunique()

Seems like there are 12220 different locations.

**Now let's look at the null values**

In [None]:
# Count missing values per column
dataset.isna().sum()

There are 8590 null values in Location. Since Location is categorical, we could fill the nulls with the most frequent value in that column. But 8590 is a large share of the rows, and filling them all with one value would bias the dataset heavily towards it. So I will drop the Location feature from the dataset.

In [None]:
# Drop the sparsely populated Location column.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
dataset = dataset.drop(columns='Location', errors='ignore')

In [None]:
# Confirm that the Location column is no longer present
dataset.head()

Let's check what we can identify from ScreenName and UserName feature

In [None]:
# Number of distinct ScreenName values
dataset['ScreenName'].nunique()

In [None]:
# Number of distinct UserName values
dataset['UserName'].nunique()

Both ScreenName and UserName have 41157 unique values, one per row, so they act as row identifiers and carry no information about sentiment.

Let's take a look at the OriginalTweet feature.

# Let's Convert Sentiments into Numeric (Factorized) Labels

In [None]:
# factorize() returns (codes, uniques); keep only the integer codes as the label
sentiment_codes = dataset['Sentiment'].factorize()[0]
dataset['label'] = sentiment_codes

In [None]:
# Check the new integer 'label' column next to the Sentiment text
dataset.head()

This maps each sentiment to an integer code (assigned in order of first appearance):
* 0 - Neutral
* 1 - Positive
* 2 - Extremely Negative
* 3 - Negative 
* 4 - Extremely Positive



# Data Visualization

In [None]:
# Sentiment class names in order of first appearance
target_category = dataset.Sentiment.unique()
target_category

In [None]:
# Rows per sentiment class, largest class first
dataset.groupby('Sentiment')['label'].count().sort_values(ascending=False)

In [None]:
# Bar chart of the number of tweets per sentiment class
dataset.groupby("Sentiment")["Sentiment"].count().plot(kind="bar", ylim=0)

In [None]:
# Pie chart of each sentiment class's share of the tweets
sentiment_shares = dataset['Sentiment'].value_counts()
sentiment_shares.plot.pie(y='label', figsize=(10,8), autopct='%1.1f%%')
plt.show()

In [None]:
# Look at the first ten raw tweets before any cleaning
tweets = dataset['OriginalTweet']
tweets.head(10)

These tweets contain a lot of special characters and other noise that is unhelpful for training, so let's do some data preprocessing.

# Data Preprocessing

In [None]:
def processing(text):
    """Clean one raw tweet and return it as a single space-separated string.

    Steps, in order: Keras word tokenisation, gensim stop-word removal,
    digit removal, whitespace collapsing, transliteration of accented
    characters (û -> u), POS-aware WordNet lemmatisation, Porter stemming,
    and finally dropping one-character tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Tokenise with the Keras text-to-word-sequence tokenizer, then rebuild a
    # plain string because the gensim stop-word helper works on strings.
    tokenized_text = text_to_word_sequence(text)
    text = ' '.join(tokenized_text)
    text = text.replace("'", "")
    stop_word_removed_text = remove_stopwords(text)

    # Remove every digit character.
    number_removed_text = ''.join(
        ch for ch in stop_word_removed_text if not ch.isdigit()
    )

    # Collapse runs of whitespace; str.split() alone is enough for this,
    # so no tokenizer pass is needed just to normalise spacing.
    extra_whitespace_removed = ' '.join(number_removed_text.split())

    # Convert accented characters (û -> u).
    accented_removed_text = unidecode.unidecode(extra_whitespace_removed)

    # Lemmatization: map the first letter of the tag returned by nltk.pos_tag
    # to the POS constant that lemmatize() accepts.
    lemmatizer = WordNetLemmatizer()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        # Each word is tagged on its own (no sentence context);
        # unknown tags fall back to NOUN.
        tag = nltk.pos_tag([word])[0][1][0].upper()
        return tag_dict.get(tag, wordnet.NOUN)

    lem_input = nltk.word_tokenize(accented_removed_text)
    lem_text = ' '.join(
        lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in lem_input
    )

    # Stemming with the Porter stemmer on the re-tokenised lemmas.
    stemmer = PorterStemmer()
    stem_input = nltk.word_tokenize(lem_text)
    stem_text = ' '.join(stemmer.stem(word) for word in stem_input)

    # Remove single-letter tokens.
    preprocessed_text = ' '.join(w for w in stem_text.split() if len(w) > 1)

    return preprocessed_text
        


In [None]:
# Clean every training tweet and overwrite the raw text column with the result
cleaned_train_tweets = dataset['OriginalTweet'].apply(processing)
dataset['OriginalTweet'] = cleaned_train_tweets

In [None]:
# Model input: the cleaned tweet text
tweets = dataset.OriginalTweet
tweets.head()

In [None]:
# Model target: the sentiment class name of each tweet
sentiment = dataset['Sentiment']

# Split test/train sets

Let's now split the dataset into train and test sets with a test size of 0.3, i.e. 70% of the data is used for training and 30% for testing. You can use an 80%/20% split as well.

In [None]:
# Hold out 30% of the tweets for validation; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    tweets,
    sentiment,
    test_size=0.3,
    random_state=60,
    shuffle=True,
)

# Report the size of each split
for split in (X_train, X_test):
    print(len(split))

# SGD Classifier

Let's train our model with an SGD classifier. For this I have used a pipeline with a TF-IDF vectorizer, which handles the vectorization: it converts the words in our preprocessed dataset into a matrix of TF-IDF features. The model is then trained on those features.

In [None]:
# TF-IDF features feeding a linear classifier trained with SGD.
# random_state (same seed as the train/test split) makes the fitted model,
# and therefore the printed scores, reproducible across runs.
sgd = Pipeline([('tfidf', TfidfVectorizer()),
                ('sgd', SGDClassifier(random_state=60)),
               ])

sgd.fit(X_train, Y_train)

test_predict = sgd.predict(X_test)

# Accuracies as whole-number percentages
train_accuracy = round(sgd.score(X_train, Y_train) * 100)
test_accuracy = round(accuracy_score(Y_test, test_predict) * 100)

print("SGD Train Accuracy Score : {}% ".format(train_accuracy ))
print("SGD Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
# classification_report expects (y_true, y_pred); swapping them swaps
# precision and recall. The labels are already readable strings, so the
# report derives the row names from them instead of a separately ordered list,
# which would attach the wrong name to each row.
print(classification_report(Y_test, test_predict))

# Test Set

In [None]:
# Preview the first rows of the separate test file
testset.head()

# Test the data set with the sgd model

In [None]:
# Clean the test tweets with the same function used on the training tweets
cleaned_test_tweets = testset['OriginalTweet'].apply(processing)
testset['OriginalTweet'] = cleaned_test_tweets

# Predict a sentiment class for every cleaned test tweet
tweet = testset['OriginalTweet']
y_predict = sgd.predict(tweet)


In [None]:
# True sentiment classes of the test tweets
test_sentiments = testset.Sentiment

# Check Accuracy

In [None]:
# Fraction of test tweets classified correctly, shown as a whole-number percentage
fraction_correct = accuracy_score(test_sentiments, y_predict)
test_accuracy = round(fraction_correct * 100)
print("SGD Classifier Test Accuracy Score  : {}% ".format(test_accuracy))

**Since this is for beginners, I have only used the SGD classifier. I'll soon come back with an LSTM model for this dataset. I hope you learned something. Please upvote if you did, and leave feedback. Good luck!**