**Importing Required Libraries & Tools**

In [None]:
#Standard Library
import collections
import html
import os
import re #Regular Expression
from collections import Counter

#Math and Visualisation Tools
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 

#NLP Tools
import nltk 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

#Modelling Tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

#Printing the full path of every file found under the Kaggle input directory
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

**Reading Train & Test Data**

In [None]:
#Loading the raw train and test CSV files into separate DataFrames
#(paths are relative to the notebook's working directory)
train_set = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_train.csv')
test_set = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_test.csv')

# **Text Preprocessing & EDA**

In [None]:
#Tagging every row with the set it came from so the combined frame can be split
#back later. assign() returns tagged copies, so the raw train_set/test_set frames
#are not mutated and this cell can be re-run safely.
#Combining train and test data for more efficient preprocessing;
#ignore_index=True gives the combined frame a fresh 0..n-1 index.
all_data = pd.concat(
    [train_set.assign(Set='train'), test_set.assign(Set='test')],
    ignore_index=True,
)

#Dropping 'UserName' and 'ScreenName' since they are non-essential
all_data = all_data.drop(columns=['UserName', 'ScreenName'])

#Checking for missing data (only the last expression of a cell is displayed,
#so the previous mid-cell all_data.head() preview was a silent no-op)
all_data.isnull().sum()

In [None]:
#Replacing missing 'Location' values with the placeholder "Unknown"
location_filled = all_data['Location'].fillna("Unknown")
all_data['Location'] = location_filled

#Counting the remaining missing values per column
all_data.isna().sum()

In [None]:
#Collapsing the "Extremely ..." sentiment labels into their base category
for extreme_label, base_label in (('Extremely Positive', 'Positive'),
                                  ('Extremely Negative', 'Negative')):
    all_data['Sentiment'] = all_data['Sentiment'].str.replace(extreme_label, base_label)

#Number of tweets per (regrouped) sentiment
all_data['Sentiment'].value_counts()

**Preliminary visualisation of sentiment distribution**

In [None]:
#Deriving the counts from the data instead of hard-coding them, so the chart
#cannot silently go stale when the dataset or the regrouping above changes.
#value_counts() sorts by frequency (largest first); labels follow the same order.
sentiment_totals = all_data['Sentiment'].value_counts()
sentiment_count = sentiment_totals.tolist()
labels = tuple(sentiment_totals.index)

plt.pie(sentiment_count, labels = labels, autopct = '%1.1f%%')
plt.title('Pie Chart of Sentiment Count')
plt.axis('equal')
plt.show()

In [None]:
#Viewing stopwords
#Copying wordcloud's built-in STOPWORDS into a fresh set so it can be extended
#later without touching the library's own collection
stopwords = set(STOPWORDS)
print(stopwords)

In [None]:
#Adding additional stopwords to include COVID-19-related words.
#The file is read inside a context manager so the handle is always closed;
#split() treats any run of whitespace (spaces or newlines) as a separator.
with open('../input/covid19-stopwords-txt/additional stopwords.txt') as stopwords_file:
    new_words = set(stopwords_file.read().split())
print(new_words)

In [None]:
#Merging the built-in stopwords with the additional COVID-19-related ones
new_stopwords = stopwords | new_words
print(new_stopwords)

**Visualising commonly used hashtags**

In [None]:
def hashtag(tag):
    """Return the hashtags in `tag`, without the leading '#', joined by single spaces.

    A hashtag is the run of word characters that directly follows a '#'.
    An empty string is returned when `tag` contains no hashtags.
    """
    found_tags = [match.group() for match in re.finditer(r'(?<=#)\w+', tag)]
    return " ".join(found_tags)

#Creating column to store tweet hashtags
all_data['Hashtag'] = all_data['OriginalTweet'].apply(hashtag)

#Cleaning hashtag column:
# - every non-letter character becomes a space. regex=True is required: since
#   pandas 2.0 str.replace treats the pattern as a literal string by default,
#   which would leave the hashtags uncleaned. Apostrophes are non-letters too,
#   so no separate apostrophe pass is needed.
# - the result is lowercased
all_data['Hashtag'] = (
    all_data['Hashtag']
    .str.replace(r"[^a-zA-Z]", " ", regex=True)
    .str.lower()
)

In [None]:
#Previewing the first rows of the combined frame
all_data.head()

In [None]:
#Tokenizing column of hashtags: flattening every whitespace-separated hashtag
#of every tweet into one list
tokens_list = [token for tags in all_data.Hashtag for token in tags.split()]

#Frequency of each hashtag token (collections is imported with the other
#libraries at the top of the notebook)
hashtag_counter = collections.Counter(tokens_list)

In [None]:
#Generating WordCloud to visualise commonly used hashtags

#All hashtag tokens joined into the single text blob WordCloud expects
hashtag_text = " ".join(tokens_list)

hashtag_wordcloud = WordCloud(
    background_color = 'white',
    width = 800,
    height = 800,
    min_font_size = 10,
    stopwords = new_stopwords,
).generate(hashtag_text)

#Drawing the cloud on an explicit figure/axes pair with the axes hidden
fig, ax = plt.subplots(figsize = (8, 12))
ax.imshow(hashtag_wordcloud)
ax.axis("off")
plt.show()

**Processing Entire Tweet**

In [None]:
#Placeholder list with one empty string per tweet. The previous version used
#len('OriginalTweet'), i.e. the length of the column *name* (13), not of the column.
processedTweet = ["" for _ in range(len(all_data['OriginalTweet']))]

In [None]:
#List of punctuations to remove. Written as a raw string so the backslash is a
#literal character instead of the invalid escape sequence "\," (same characters
#as before, but without the invalid-escape warning on newer Python versions).
punctuations = r"""!()-![]{};:+'"\,<>./?@#$%^&*_~Â"""


def processing(OriginalTweet, stop_words=None, lemmatizer=None):
    """Clean raw tweets and return them as a list of space-separated tokens.

    Each tweet goes through these steps, in order:
      1. @mentions are removed.
      2. HTML entities such as ``&amp;`` are unescaped.
      3. Links (anything starting with ``http``, so both http and https) are removed.
      4. Numbers, and any suffix glued to them (the "th" in "14th"), are removed.
      5. Every character listed in ``punctuations`` is removed.
      6. The text is lowercased and split on whitespace.
      7. If ``lemmatizer`` is given, each word is passed through ``lemmatizer.lemmatize``.
      8. Words found in ``stop_words`` are dropped.

    Parameters
    ----------
    OriginalTweet : iterable of str
        The raw tweets (a list or a pandas Series). Items are read by iteration,
        so the result does not depend on the Series index labels.
    stop_words : collection of str, optional
        Words to discard. Defaults to the notebook-level ``new_stopwords`` set.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        For example ``nltk.WordNetLemmatizer()``. Defaults to ``None`` (no
        lemmatization), which matches what the previous version actually did:
        it created a lemmatizer but overwrote it before using it.

    Returns
    -------
    list of str
        One cleaned string per input tweet, in input order. Every kept word is
        followed by a single space (so non-empty results end with a space).
    """
    if stop_words is None:
        stop_words = new_stopwords

    #Translation table that deletes every punctuation character in one pass
    strip_punctuation = str.maketrans('', '', punctuations)

    processedTweet = []

    for tweet in OriginalTweet:
        #Completely removes @'s, as other peoples' usernames mean nothing
        tweet = re.sub(r"@\w+", ' ', tweet)

        #Converts leftover HTML entities, such as &amp;, back to plain characters
        tweet = html.unescape(tweet)

        #Removes links (http as well as https), as links provide no data in themselves
        tweet = re.sub(r'http\S+', ' ', tweet)

        #Removes numbers, as well as cases like the "th" in "14th".
        #\S* (not \S+) so that a lone single digit such as "5" is removed too.
        tweet = re.sub(r"\d+\S*", ' ', tweet)

        #Removes the punctuation defined above
        tweet = tweet.translate(strip_punctuation)

        #Turning the tweets lowercase
        tweet = tweet.lower()

        #Splits the tweet into individual words
        tweetWord = tweet.split()

        #Lemmatizes words, only when a lemmatizer was supplied
        if lemmatizer is not None:
            tweetWord = [lemmatizer.lemmatize(word) for word in tweetWord]

        #Keeps only the words that are not stop words
        processedTweet.append("".join([word + " " for word in tweetWord if word not in stop_words]))

    return processedTweet

In [None]:
#Raw tweet text that will be cleaned
OriginalTweet = all_data['OriginalTweet']

#Storing the cleaned tweets in a new column next to the originals
cleaned_tweets = processing(OriginalTweet)
all_data['ProcessedTweet'] = cleaned_tweets
all_data.head()

In [None]:
#Flattening every processed tweet into one long list of words
AllProcessedTweet_list = [
    word
    for processed_tweet in all_data.ProcessedTweet
    for word in processed_tweet.split()
]

#Counting the words once and reusing the counter for the top-10 listing
word_counter = collections.Counter(AllProcessedTweet_list)

word_counter.most_common(10)

In [None]:
#Generating WordCloud to visualise commonly used words

#All processed words joined into the single text blob WordCloud expects
processed_text = " ".join(AllProcessedTweet_list)

word_wordcloud = WordCloud(
    background_color = 'white',
    width = 800,
    height = 800,
    min_font_size = 10,
    stopwords = new_stopwords,
).generate(processed_text)

#Drawing the cloud on an explicit figure/axes pair with the axes hidden
fig, ax = plt.subplots(figsize = (8, 12))
ax.imshow(word_wordcloud)
ax.axis("off")
plt.show()

# **Modeling using Logistic Regression**

**Importing Required Libraries & Tools**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
#Splitting all_data back into train and test set and removing the helper 'Set'
#column. drop() without inplace returns a new frame, which avoids the
#SettingWithCopyWarning raised by modifying a filtered slice in place.
train_set = all_data[all_data.Set == 'train'].drop(columns='Set')

#The test rows sit after the train rows in all_data, so their index is reset to start at 0
test_set = all_data[all_data.Set == 'test'].drop(columns='Set').reset_index(drop=True)

#Shortening AllProcessedTweet_list for convenience
proc_list = AllProcessedTweet_list

**Vectorization of text**

In [None]:
#Splitting data into x and y datasets

x_train = train_set.ProcessedTweet
y_train = train_set.Sentiment

x_test = test_set.ProcessedTweet
y_test = test_set.Sentiment

# Split to train and validation
x_trainset, x_valset, y_trainset, y_valset = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

#Fitting the TF-IDF vocabulary and IDF weights on the training split ONLY.
#Fitting on all_data (train + validation + test) leaks information from the
#held-out tweets into the features and makes the reported scores optimistic.
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,2), stop_words='english').fit(x_trainset)

#The held-out splits are only transformed with the already-fitted vectorizer
x_train_vector = vectorizer.transform(x_trainset)
x_val_vector = vectorizer.transform(x_valset)

In [None]:
#Checking to see if shapes tally: the x and y of each split should have the
#same number of rows (train with train, validation with validation)
x_trainset.shape, y_trainset.shape, x_valset.shape, y_valset.shape

In [None]:
#Total TF-IDF weight of every feature over the training split, flattened to a
#plain 1-D list (works whether sum() returns a 1xN matrix or a 1-D array)
feature_weight = np.asarray(x_train_vector.sum(axis=0)).ravel().tolist()

#get_feature_names() was removed in scikit-learn 1.2 in favour of
#get_feature_names_out(); fall back to the old name on releases that lack the new one
feature_name_getter = getattr(vectorizer, 'get_feature_names_out', None)
if feature_name_getter is None:
    feature_name_getter = vectorizer.get_feature_names

#Single-column frame (column 0 = weight) indexed by the feature text
features = pd.DataFrame(feature_weight, index=list(feature_name_getter()))

#Horizontal bar chart of the 30 heaviest features
features.sort_values(by=[0],ascending=False).head(30).plot.barh(figsize=(20,10))
plt.xlabel('Weight')

In [None]:
#Using cross-validation: 10 folds over the vectorised training split,
#run in parallel on all cores, then averaged into one score
cv_scores = cross_val_score(
    LogisticRegression(random_state=42),
    x_train_vector,
    y_trainset,
    cv=10,
    verbose=1,
    n_jobs=-1,
)
cv_scores.mean()

In [None]:
#Using standard validation: fit on the training split, report on the validation split
model = LogisticRegression(random_state = 42)
model.fit(x_train_vector, y_trainset)

val_predictions = model.predict(x_val_vector)
print(classification_report(y_valset, val_predictions))

**Obtaining parameters for best model**

In [None]:
#Logistic regression with an L1 penalty (C=1) fitted with the 'saga' solver
#on the vectorised training split.
#NOTE(review): these hyperparameters are hard-coded; no GridSearchCV run is
#visible in this notebook, so confirm where the "best" values came from.
best_model = LogisticRegression(C=1, penalty='l1', random_state=42, solver='saga')
best_model.fit(x_train_vector, y_trainset)

In [None]:
# The best model performance on validation dataset:
# prints the classification report for best_model's predictions on x_val_vector
print(classification_report(y_valset, best_model.predict(x_val_vector)))

**Prediction on test data**

In [None]:
#Vectorising the test tweets with the already-fitted vectorizer (transform only, no refit)
x_test_vector = vectorizer.transform(x_test)

In [None]:
#Determining accuracy using accuracy_score: fraction of test tweets whose
#predicted sentiment matches the true label
y_true = y_test
y_pred = best_model.predict(x_test_vector)

test_accuracy = accuracy_score(y_true, y_pred)
print(test_accuracy)

**Visualising prediction using heat map**

In [None]:
#Sorted list of every label that occurs in the truth or the predictions.
#Passing it explicitly fixes the row/column order and lets the heat map show
#class names on both axes instead of the bare indices 0/1/2.
class_labels = sorted(set(y_true) | set(y_pred))

plt.figure(figsize=(10,8))
sns.heatmap(confusion_matrix(y_true, y_pred, labels=class_labels),
            annot=True, fmt='d', annot_kws={'size':17}, cmap='Reds',
            xticklabels=class_labels, yticklabels=class_labels)
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()

**Thank you for your time :)**