# Upload Data Set

In [1]:
!pip install kaggle



In [2]:
! mkdir ~/.kaggle

In [3]:
# Copy kaggle.json (the Kaggle API credentials file) from the working directory into ~/.kaggle/.
! cp kaggle.json ~/.kaggle/

In [4]:
# Mode 600: only the owning user may read or write the credentials file.
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the datatattle/covid-19-nlp-text-classification dataset as a zip archive with the Kaggle CLI.
!kaggle datasets download -d datatattle/covid-19-nlp-text-classification

Downloading covid-19-nlp-text-classification.zip to /content
100% 4.38M/4.38M [00:00<00:00, 23.1MB/s]



In [6]:
!unzip \*.zip && rm *.zip

Archive:  covid-19-nlp-text-classification.zip
  inflating: Corona_NLP_test.csv     
  inflating: Corona_NLP_train.csv    


### Adding Required Libraries

In [7]:
import pandas as pd

### Load Data Set

In [8]:
# Read both CSV splits, decoding each file as ISO-8859-1.
train, test = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in ("Corona_NLP_train.csv", "Corona_NLP_test.csv")
)

In [9]:
train.info() # sanity check: column names, dtypes and non-null counts of the training split

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


# Normalization 3

1. Deleting Duplicates
2. Deleting Unnecessary Columns
3. Normalizing Tweets
*   Remove all mentions and URLs
*   Remove all \n and \r
*   Remove all special characters (&,$,£,¥)
*   Remove hashtags at the end of the tweet and keep mid-sentence hashtags without the # symbol
*   Remove multiple spaces
4.   Lowercase
5. Create 3-sentiment and 5-sentiment data sets


### Adding Required Libraries

In [10]:
import re,string

### Deleting Duplicate Tweets

In [11]:
# Drop rows whose OriginalTweet text repeats an earlier row (pandas keeps the first
# occurrence by default). inplace=True modifies the existing frames instead of
# returning new ones; the row index is not renumbered afterwards.
train.drop_duplicates(subset="OriginalTweet",inplace=True)
test.drop_duplicates(subset="OriginalTweet",inplace=True)

### Deleting Unnecessary Columns

In [12]:
# Keep only the tweet text and its sentiment label in both splits.
train, test = (
    frame[["OriginalTweet", "Sentiment"]]
    for frame in (train, test)
)

### Normalization

#### Remove Mentions and URLS

In [13]:
def remove_mention_URL(tweet):
  """Delete @mentions and http(s) URLs from a tweet.

  Each ``@``, ``http://`` or ``https://`` is removed together with the run of
  non-whitespace characters that follows it (at least one is required).
  Surrounding whitespace is left in place, so the result can contain
  consecutive spaces.
  """
  mention_or_url = re.compile(r"(?:\@|https?\://)\S+")
  return mention_or_url.sub("", tweet)

In [15]:
# Demo: run remove_mention_URL on a sample tweet and show the text before and after.
# print() puts a space between its arguments, so each printed tweet starts with one extra space.
tweet = "This is a test tweet @user https://www.youtube.com @user2 and of test tweet @user3"
print("Tweet Before\n",tweet)
tweet = remove_mention_URL(tweet)
print("Tweet After\n",tweet)

Tweet Before
 This is a test tweet @user https://www.youtube.com @user2 and of test tweet @user3
Tweet After
 This is a test tweet    and of test tweet 


#### Remove New Lines 

In [18]:
def remove_newline(tweet):
  """Return ``tweet`` with every carriage return and line feed replaced by a space."""
  for line_break in ('\r', '\n'):
    tweet = tweet.replace(line_break, ' ')
  return tweet

In [19]:
# Demo: run remove_newline on a string containing \n and \r and show it before and after.
tweet = "This\nis\ra\n\rtest\ntweet\r"
print("Tweet Before\n",tweet)
tweet = remove_newline(tweet)
print("Tweet After\n",tweet)

Tweet Before
 This
isa
test
tweet
Tweet After
 This is a  test tweet 


#### Remove All Special Characters (&,$,£,¥)

In [20]:
def remove_char(tweet):
  """Return ``tweet`` with each ``$``, ``£``, ``&`` and ``¥`` replaced by a single space."""
  symbol_to_space = str.maketrans({'$': ' ', '£': ' ', '&': ' ', '¥': ' '})
  return tweet.translate(symbol_to_space)

In [21]:
# Demo: run remove_char on a string containing $, £, ¥ and & and show it before and after.
tweet = "This is a $Dolar a £Euro ¥Yen and& end of $tw££t&"
print("Tweet Before\n",tweet)
tweet = remove_char(tweet)
print("Tweet After\n",tweet)

Tweet Before
 This is a $Dolar a £Euro ¥Yen and& end of $tw££t&
Tweet After
 This is a  Dolar a  Euro  Yen and  end of  tw  t 


#### Remove Hashtag

In [22]:
def remove_hashtag(tweet):
  """Drop trailing hashtags and un-hash the remaining ones.

  Two passes:
    1. Hashtags that form the tail of the tweet (only whitespace and other
       hashtags follow them) are removed entirely.
    2. Every remaining ``#`` and every ``_`` is treated as a separator: the
       text is split on it, each piece is stripped, and the pieces are joined
       with single spaces (so ``#word`` becomes ``word`` and ``a_b`` becomes
       ``a b``).

  The result can start with a space when the tweet begins with ``#``.
  """
  # Raw strings keep \w and \s as regex escapes instead of invalid Python string
  # escapes. The former (?!(?:hashtag)\b) lookahead is omitted: in a non-raw
  # literal \b was a backspace character, so the lookahead never excluded a match.
  trailing_hashtag = r'#[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
  new_tweet = " ".join(word.strip() for word in re.split(trailing_hashtag, tweet)) #remove last hashtags
  new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
  return new_tweet2

In [24]:
# Demo: run remove_hashtag on a tweet with mid-sentence and trailing hashtags and show it before and after.
tweet = "#This is #a tweet #Hashtag #Tweet"
print("Tweet Before\n",tweet)
tweet = remove_hashtag(tweet)
print("Tweet After\n",tweet)

Tweet Before
 #This is #a tweet #Hashtag #Tweet
Tweet After
  This is a tweet


#### Remove Multiple Spaces

In [25]:
def remove_mult_spaces(tweet):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character is left unchanged, including a lone tab or
    newline, and leading or trailing runs are collapsed rather than removed.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", tweet)

In [26]:
# Demo: run remove_mult_spaces on a string with repeated spaces and show it before and after.
tweet = "This    is a   test  tweet     "
print("Tweet Before\n",tweet)
tweet = remove_mult_spaces(tweet)
print("Tweet After\n",tweet)

Tweet Before
 This    is a   test  tweet     
Tweet After
 This is a test tweet 


#### Making it all one function

In [29]:
def normalize(tweet):
  """Run the complete tweet-cleaning pipeline and return the lower-cased result.

  The steps are applied in this order: remove_mention_URL, remove_newline,
  remove_char, remove_hashtag, remove_mult_spaces, and finally ``str.lower``.
  """
  pipeline = (
    remove_mention_URL,
    remove_newline,
    remove_char,
    remove_hashtag,
    remove_mult_spaces,
  )
  for step in pipeline:
    tweet = step(tweet)
  return tweet.lower()

In [32]:
# Demo: run the full normalize pipeline on a tweet that mixes mentions, line breaks,
# special characters, hashtags and upper-case text, and show it before and after.
tweet = "@aydcsr\r #This\n£is $a  TEST\n #Tweet @user123 #COVID"
print("Tweet Before\n",tweet)
tweet = normalize(tweet)
print("Tweet After\n",tweet)

Tweet Before
 @aydcsr #This
£is $a  TEST
 #Tweet @user123 #COVID
Tweet After
  this is a test


#### Train

In [33]:
# Preview the training tweets before normalization.
train.head(300)

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
295,Oregon Gov. Kate Brown on Monday afternoon ord...,Negative
296,#Coronavirus - #Europe calls for calm as food ...,Negative
297,When even Amazon is struggling to deliver food...,Extremely Negative
298,Anthony Fauci said that while some people may ...,Negative


In [34]:
# Normalize every training tweet with a single column-wide assignment.
# Assigning through train.iloc[i]["OriginalTweet"] is chained indexing: the value
# is written to the temporary row object returned by iloc, which is not guaranteed
# to reach `train` and never does under pandas copy-on-write.
train["OriginalTweet"] = train["OriginalTweet"].map(normalize)

In [37]:
# Preview the training tweets after normalization.
train.head(300)

Unnamed: 0,OriginalTweet,Sentiment
0,and and,Neutral
1,advice talk to your neighbours family to excha...,Positive
2,coronavirus australia: woolworths to give elde...,Positive
3,my food stock is not the only one which is emp...,Positive
4,"me, ready to go at supermarket during the covi...",Extremely Negative
...,...,...
295,oregon gov. kate brown on monday afternoon ord...,Negative
296,coronavirus - europe calls for calm as food s...,Negative
297,when even amazon is struggling to deliver food...,Extremely Negative
298,anthony fauci said that while some people may ...,Negative


#### Test

In [38]:
# Preview the test tweets before normalization.
test.head(300)

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...
295,The Coronavirus virus has people buying up all...,Extremely Positive
296,"retail notes growth in demand for groceries, c...",Positive
297,#Covid_19 stole Bernie's platform- it lowered ...,Positive
298,I have PTSD from surviving Hurricane Katrina.\...,Positive


In [39]:
# Normalize every test tweet with a single column-wide assignment.
# Assigning through test.iloc[i]["OriginalTweet"] is chained indexing: the value
# is written to the temporary row object returned by iloc, which is not guaranteed
# to reach `test` and never does under pandas copy-on-write.
test["OriginalTweet"] = test["OriginalTweet"].map(normalize)

In [40]:
# Preview the test tweets after normalization.
test.head(300)

Unnamed: 0,OriginalTweet,Sentiment
0,trending: new yorkers encounter empty supermar...,Extremely Negative
1,when i couldn't find hand sanitizer at fred me...,Positive
2,find out how you can protect yourself and love...,Extremely Positive
3,panic buying hits newyork city as anxious sho...,Negative
4,toiletpaper dunnypaper coronavirus coronaviru...,Neutral
...,...,...
295,the coronavirus virus has people buying up all...,Extremely Positive
296,"retail notes growth in demand for groceries, c...",Positive
297,covid 19 stole bernie's platform- it lowered ...,Positive
298,i have ptsd from surviving hurricane katrina. ...,Positive


### Creating 3 Sentiment Dataset

In [41]:
# Work on independent copies so the 5-label frames stay unchanged.
train_3s, test_3s = train.copy(), test.copy()

In [42]:
# Collapse the five sentiment labels into three by folding each "Extremely ..."
# label into its base class. A label missing from the table raises KeyError.
five2three = {
    "Extremely Positive": "Positive",
    "Positive": "Positive",
    "Neutral": "Neutral",
    "Negative": "Negative",
    "Extremely Negative": "Negative",
}
train_3s["Sentiment"] = list(map(five2three.__getitem__, train_3s["Sentiment"]))
test_3s["Sentiment"] = list(map(five2three.__getitem__, test_3s["Sentiment"]))

In [43]:
# Class distribution of the 3-label training split.
train_3s["Sentiment"].value_counts()

Positive    18046
Negative    15398
Neutral      7713
Name: Sentiment, dtype: int64

In [44]:
# Class distribution of the 3-label test split.
test_3s["Sentiment"].value_counts()

Negative    1633
Positive    1546
Neutral      619
Name: Sentiment, dtype: int64

# Save Data Sets

In [45]:
# Write the normalized splits to CSV without the index column.
# *_S5 files hold the original five sentiment labels (train/test); *_S3 files hold
# the collapsed three-label versions (train_3s/test_3s).
# NOTE(review): "N3" presumably refers to the "Normalization 3" scheme named in this notebook — confirm.
train.to_csv("COVID19_train_N3_S5.csv", index = False)
test.to_csv("COVID19_test_N3_S5.csv", index = False)
train_3s.to_csv("COVID19_train_N3_S3.csv",index = False)
test_3s.to_csv("COVID19_test_N3_S3.csv", index = False)